in source/src/main/grep.c [2305:2612]
SEXP attribute_hidden do_regexpr(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP pat, text, ans;
regex_t reg;
regmatch_t regmatch[10];
R_xlen_t i, n;
int rc, igcase_opt, perl_opt, fixed_opt, useBytes;
const char *spat = NULL; /* -Wall */
const char *s = NULL;
pcre *re_pcre = NULL /* -Wall */;
pcre_extra *re_pe = NULL;
const unsigned char *tables = NULL /* -Wall */;
Rboolean use_UTF8 = FALSE, use_WC = FALSE;
const void *vmax;
int capture_count, *ovector = NULL, ovector_size = 0, /* -Wall */
name_count, name_entry_size, info_code;
char *name_table;
SEXP capture_names = R_NilValue;
int nwarn = 0;
checkArity(op, args);
pat = CAR(args); args = CDR(args);
text = CAR(args); args = CDR(args);
igcase_opt = asLogical(CAR(args)); args = CDR(args);
perl_opt = asLogical(CAR(args)); args = CDR(args);
fixed_opt = asLogical(CAR(args)); args = CDR(args);
useBytes = asLogical(CAR(args)); args = CDR(args);
if (igcase_opt == NA_INTEGER) igcase_opt = 0;
if (perl_opt == NA_INTEGER) perl_opt = 0;
if (fixed_opt == NA_INTEGER) fixed_opt = 0;
if (useBytes == NA_INTEGER) useBytes = 0;
if (fixed_opt && igcase_opt)
warning(_("argument '%s' will be ignored"), "ignore.case = TRUE");
if (fixed_opt && perl_opt) {
warning(_("argument '%s' will be ignored"), "perl = TRUE");
perl_opt = 0;
}
/* Note that excluding NAs differs from grep/sub */
if (!isString(pat) || LENGTH(pat) < 1 || STRING_ELT(pat, 0) == NA_STRING)
error(_("invalid '%s' argument"), "pattern");
if (LENGTH(pat) > 1)
warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern");
if (!isString(text))
error(_("invalid '%s' argument"), "text");
n = XLENGTH(text);
if (!useBytes) {
Rboolean onlyASCII = IS_ASCII(STRING_ELT(pat, 0));
if (onlyASCII)
for (i = 0; i < n; i++) {
if(STRING_ELT(text, i) == NA_STRING) continue;
if (!IS_ASCII(STRING_ELT(text, i))) {
onlyASCII = FALSE;
break;
}
}
useBytes = onlyASCII;
}
if (!useBytes) {
Rboolean haveBytes = IS_BYTES(STRING_ELT(pat, 0));
if (!haveBytes)
for (i = 0; i < n; i++)
if (IS_BYTES(STRING_ELT(text, i))) {
haveBytes = TRUE;
break;
}
if(haveBytes) {
useBytes = TRUE;
}
}
if (!useBytes && !use_UTF8) {
/* As from R 2.10.0 we use UTF-8 mode in PCRE in all MBCS locales,
and as from 2.11.0 in TRE too. */
if (!fixed_opt && mbcslocale) use_UTF8 = TRUE;
else if (IS_UTF8(STRING_ELT(pat, 0))) use_UTF8 = TRUE;
if (!use_UTF8)
for (i = 0; i < n; i++)
if (IS_UTF8(STRING_ELT(text, i))) {
use_UTF8 = TRUE;
break;
}
}
if (!fixed_opt && !perl_opt) {
/* if we have non-ASCII text in a DBCS locale, we need to use wchar */
if (!useBytes && mbcslocale && !utf8locale) use_UTF8 =TRUE;
use_WC = use_UTF8; use_UTF8 = FALSE;
}
if (useBytes)
spat = CHAR(STRING_ELT(pat, 0));
else if (use_WC) ;
else if (use_UTF8) {
spat = translateCharUTF8(STRING_ELT(pat, 0));
if (!utf8Valid(spat)) error(_("regular expression is invalid UTF-8"));
} else {
spat = translateChar(STRING_ELT(pat, 0));
if (mbcslocale && !mbcsValid(spat))
error(_("regular expression is invalid in this locale"));
}
if (fixed_opt) ;
else if (perl_opt) {
int cflags = 0, erroffset;
const char *errorptr;
if (igcase_opt) cflags |= PCRE_CASELESS;
if (!useBytes && use_UTF8) cflags |= PCRE_UTF8;
// PCRE docs say this is not needed, but it is on Windows
tables = pcre_maketables();
re_pcre = pcre_compile(spat, cflags, &errorptr, &erroffset, tables);
if (!re_pcre) {
if (errorptr)
warning(_("PCRE pattern compilation error\n\t'%s'\n\tat '%s'\n"),
errorptr, spat+erroffset);
error(_("invalid regular expression '%s'"), spat);
}
if (n > 10) {
re_pe = pcre_study(re_pcre, 0, &errorptr);
if (errorptr)
warning(_("PCRE pattern study error\n\t'%s'\n"), errorptr);
}
/* also extract info for named groups */
pcre_fullinfo(re_pcre, re_pe, PCRE_INFO_NAMECOUNT, &name_count);
pcre_fullinfo(re_pcre, re_pe, PCRE_INFO_NAMEENTRYSIZE, &name_entry_size);
pcre_fullinfo(re_pcre, re_pe, PCRE_INFO_NAMETABLE, &name_table);
info_code =
pcre_fullinfo(re_pcre, re_pe, PCRE_INFO_CAPTURECOUNT,
&capture_count);
if(info_code < 0)
error(_("'pcre_fullinfo' returned '%d' "), info_code);
ovector_size = (capture_count + 1) * 3;
ovector = (int *) malloc(ovector_size*sizeof(int));
SEXP thisname;
PROTECT(capture_names = allocVector(STRSXP, capture_count));
for(i = 0; i < name_count; i++) {
char *entry = name_table + name_entry_size * i;
PROTECT(thisname = mkChar(entry + 2));
int capture_num = (entry[0]<<8) + entry[1] - 1;
SET_STRING_ELT(capture_names, capture_num, thisname);
UNPROTECT(1);
}
} else {
int cflags = REG_EXTENDED;
if (igcase_opt) cflags |= REG_ICASE;
if (!use_WC)
rc = tre_regcompb(®, spat, cflags);
else
rc = tre_regwcomp(®, wtransChar(STRING_ELT(pat, 0)), cflags);
if (rc) reg_report(rc, ®, spat);
}
if (PRIMVAL(op) == 0) { /* regexpr */
SEXP matchlen, capture_start, capturelen;
int *is, *il;
PROTECT(ans = allocVector(INTSXP, n));
/* Protect in case install("match.length") allocates */
PROTECT(matchlen = allocVector(INTSXP, n));
setAttrib(ans, install("match.length"), matchlen);
if(useBytes) {
setAttrib(ans, install("useBytes"), R_TrueValue);
}
UNPROTECT(1);
if (perl_opt && capture_count) {
if (n > INT_MAX) error("too long a vector");
int nn = (int) n;
SEXP dmn;
PROTECT(dmn = allocVector(VECSXP, 2));
SET_VECTOR_ELT(dmn, 1, capture_names);
PROTECT(capture_start = allocMatrix(INTSXP, nn, capture_count));
setAttrib(capture_start, R_DimNamesSymbol, dmn);
setAttrib(ans, install("capture.start"), capture_start);
PROTECT(capturelen = allocMatrix(INTSXP, nn, capture_count));
setAttrib(capturelen, R_DimNamesSymbol, dmn);
setAttrib(ans, install("capture.length"), capturelen);
setAttrib(ans, install("capture.names"), capture_names);
UNPROTECT(3);
is = INTEGER(capture_start);
il = INTEGER(capturelen);
// initiialization needed for NA inputs: PR#16484
for (i = 0 ; i < n * capture_count ; i++)
is[i] = il[i] = NA_INTEGER;
} else is = il = NULL; /* not actually used */
vmax = vmaxget();
for (i = 0 ; i < n ; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
if (STRING_ELT(text, i) == NA_STRING) {
INTEGER(matchlen)[i] = INTEGER(ans)[i] = NA_INTEGER;
} else {
if (useBytes)
s = CHAR(STRING_ELT(text, i));
else if (use_WC) ;
else if (use_UTF8) {
s = translateCharUTF8(STRING_ELT(text, i));
if (!utf8Valid(s)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid UTF-8"), i+1);
INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
continue;
}
} else {
s = translateChar(STRING_ELT(text, i));
if (mbcslocale && !mbcsValid(s)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"), i+1);
INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
continue;
}
}
if (fixed_opt) {
int st = fgrep_one(spat, s, useBytes, use_UTF8, NULL);
INTEGER(ans)[i] = (st > -1)?(st+1):-1;
if (!useBytes && use_UTF8) {
INTEGER(matchlen)[i] = INTEGER(ans)[i] >= 0 ?
(int) utf8towcs(NULL, spat, 0):-1;
} else if (!useBytes && mbcslocale) {
INTEGER(matchlen)[i] = INTEGER(ans)[i] >= 0 ?
(int) mbstowcs(NULL, spat, 0):-1;
} else
INTEGER(matchlen)[i] = INTEGER(ans)[i] >= 0 ?
(int) strlen(spat):-1;
} else if (perl_opt) {
int rc;
rc = pcre_exec(re_pcre, re_pe, s, (int) strlen(s), 0, 0,
ovector, ovector_size);
if (rc >= 0) {
extract_match_and_groups(use_UTF8, ovector,
capture_count,
// don't use this for large i
INTEGER(ans) + i,
INTEGER(matchlen) + i,
is + i, il + i,
s, (int) n);
} else {
INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
for(int cn = 0; cn < capture_count; cn++) {
R_xlen_t ind = i + cn*n;
is[ind] = il[ind] = -1;
}
}
} else {
if (!use_WC)
rc = tre_regexecb(®, s, 1, regmatch, 0);
else
rc = tre_regwexec(®, wtransChar(STRING_ELT(text, i)),
1, regmatch, 0);
if (rc == 0) {
int st = regmatch[0].rm_so;
INTEGER(ans)[i] = st + 1; /* index from one */
INTEGER(matchlen)[i] = regmatch[0].rm_eo - st;
} else INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
}
}
vmaxset(vmax);
}
} else {
SEXP elt;
PROTECT(ans = allocVector(VECSXP, n));
vmax = vmaxget();
for (i = 0 ; i < n ; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
if (STRING_ELT(text, i) == NA_STRING) {
elt = gregexpr_NAInputAns();
} else {
if (fixed_opt || perl_opt) {
if (useBytes)
s = CHAR(STRING_ELT(text, i));
else if (use_UTF8) {
s = translateCharUTF8(STRING_ELT(text, i));
} else
s = translateChar(STRING_ELT(text, i));
if (!useBytes && !use_UTF8 && mbcslocale && !mbcsValid(s)) {
if (nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"),
i+1);
elt = gregexpr_BadStringAns();
} else {
if (fixed_opt)
elt = gregexpr_fixed(spat, s, useBytes, use_UTF8);
else
elt = gregexpr_perl(spat, s, re_pcre, re_pe,
useBytes, use_UTF8, ovector,
ovector_size, capture_count,
capture_names);
}
} else
elt = gregexpr_Regexc(®, STRING_ELT(text, i),
useBytes, use_WC);
}
SET_VECTOR_ELT(ans, i, elt);
vmaxset(vmax);
}
}
if (fixed_opt) ;
else if (perl_opt) {
if (re_pe) pcre_free(re_pe);
pcre_free(re_pcre);
pcre_free((void *)tables);
UNPROTECT(1);
free(ovector);
} else
tre_regfree(®);
UNPROTECT(1);
return ans;
}