in source/src/main/grep.c [1487:1921]
SEXP attribute_hidden do_gsub(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP pat, rep, text, ans;
regex_t reg;
regmatch_t regmatch[10];
R_xlen_t i, n;
int j, ns, nns, nmatch, offset, rc;
int global, igcase_opt, perl_opt, fixed_opt, useBytes, eflags, last_end;
char *u, *cbuf;
const char *spat = NULL, *srep = NULL, *s = NULL;
size_t patlen = 0, replen = 0;
Rboolean use_UTF8 = FALSE, use_WC = FALSE;
const wchar_t *wrep = NULL;
pcre *re_pcre = NULL;
pcre_extra *re_pe = NULL;
const unsigned char *tables = NULL;
const void *vmax = vmaxget();
checkArity(op, args);
global = PRIMVAL(op);
pat = CAR(args); args = CDR(args);
rep = CAR(args); args = CDR(args);
text = CAR(args); args = CDR(args);
igcase_opt = asLogical(CAR(args)); args = CDR(args);
perl_opt = asLogical(CAR(args)); args = CDR(args);
fixed_opt = asLogical(CAR(args)); args = CDR(args);
useBytes = asLogical(CAR(args)); args = CDR(args);
if (igcase_opt == NA_INTEGER) igcase_opt = 0;
if (perl_opt == NA_INTEGER) perl_opt = 0;
if (fixed_opt == NA_INTEGER) fixed_opt = 0;
if (useBytes == NA_INTEGER) useBytes = 0;
if (fixed_opt && igcase_opt)
warning(_("argument '%s' will be ignored"), "ignore.case = TRUE");
if (fixed_opt && perl_opt) {
warning(_("argument '%s' will be ignored"), "perl = TRUE");
perl_opt = 0;
}
if (!isString(pat) || LENGTH(pat) < 1)
error(_("invalid '%s' argument"), "pattern");
if (LENGTH(pat) > 1)
warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern");
if (!isString(rep) || LENGTH(rep) < 1)
error(_("invalid '%s' argument"), "replacement");
if (LENGTH(rep) > 1)
warning(_("argument '%s' has length > 1 and only the first element will be used"), "replacement");
if (!isString(text))
error(_("invalid '%s' argument"), "text");
n = XLENGTH(text);
/* This contradicts the code below that has NA matching NA */
if (STRING_ELT(pat, 0) == NA_STRING) {
PROTECT(ans = allocVector(STRSXP, n));
for (i = 0; i < n; i++) SET_STRING_ELT(ans, i, NA_STRING);
UNPROTECT(1);
return ans;
}
if (!useBytes) {
Rboolean onlyASCII = IS_ASCII(STRING_ELT(pat, 0));
if (onlyASCII)
for (i = 0; i < n; i++) {
if(STRING_ELT(text, i) == NA_STRING) continue;
if (!IS_ASCII(STRING_ELT(text, i))) {
onlyASCII = FALSE;
break;
}
}
useBytes = onlyASCII;
}
if (!useBytes) {
Rboolean haveBytes = IS_BYTES(STRING_ELT(pat, 0));
if (!haveBytes)
for (i = 0; i < n; i++)
if (IS_BYTES(STRING_ELT(text, i))) {
haveBytes = TRUE;
break;
}
if(haveBytes) {
useBytes = TRUE;
}
}
if (!useBytes) {
if (!fixed_opt && mbcslocale) use_UTF8 = TRUE;
else if (IS_UTF8(STRING_ELT(pat, 0))) use_UTF8 = TRUE;
if (!use_UTF8)
for (i = 0; i < n; i++)
if (IS_UTF8(STRING_ELT(text, i))) {
use_UTF8 = TRUE;
break;
}
}
if (!fixed_opt && !perl_opt) {
/* if we have non-ASCII text in a DBCS locale, we need to use wchar */
if (!useBytes && mbcslocale && !utf8locale) use_UTF8 =TRUE;
use_WC = use_UTF8; use_UTF8 = FALSE;
}
if (useBytes) {
spat = CHAR(STRING_ELT(pat, 0));
srep = CHAR(STRING_ELT(rep, 0));
} else if (use_WC) ;
else if (use_UTF8) {
spat = translateCharUTF8(STRING_ELT(pat, 0));
if (!utf8Valid(spat)) error(_("'pattern' is invalid UTF-8"));
srep = translateCharUTF8(STRING_ELT(rep, 0));
if (!utf8Valid(srep)) error(_("'replacement' is invalid UTF-8"));
} else {
spat = translateChar(STRING_ELT(pat, 0));
if (mbcslocale && !mbcsValid(spat))
error(_("'pattern' is invalid in this locale"));
srep = translateChar(STRING_ELT(rep, 0));
if (mbcslocale && !mbcsValid(srep))
error(_("'replacement' is invalid in this locale"));
}
if (fixed_opt) {
patlen = strlen(spat);
if (!patlen) error(_("zero-length pattern"));
replen = strlen(srep);
} else if (perl_opt) {
int cflags = 0, erroffset;
const char *errorptr;
if (use_UTF8) cflags |= PCRE_UTF8;
if (igcase_opt) cflags |= PCRE_CASELESS;
// PCRE docs say this is not needed, but it is on Windows
tables = pcre_maketables();
re_pcre = pcre_compile(spat, cflags, &errorptr, &erroffset, tables);
if (!re_pcre) {
if (errorptr)
warning(_("PCRE pattern compilation error\n\t'%s'\n\tat '%s'\n"),
errorptr, spat+erroffset);
error(_("invalid regular expression '%s'"), spat);
}
if (n > 10) {
re_pe = pcre_study(re_pcre, 0, &errorptr);
if (errorptr)
warning(_("PCRE pattern study error\n\t'%s'\n"), errorptr);
}
replen = strlen(srep);
} else {
int cflags = REG_EXTENDED;
if (igcase_opt) cflags |= REG_ICASE;
if (!use_WC) {
rc = tre_regcompb(®, spat, cflags);
if (rc) reg_report(rc, ®, spat);
replen = strlen(srep);
} else {
rc = tre_regwcomp(®, wtransChar(STRING_ELT(pat, 0)), cflags);
if (rc) reg_report(rc, ®, CHAR(STRING_ELT(pat, 0)));
wrep = wtransChar(STRING_ELT(rep, 0));
replen = wcslen(wrep);
}
}
PROTECT(ans = allocVector(STRSXP, n));
vmax = vmaxget();
for (i = 0 ; i < n ; i++) {
// if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt();
/* NA pattern was handled above */
if (STRING_ELT(text, i) == NA_STRING) {
SET_STRING_ELT(ans, i, NA_STRING);
continue;
}
if (useBytes)
s = CHAR(STRING_ELT(text, i));
else if (use_WC) ;
else if (use_UTF8) {
s = translateCharUTF8(STRING_ELT(text, i));
if (!utf8Valid(s)) error(("input string %d is invalid UTF-8"), i+1);
} else {
s = translateChar(STRING_ELT(text, i));
if (mbcslocale && !mbcsValid(s))
error(("input string %d is invalid in this locale"), i+1);
}
if (fixed_opt) {
int st, nr, slen = (int) strlen(s);
ns = slen;
st = fgrep_one_bytes(spat, s, ns, useBytes, use_UTF8);
if (st < 0)
SET_STRING_ELT(ans, i, STRING_ELT(text, i));
else if (STRING_ELT(rep, 0) == NA_STRING)
SET_STRING_ELT(ans, i, NA_STRING);
else {
if (global) { /* need to find max number of matches */
const char *ss= s;
int sst = st;
nr = 0;
do {
nr++;
ss += sst+patlen;
slen -= (int)(sst+patlen);
} while((sst = fgrep_one_bytes(spat, ss, slen, useBytes, use_UTF8)) >= 0);
} else nr = 1;
cbuf = u = Calloc(ns + nr*(replen - patlen) + 1, char);
*u = '\0';
slen = ns;
do {
strncpy(u, s, st);
u += st;
s += st+patlen;
slen -= (int)(st+patlen);
strncpy(u, srep, replen);
u += replen;
} while(global && (st = fgrep_one_bytes(spat, s, slen, useBytes, use_UTF8)) >= 0);
strcpy(u, s);
if (useBytes)
SET_STRING_ELT(ans, i, mkChar(cbuf));
else if (use_UTF8)
SET_STRING_ELT(ans, i, mkCharCE(cbuf, CE_UTF8));
else
SET_STRING_ELT(ans, i, markKnown(cbuf, STRING_ELT(text, i)));
Free(cbuf);
}
} else if (perl_opt) {
int ncap, maxrep, ovector[30], eflag;
memset(ovector, 0, 30*sizeof(int)); /* zero for unknown patterns */
ns = (int) strlen(s);
/* worst possible scenario is to put a copy of the
replacement after every character, unless there are
backrefs */
maxrep = (int)(replen + (ns-2) * count_subs(srep));
if (global) {
/* Integer overflow has been seen */
double dnns = ns * (maxrep + 1.) + 1000;
if (dnns > 10000) dnns = (double)(2*ns + replen + 1000);
nns = (int) dnns;
} else nns = ns + maxrep + 1000;
u = cbuf = Calloc(nns, char);
offset = 0; nmatch = 0; eflag = 0; last_end = -1;
/* ncap is one more than the number of capturing patterns */
while ((ncap = pcre_exec(re_pcre, re_pe, s, ns, offset, eflag,
ovector, 30)) >= 0) {
/* printf("%s, %d, %d %d\n", s, offset,
ovector[0], ovector[1]); */
nmatch++;
for (j = offset; j < ovector[0]; j++) *u++ = s[j];
if (ovector[1] > last_end) {
u = pcre_string_adj(u, s, srep, ovector, use_UTF8);
last_end = ovector[1];
}
offset = ovector[1];
if (s[offset] == '\0' || !global) break;
if (ovector[1] == ovector[0]) {
/* advance by a char */
if (use_UTF8) {
int used, pos = 0;
while( (used = utf8clen(s[pos])) ) {
pos += used;
if (pos > offset) {
for (j = offset; j < pos; j++) *u++ = s[j];
offset = pos;
break;
}
}
} else
*u++ = s[offset++];
}
if (nns < (u - cbuf) + (ns-offset) + maxrep + 100) {
char *tmp;
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, char);
u = tmp + (u - cbuf);
cbuf = tmp;
}
eflag = PCRE_NOTBOL; /* probably not needed */
}
if (nmatch == 0)
SET_STRING_ELT(ans, i, STRING_ELT(text, i));
else if (STRING_ELT(rep, 0) == NA_STRING)
SET_STRING_ELT(ans, i, NA_STRING);
else {
/* copy the tail */
if (nns < (u - cbuf) + (ns-offset)+1) {
char *tmp;
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, char);
u = tmp + (u - cbuf);
cbuf = tmp;
}
for (j = offset ; s[j] ; j++) *u++ = s[j];
*u = '\0';
if (useBytes)
SET_STRING_ELT(ans, i, mkChar(cbuf));
else if (use_UTF8)
SET_STRING_ELT(ans, i, mkCharCE(cbuf, CE_UTF8));
else
SET_STRING_ELT(ans, i, markKnown(cbuf, STRING_ELT(text, i)));
}
Free(cbuf);
} else if (!use_WC) {
int maxrep;
/* extended regexp in bytes */
ns = (int) strlen(s);
/* worst possible scenario is to put a copy of the
replacement after every character, unless there are
backrefs */
maxrep = (int)(replen + (ns-2) * count_subs(srep));
if (global) {
double dnns = ns * (maxrep + 1.) + 1000;
if (dnns > 10000) dnns = (double)(2*ns + replen + 1000);
nns = (int) dnns;
} else nns = ns + maxrep + 1000;
u = cbuf = Calloc(nns, char);
offset = 0; nmatch = 0; eflags = 0; last_end = -1;
while (tre_regexecb(®, s+offset, 10, regmatch, eflags) == 0) {
/* printf("%s, %d %d\n", &s[offset],
regmatch[0].rm_so, regmatch[0].rm_eo); */
nmatch++;
for (j = 0; j < regmatch[0].rm_so ; j++)
*u++ = s[offset+j];
if (offset+regmatch[0].rm_eo > last_end) {
u = string_adj(u, s+offset, srep, regmatch);
last_end = offset+regmatch[0].rm_eo;
}
offset += regmatch[0].rm_eo;
if (s[offset] == '\0' || !global) break;
if (regmatch[0].rm_eo == regmatch[0].rm_so)
*u++ = s[offset++];
if (nns < (u - cbuf) + (ns-offset) + maxrep + 100) {
char *tmp;
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, char);
u = tmp + (u - cbuf);
cbuf = tmp;
}
eflags = REG_NOTBOL;
}
if (nmatch == 0)
SET_STRING_ELT(ans, i, STRING_ELT(text, i));
else if (STRING_ELT(rep, 0) == NA_STRING)
SET_STRING_ELT(ans, i, NA_STRING);
else {
/* copy the tail */
if (nns < (u - cbuf) + (ns-offset)+1) {
char *tmp;
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, char);
u = tmp + (u - cbuf);
cbuf = tmp;
}
for (j = offset ; s[j] ; j++) *u++ = s[j];
*u = '\0';
if (useBytes)
SET_STRING_ELT(ans, i, mkChar(cbuf));
else
SET_STRING_ELT(ans, i, markKnown(cbuf, STRING_ELT(text, i)));
}
Free(cbuf);
} else {
/* extended regexp in wchar_t */
const wchar_t *s = wtransChar(STRING_ELT(text, i));
wchar_t *u, *cbuf;
int maxrep;
ns = (int) wcslen(s);
maxrep = (int)(replen + (ns-2) * wcount_subs(wrep));
if (global) {
/* worst possible scenario is to put a copy of the
replacement after every character */
double dnns = ns * (maxrep + 1.) + 1000;
if (dnns > 10000) dnns = 2*ns + maxrep + 1000;
nns = (int) dnns;
} else nns = ns + maxrep + 1000;
u = cbuf = Calloc(nns, wchar_t);
offset = 0; nmatch = 0; eflags = 0; last_end = -1;
while (tre_regwexec(®, s+offset, 10, regmatch, eflags) == 0) {
nmatch++;
for (j = 0; j < regmatch[0].rm_so ; j++)
*u++ = s[offset+j];
if (offset+regmatch[0].rm_eo > last_end) {
u = wstring_adj(u, s+offset, wrep, regmatch);
last_end = offset+regmatch[0].rm_eo;
}
offset += regmatch[0].rm_eo;
if (s[offset] == L'\0' || !global) break;
if (regmatch[0].rm_eo == regmatch[0].rm_so)
*u++ = s[offset++];
if (nns < (u - cbuf) + (ns-offset) + maxrep + 100) {
wchar_t *tmp;
/* This could fail at smaller value on a 32-bit platform:
it is merely an integer overflow check */
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, wchar_t);
u = tmp + (u - cbuf);
cbuf = tmp;
}
eflags = REG_NOTBOL;
}
if (nmatch == 0)
SET_STRING_ELT(ans, i, STRING_ELT(text, i));
else if (STRING_ELT(rep, 0) == NA_STRING)
SET_STRING_ELT(ans, i, NA_STRING);
else {
/* copy the tail */
if (nns < (u - cbuf) + (ns-offset)+1) {
wchar_t *tmp;
if (nns > INT_MAX/2) error(_("result string is too long"));
nns *= 2;
tmp = Realloc(cbuf, nns, wchar_t);
u = tmp + (u - cbuf);
cbuf = tmp;
}
for (j = offset ; s[j] ; j++) *u++ = s[j];
*u = L'\0';
SET_STRING_ELT(ans, i, mkCharW(cbuf));
}
Free(cbuf);
}
vmaxset(vmax);
}
if (fixed_opt) ;
else if (perl_opt) {
if (re_pe) pcre_free(re_pe);
pcre_free(re_pcre);
pcre_free((void *)tables);
} else tre_regfree(®);
SHALLOW_DUPLICATE_ATTRIB(ans, text);
/* This copied the class, if any */
UNPROTECT(1);
return ans;
}