in source/src/main/grep.c [136:643]
SEXP attribute_hidden do_strsplit(SEXP call, SEXP op, SEXP args, SEXP env)
{
SEXP args0 = args, ans, tok, x;
R_xlen_t i, itok, len, tlen;
size_t j, ntok;
int fixed_opt, perl_opt, useBytes;
char *pt = NULL; wchar_t *wpt = NULL;
const char *buf, *split = "", *bufp;
const unsigned char *tables = NULL;
Rboolean use_UTF8 = FALSE, haveBytes = FALSE;
const void *vmax, *vmax2;
int nwarn = 0;
checkArity(op, args);
x = CAR(args); args = CDR(args);
tok = CAR(args); args = CDR(args);
fixed_opt = asLogical(CAR(args)); args = CDR(args);
perl_opt = asLogical(CAR(args)); args = CDR(args);
useBytes = asLogical(CAR(args));
if (fixed_opt == NA_INTEGER) fixed_opt = 0;
if (perl_opt == NA_INTEGER) perl_opt = 0;
if (useBytes == NA_INTEGER) useBytes = 0;
if (fixed_opt && perl_opt) {
warning(_("argument '%s' will be ignored"), "perl = TRUE");
perl_opt = 0;
}
if (!isString(x) || !isString(tok)) error(_("non-character argument"));
len = XLENGTH(x);
tlen = XLENGTH(tok);
/* treat split = NULL as split = "" */
if (!tlen) { tlen = 1; SETCADR(args0, tok = mkString("")); }
if (!useBytes) {
for (i = 0; i < tlen; i++)
if (IS_BYTES(STRING_ELT(tok, i))) {
haveBytes = TRUE; break;
}
if (!haveBytes)
for (i = 0; i < len; i++)
if (IS_BYTES(STRING_ELT(x, i))) {
haveBytes = TRUE;
break;
}
if (haveBytes) {
useBytes = TRUE;
} else {
if (perl_opt && mbcslocale) use_UTF8 = TRUE;
if (!use_UTF8)
for (i = 0; i < tlen; i++)
if (IS_UTF8(STRING_ELT(tok, i))) {
use_UTF8 = TRUE; break;
}
if (!use_UTF8)
for (i = 0; i < len; i++)
if (IS_UTF8(STRING_ELT(x, i))) {
use_UTF8 = TRUE;
break;
}
}
}
/* group by token for efficiency with PCRE/TRE versions */
PROTECT(ans = allocVector(VECSXP, len));
vmax = vmaxget();
for (itok = 0; itok < tlen; itok++) {
SEXP this = STRING_ELT(tok, itok);
if (this == NA_STRING) { /* NA token doesn't split */
for (i = itok; i < len; i += tlen)
SET_VECTOR_ELT(ans, i, ScalarString(STRING_ELT(x, i)));
continue;
} else if (!CHAR(this)[0]) { /* empty */
vmax2 = vmaxget();
for (i = itok; i < len; i += tlen) {
SEXP t;
if (STRING_ELT(x, i) == NA_STRING) {
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
if (useBytes)
buf = CHAR(STRING_ELT(x, i));
else if (use_UTF8) {
buf = translateCharUTF8(STRING_ELT(x, i));
if (!utf8Valid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid UTF-8"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
} else {
buf = translateChar(STRING_ELT(x, i));
if (mbcslocale && !mbcsValid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
}
if (!useBytes && (use_UTF8 || mbcslocale) && !strIsASCII(buf)) {
/* split into individual characters (not bytes) */
char bf[20 /* > MB_CUR_MAX */];
const char *p = buf;
size_t used;
mbstate_t mb_st;
ssize_t nt; /* need to check error on size_t */
if (use_UTF8) {
for (ntok = 0; *p; p += used, ntok++)
used = utf8clen(*p);
p = buf;
PROTECT(t = allocVector(STRSXP, ntok));
for (j = 0; j < ntok; j++, p += used) {
used = utf8clen(*p);
memcpy(bf, p, used); bf[used] = '\0';
SET_STRING_ELT(t, j, mkCharCE(bf, CE_UTF8));
}
} else if ((nt = mbstowcs(NULL, buf, 0)) < 0) {
PROTECT(t = ScalarString(NA_STRING));
} else {
ntok = nt;
mbs_init(&mb_st);
PROTECT(t = allocVector(STRSXP, ntok));
for (j = 0; j < ntok; j++, p += used) {
/* This is valid as we have already checked */
used = mbrtowc(NULL, p, MB_CUR_MAX, &mb_st);
memcpy(bf, p, used); bf[used] = '\0';
SET_STRING_ELT(t, j, markKnown(bf, STRING_ELT(x, i)));
}
}
} else {
/* useBytes or ASCII or
single-byte locale and not marked as UTF-8 */
char bf[2];
ntok = strlen(buf);
PROTECT(t = allocVector(STRSXP, ntok));
bf[1] = '\0';
for (j = 0; j < ntok; j++) {
bf[0] = buf[j];
SET_STRING_ELT(t, j, markKnown(bf, STRING_ELT(x, i)));
}
}
SET_VECTOR_ELT(ans, i, t);
UNPROTECT(1);
vmaxset(vmax2);
}
} else if (fixed_opt) {
const char *laststart, *ebuf;
if (useBytes)
split = CHAR(STRING_ELT(tok, itok));
else if (use_UTF8) {
split = translateCharUTF8(STRING_ELT(tok, itok));
if (!utf8Valid(split))
error(_("'split' string %d is invalid UTF-8"), itok+1);
} else {
split = translateChar(STRING_ELT(tok, itok));
if (mbcslocale && !mbcsValid(split))
error(_("'split' string %d is invalid in this locale"),
itok+1);
}
int slen = (int) strlen(split);
vmax2 = vmaxget();
for (i = itok; i < len; i += tlen) {
SEXP t;
if (STRING_ELT(x, i) == NA_STRING) {
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
if (useBytes)
buf = CHAR(STRING_ELT(x, i));
else if (use_UTF8) {
buf = translateCharUTF8(STRING_ELT(x, i));
if (!utf8Valid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid UTF-8"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
} else {
buf = translateChar(STRING_ELT(x, i));
if (mbcslocale && !mbcsValid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
}
/* find out how many splits there will be */
size_t ntok = 0;
/* This is UTF-8 safe since it compares whole strings */
laststart = buf;
ebuf = buf + strlen(buf);
for (bufp = buf; bufp < ebuf; bufp++) {
if ((slen == 1 && *bufp != *split) ||
(slen > 1 && strncmp(bufp, split, slen))) continue;
ntok++;
bufp += MAX(slen - 1, 0);
laststart = bufp+1;
}
bufp = laststart;
SET_VECTOR_ELT(ans, i,
t = allocVector(STRSXP, ntok + (*bufp ? 1 : 0)));
/* and fill with the splits */
laststart = bufp = buf;
pt = Realloc(pt, strlen(buf)+1, char);
for (size_t j = 0; j < ntok; j++) {
/* This is UTF-8 safe since it compares whole
strings, but <MBCS-FIXME> it would be more
efficient to skip along by chars.
*/
for (; bufp < ebuf; bufp++) {
if ((slen == 1 && *bufp != *split) ||
(slen > 1 && strncmp(bufp, split, slen))) continue;
if (slen) {
strncpy(pt, laststart, bufp - laststart);
pt[bufp - laststart] = '\0';
} else {
pt[0] = *bufp; pt[1] ='\0';
}
bufp += MAX(slen-1, 0);
laststart = bufp+1;
if (use_UTF8)
SET_STRING_ELT(t, j, mkCharCE(pt, CE_UTF8));
else
SET_STRING_ELT(t, j, markKnown(pt, STRING_ELT(x, i)));
break;
}
bufp = laststart;
}
if (*bufp) {
if (use_UTF8)
SET_STRING_ELT(t, ntok, mkCharCE(bufp, CE_UTF8));
else
SET_STRING_ELT(t, ntok, markKnown(bufp, STRING_ELT(x, i)));
}
vmaxset(vmax2);
}
} else if (perl_opt) {
pcre *re_pcre;
pcre_extra *re_pe;
int erroffset, ovector[30];
const char *errorptr;
int options = 0;
if (use_UTF8) options = PCRE_UTF8;
if (useBytes)
split = CHAR(STRING_ELT(tok, itok));
else if (use_UTF8) {
split = translateCharUTF8(STRING_ELT(tok, itok));
if (!utf8Valid(split))
error(_("'split' string %d is invalid UTF-8"), itok+1);
} else {
split = translateChar(STRING_ELT(tok, itok));
if (mbcslocale && !mbcsValid(split))
error(_("'split' string %d is invalid in this locale"), itok+1);
}
// PCRE docs say this is not needed, but it is on Windows
if (!tables) tables = pcre_maketables();
re_pcre = pcre_compile(split, options,
&errorptr, &erroffset, tables);
if (!re_pcre) {
if (errorptr)
warning(_("PCRE pattern compilation error\n\t'%s'\n\tat '%s'\n"),
errorptr, split+erroffset);
error(_("invalid split pattern '%s'"), split);
}
re_pe = pcre_study(re_pcre, 0, &errorptr);
if (errorptr)
warning(_("PCRE pattern study error\n\t'%s'\n"), errorptr);
vmax2 = vmaxget();
for (i = itok; i < len; i += tlen) {
SEXP t;
if (STRING_ELT(x, i) == NA_STRING) {
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
if (useBytes)
buf = CHAR(STRING_ELT(x, i));
else if (use_UTF8) {
buf = translateCharUTF8(STRING_ELT(x, i));
if (!utf8Valid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid UTF-8"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
} else {
buf = translateChar(STRING_ELT(x, i));
if (mbcslocale && !mbcsValid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
}
/* find out how many splits there will be */
ntok = 0;
bufp = buf;
if (*bufp) {
while(pcre_exec(re_pcre, re_pe, bufp, (int) strlen(bufp),
0, 0, ovector, 30) >= 0) {
/* Empty matches get the next char, so move by one. */
bufp += MAX(ovector[1], 1);
ntok++;
if (*bufp == '\0')
break;
}
}
SET_VECTOR_ELT(ans, i,
t = allocVector(STRSXP, ntok + (*bufp ? 1 : 0)));
/* and fill with the splits */
bufp = buf;
pt = Realloc(pt, strlen(buf)+1, char);
for (j = 0; j < ntok; j++) {
pcre_exec(re_pcre, re_pe, bufp, (int) strlen(bufp), 0, 0,
ovector, 30);
if (ovector[1] > 0) {
/* Match was non-empty. */
if (ovector[0] > 0)
strncpy(pt, bufp, ovector[0]);
pt[ovector[0]] = '\0';
bufp += ovector[1];
} else {
/* Match was empty. */
pt[0] = *bufp;
pt[1] = '\0';
bufp++;
}
if (use_UTF8)
SET_STRING_ELT(t, j, mkCharCE(pt, CE_UTF8));
else
SET_STRING_ELT(t, j, markKnown(pt, STRING_ELT(x, i)));
}
if (*bufp) {
if (use_UTF8)
SET_STRING_ELT(t, ntok, mkCharCE(bufp, CE_UTF8));
else
SET_STRING_ELT(t, ntok, markKnown(bufp, STRING_ELT(x, i)));
}
vmaxset(vmax2);
}
pcre_free(re_pe);
pcre_free(re_pcre);
} else if (!useBytes && use_UTF8) { /* ERE in wchar_t */
regex_t reg;
regmatch_t regmatch[1];
int rc;
int cflags = REG_EXTENDED;
const wchar_t *wbuf, *wbufp, *wsplit;
/* Careful: need to distinguish empty (rm_eo == 0) from
non-empty (rm_eo > 0) matches. In the former case, the
token extracted is the next character. Otherwise, it is
everything before the start of the match, which may be
the empty string (not a ``token'' in the strict sense).
*/
wsplit = wtransChar(STRING_ELT(tok, itok));
if ((rc = tre_regwcomp(®, wsplit, cflags)))
reg_report(rc, ®, translateChar(STRING_ELT(tok, itok)));
vmax2 = vmaxget();
for (i = itok; i < len; i += tlen) {
SEXP t;
if (STRING_ELT(x, i) == NA_STRING) {
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
wbuf = wtransChar(STRING_ELT(x, i));
/* find out how many splits there will be */
ntok = 0;
wbufp = wbuf;
if (*wbufp) {
while(tre_regwexec(®, wbufp, 1, regmatch, 0) == 0) {
/* Empty matches get the next char, so move by one. */
wbufp += MAX(regmatch[0].rm_eo, 1);
ntok++;
if (!*wbufp) break;
}
}
SET_VECTOR_ELT(ans, i,
t = allocVector(STRSXP, ntok + (*wbufp ? 1 : 0)));
/* and fill with the splits */
wbufp = wbuf;
wpt = Realloc(wpt, wcslen(wbuf)+1, wchar_t);
for (j = 0; j < ntok; j++) {
tre_regwexec(®, wbufp, 1, regmatch, 0);
if (regmatch[0].rm_eo > 0) {
/* Match was non-empty. */
if (regmatch[0].rm_so > 0)
wcsncpy(wpt, wbufp, regmatch[0].rm_so);
wpt[regmatch[0].rm_so] = 0;
wbufp += regmatch[0].rm_eo;
} else {
/* Match was empty. */
wpt[0] = *wbufp;
wpt[1] = 0;
wbufp++;
}
SET_STRING_ELT(t, j,
mkCharWLen(wpt, regmatch[0].rm_so));
}
if (*wbufp)
SET_STRING_ELT(t, ntok,
mkCharWLen(wbufp, (int) wcslen(wbufp)));
vmaxset(vmax2);
}
tre_regfree(®);
} else { /* ERE in normal chars -- single byte or MBCS */
regex_t reg;
regmatch_t regmatch[1];
int rc;
int cflags = REG_EXTENDED;
/* Careful: need to distinguish empty (rm_eo == 0) from
non-empty (rm_eo > 0) matches. In the former case, the
token extracted is the next character. Otherwise, it is
everything before the start of the match, which may be
the empty string (not a ``token'' in the strict sense).
*/
/* never use_UTF8 */
if (useBytes)
split = CHAR(STRING_ELT(tok, itok));
else {
split = translateChar(STRING_ELT(tok, itok));
if (mbcslocale && !mbcsValid(split))
error(_("'split' string %d is invalid in this locale"), itok+1);
}
if ((rc = tre_regcomp(®, split, cflags)))
reg_report(rc, ®, split);
vmax2 = vmaxget();
for (i = itok; i < len; i += tlen) {
SEXP t;
if (STRING_ELT(x, i) == NA_STRING) {
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
/* never use_UTF8 */
if (useBytes)
buf = CHAR(STRING_ELT(x, i));
else {
buf = translateChar(STRING_ELT(x, i));
if (mbcslocale && !mbcsValid(buf)) {
if(nwarn++ < NWARN)
warning(_("input string %d is invalid in this locale"), i+1);
SET_VECTOR_ELT(ans, i, ScalarString(NA_STRING));
continue;
}
}
/* find out how many splits there will be */
ntok = 0;
bufp = buf;
if (*bufp) {
while(tre_regexec(®, bufp, 1, regmatch, 0) == 0) {
/* Empty matches get the next char, so move by one. */
bufp += MAX(regmatch[0].rm_eo, 1);
ntok++;
if (*bufp == '\0') break;
}
}
SET_VECTOR_ELT(ans, i,
t = allocVector(STRSXP, ntok + (*bufp ? 1 : 0)));
/* and fill with the splits */
bufp = buf;
pt = Realloc(pt, strlen(buf)+1, char);
for (j = 0; j < ntok; j++) {
tre_regexec(®, bufp, 1, regmatch, 0);
if (regmatch[0].rm_eo > 0) {
/* Match was non-empty. */
if (regmatch[0].rm_so > 0)
strncpy(pt, bufp, regmatch[0].rm_so);
pt[regmatch[0].rm_so] = '\0';
bufp += regmatch[0].rm_eo;
} else {
/* Match was empty. */
pt[0] = *bufp;
pt[1] = '\0';
bufp++;
}
SET_STRING_ELT(t, j, markKnown(pt, STRING_ELT(x, i)));
}
if (*bufp)
SET_STRING_ELT(t, ntok, markKnown(bufp, STRING_ELT(x, i)));
vmaxset(vmax2);
}
tre_regfree(®);
}
vmaxset(vmax);
}
if (getAttrib(x, R_NamesSymbol) != R_NilValue)
namesgets(ans, getAttrib(x, R_NamesSymbol));
UNPROTECT(1);
Free(pt); Free(wpt);
if (tables) pcre_free((void *)tables);
return ans;
}