in src/couch_quickjs/quickjs/libregexp.c [1119:1607]
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
{
const uint8_t *p;
int c, last_atom_start, quant_min, quant_max, last_capture_count;
BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
CharRange cr_s, *cr = &cr_s;
last_atom_start = -1;
last_capture_count = 0;
p = s->buf_ptr;
c = *p;
switch(c) {
case '^':
p++;
re_emit_op(s, REOP_line_start);
break;
case '$':
p++;
re_emit_op(s, REOP_line_end);
break;
case '.':
p++;
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
if (is_backward_dir)
re_emit_op(s, REOP_prev);
re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
if (is_backward_dir)
re_emit_op(s, REOP_prev);
break;
case '{':
if (s->is_unicode) {
return re_parse_error(s, "syntax error");
} else if (!is_digit(p[1])) {
/* Annex B: we accept '{' not followed by digits as a
normal atom */
goto parse_class_atom;
} else {
const uint8_t *p1 = p + 1;
/* Annex B: error if it is like a repetition count */
parse_digits(&p1, TRUE);
if (*p1 == ',') {
p1++;
if (is_digit(*p1)) {
parse_digits(&p1, TRUE);
}
}
if (*p1 != '}') {
goto parse_class_atom;
}
}
/* fall thru */
case '*':
case '+':
case '?':
return re_parse_error(s, "nothing to repeat");
case '(':
if (p[1] == '?') {
if (p[2] == ':') {
p += 3;
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
s->buf_ptr = p;
if (re_parse_disjunction(s, is_backward_dir))
return -1;
p = s->buf_ptr;
if (re_parse_expect(s, &p, ')'))
return -1;
} else if ((p[2] == '=' || p[2] == '!')) {
is_neg = (p[2] == '!');
is_backward_lookahead = FALSE;
p += 3;
goto lookahead;
} else if (p[2] == '<' &&
(p[3] == '=' || p[3] == '!')) {
int pos;
is_neg = (p[3] == '!');
is_backward_lookahead = TRUE;
p += 4;
/* lookahead */
lookahead:
/* Annex B allows lookahead to be used as an atom for
the quantifiers */
if (!s->is_unicode && !is_backward_lookahead) {
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
}
pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
s->buf_ptr = p;
if (re_parse_disjunction(s, is_backward_lookahead))
return -1;
p = s->buf_ptr;
if (re_parse_expect(s, &p, ')'))
return -1;
re_emit_op(s, REOP_match);
/* jump after the 'match' after the lookahead is successful */
if (dbuf_error(&s->byte_code))
return -1;
put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
} else if (p[2] == '<') {
p += 3;
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
&p)) {
return re_parse_error(s, "invalid group name");
}
if (find_group_name(s, s->u.tmp_buf) > 0) {
return re_parse_error(s, "duplicate group name");
}
/* group name with a trailing zero */
dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
strlen(s->u.tmp_buf) + 1);
s->has_named_captures = 1;
goto parse_capture;
} else {
return re_parse_error(s, "invalid group");
}
} else {
int capture_index;
p++;
/* capture without group name */
dbuf_putc(&s->group_names, 0);
parse_capture:
if (s->capture_count >= CAPTURE_COUNT_MAX)
return re_parse_error(s, "too many captures");
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
capture_index = s->capture_count++;
re_emit_op_u8(s, REOP_save_start + is_backward_dir,
capture_index);
s->buf_ptr = p;
if (re_parse_disjunction(s, is_backward_dir))
return -1;
p = s->buf_ptr;
re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
capture_index);
if (re_parse_expect(s, &p, ')'))
return -1;
}
break;
case '\\':
switch(p[1]) {
case 'b':
case 'B':
re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
p += 2;
break;
case 'k':
{
const uint8_t *p1;
int dummy_res;
p1 = p;
if (p1[2] != '<') {
/* annex B: we tolerate invalid group names in non
unicode mode if there is no named capture
definition */
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "expecting group name");
else
goto parse_class_atom;
}
p1 += 3;
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
&p1)) {
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "invalid group name");
else
goto parse_class_atom;
}
c = find_group_name(s, s->u.tmp_buf);
if (c < 0) {
/* no capture name parsed before, try to look
after (inefficient, but hopefully not common */
c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
if (c < 0) {
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "group name not defined");
else
goto parse_class_atom;
}
}
p = p1;
}
goto emit_back_reference;
case '0':
p += 2;
c = 0;
if (s->is_unicode) {
if (is_digit(*p)) {
return re_parse_error(s, "invalid decimal escape in regular expression");
}
} else {
/* Annex B.1.4: accept legacy octal */
if (*p >= '0' && *p <= '7') {
c = *p++ - '0';
if (*p >= '0' && *p <= '7') {
c = (c << 3) + *p++ - '0';
}
}
}
goto normal_char;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8':
case '9':
{
const uint8_t *q = ++p;
c = parse_digits(&p, FALSE);
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
if (!s->is_unicode) {
/* Annex B.1.4: accept legacy octal */
p = q;
if (*p <= '7') {
c = 0;
if (*p <= '3')
c = *p++ - '0';
if (*p >= '0' && *p <= '7') {
c = (c << 3) + *p++ - '0';
if (*p >= '0' && *p <= '7') {
c = (c << 3) + *p++ - '0';
}
}
} else {
c = *p++;
}
goto normal_char;
}
return re_parse_error(s, "back reference out of range in regular expression");
}
emit_back_reference:
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
}
break;
default:
goto parse_class_atom;
}
break;
case '[':
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
if (is_backward_dir)
re_emit_op(s, REOP_prev);
if (re_parse_char_class(s, &p))
return -1;
if (is_backward_dir)
re_emit_op(s, REOP_prev);
break;
case ']':
case '}':
if (s->is_unicode)
return re_parse_error(s, "syntax error");
goto parse_class_atom;
default:
parse_class_atom:
c = get_class_atom(s, cr, &p, FALSE);
if ((int)c < 0)
return -1;
normal_char:
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
if (is_backward_dir)
re_emit_op(s, REOP_prev);
if (c >= CLASS_RANGE_BASE) {
int ret;
/* Note: canonicalization is not needed */
ret = re_emit_range(s, cr);
cr_free(cr);
if (ret)
return -1;
} else {
if (s->ignore_case)
c = lre_canonicalize(c, s->is_unicode);
if (c <= 0xffff)
re_emit_op_u16(s, REOP_char, c);
else
re_emit_op_u32(s, REOP_char32, c);
}
if (is_backward_dir)
re_emit_op(s, REOP_prev);
break;
}
/* quantifier */
if (last_atom_start >= 0) {
c = *p;
switch(c) {
case '*':
p++;
quant_min = 0;
quant_max = INT32_MAX;
goto quantifier;
case '+':
p++;
quant_min = 1;
quant_max = INT32_MAX;
goto quantifier;
case '?':
p++;
quant_min = 0;
quant_max = 1;
goto quantifier;
case '{':
{
const uint8_t *p1 = p;
/* As an extension (see ES6 annex B), we accept '{' not
followed by digits as a normal atom */
if (!is_digit(p[1])) {
if (s->is_unicode)
goto invalid_quant_count;
break;
}
p++;
quant_min = parse_digits(&p, TRUE);
quant_max = quant_min;
if (*p == ',') {
p++;
if (is_digit(*p)) {
quant_max = parse_digits(&p, TRUE);
if (quant_max < quant_min) {
invalid_quant_count:
return re_parse_error(s, "invalid repetition count");
}
} else {
quant_max = INT32_MAX; /* infinity */
}
}
if (*p != '}' && !s->is_unicode) {
/* Annex B: normal atom if invalid '{' syntax */
p = p1;
break;
}
if (re_parse_expect(s, &p, '}'))
return -1;
}
quantifier:
greedy = TRUE;
if (*p == '?') {
p++;
greedy = FALSE;
}
if (last_atom_start < 0) {
return re_parse_error(s, "nothing to repeat");
}
if (greedy) {
int len, pos;
if (quant_max > 0) {
/* specific optimization for simple quantifiers */
if (dbuf_error(&s->byte_code))
goto out_of_memory;
len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
s->byte_code.size - last_atom_start);
if (len > 0) {
re_emit_op(s, REOP_match);
if (dbuf_insert(&s->byte_code, last_atom_start, 17))
goto out_of_memory;
pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
put_u32(&s->byte_code.buf[pos],
s->byte_code.size - last_atom_start - 17);
pos += 4;
put_u32(&s->byte_code.buf[pos], quant_min);
pos += 4;
put_u32(&s->byte_code.buf[pos], quant_max);
pos += 4;
put_u32(&s->byte_code.buf[pos], len);
pos += 4;
goto done;
}
}
if (dbuf_error(&s->byte_code))
goto out_of_memory;
}
/* the spec tells that if there is no advance when
running the atom after the first quant_min times,
then there is no match. We remove this test when we
are sure the atom always advances the position. */
add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
s->byte_code.size - last_atom_start);
{
int len, pos;
len = s->byte_code.size - last_atom_start;
if (quant_min == 0) {
/* need to reset the capture in case the atom is
not executed */
if (last_capture_count != s->capture_count) {
if (dbuf_insert(&s->byte_code, last_atom_start, 3))
goto out_of_memory;
s->byte_code.buf[last_atom_start++] = REOP_save_reset;
s->byte_code.buf[last_atom_start++] = last_capture_count;
s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
}
if (quant_max == 0) {
s->byte_code.size = last_atom_start;
} else if (quant_max == 1 || quant_max == INT32_MAX) {
BOOL has_goto = (quant_max == INT32_MAX);
if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
goto out_of_memory;
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
greedy;
put_u32(s->byte_code.buf + last_atom_start + 1,
len + 5 * has_goto + add_zero_advance_check * 2);
if (add_zero_advance_check) {
s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
re_emit_op(s, REOP_check_advance);
}
if (has_goto)
re_emit_goto(s, REOP_goto, last_atom_start);
} else {
if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
goto out_of_memory;
pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_push_i32;
put_u32(s->byte_code.buf + pos, quant_max);
pos += 4;
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
pos += 4;
if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos;
re_emit_op(s, REOP_check_advance);
}
re_emit_goto(s, REOP_loop, last_atom_start + 5);
re_emit_op(s, REOP_drop);
}
} else if (quant_min == 1 && quant_max == INT32_MAX &&
!add_zero_advance_check) {
re_emit_goto(s, REOP_split_next_first - greedy,
last_atom_start);
} else {
if (quant_min == 1) {
/* nothing to add */
} else {
if (dbuf_insert(&s->byte_code, last_atom_start, 5))
goto out_of_memory;
s->byte_code.buf[last_atom_start] = REOP_push_i32;
put_u32(s->byte_code.buf + last_atom_start + 1,
quant_min);
last_atom_start += 5;
re_emit_goto(s, REOP_loop, last_atom_start);
re_emit_op(s, REOP_drop);
}
if (quant_max == INT32_MAX) {
pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 5 + add_zero_advance_check * 2);
if (add_zero_advance_check)
re_emit_op(s, REOP_push_char_pos);
/* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check)
re_emit_op(s, REOP_check_advance);
re_emit_goto(s, REOP_goto, pos);
} else if (quant_max > quant_min) {
re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 5 + add_zero_advance_check * 2);
if (add_zero_advance_check)
re_emit_op(s, REOP_push_char_pos);
/* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check)
re_emit_op(s, REOP_check_advance);
re_emit_goto(s, REOP_loop, pos);
re_emit_op(s, REOP_drop);
}
}
last_atom_start = -1;
}
break;
default:
break;
}
}
done:
s->buf_ptr = p;
return 0;
out_of_memory:
return re_parse_out_of_memory(s);
}