in router/src/routing/src/sql_lexer.cc [648:1371]
/**
  Scan one lexical token from the statement text of the given session.

  The scanner is a hand-written state machine. Each call resumes in the
  state left in lip->next_state by the previous call (and resets that field
  to MY_LEX_START), then loops over `state` until one of the cases returns.
  For a fresh token the state is looked up from the first character via the
  character set's state map (cs->state_maps->main_map).

  @param[out] yylval  Receives the semantic value of the token (lex_str,
                      charset, ...); the pointer is also stored in
                      lip->yylval.
  @param      thd     Session. Supplies the input stream
                      (thd->m_parser_state->m_lip), the character set
                      (thd->charset()) and the session variables consulted
                      while scanning (sql_mode,
                      default_collation_for_utf8mb4).

  @return The token code: a plain character value for single-character
          tokens, a token constant (IDENT, TEXT_STRING, HEX_NUM, ...),
          END_OF_INPUT when the end of the text is reached, 0 when called
          again after END_OF_INPUT, or ABORT_SYM for input this function
          rejects (unmatched quote, unterminated comment or dollar-quoted
          text, malformed x'..'/b'..' literal, empty name after '@@').
*/
static int lex_one_token(Lexer_yystype *yylval, THD *thd) {
// Most recently read input character; it is shared between states, several
// of which rely on the value left behind by the state that ran before them.
uchar c = 0;
bool comment_closed;
int tokval, result_state;
uint length;
enum my_lex_states state;
Lex_input_stream *lip = &thd->m_parser_state->m_lip;
const CHARSET_INFO *cs = thd->charset();
// state_map: first character -> initial lexer state.
// ident_map: non-zero for characters that may appear in an identifier.
const my_lex_states *state_map = cs->state_maps->main_map;
const uchar *ident_map = cs->ident_map;
assert(lip);
lip->yylval = yylval; // The global state
lip->start_token();
// Resume where the previous call asked us to; unless a case below says
// otherwise, the call after this one starts from MY_LEX_START.
state = lip->next_state;
lip->next_state = MY_LEX_START;
for (;;) {
switch (state) {
case MY_LEX_START: // Start of token
// Skip starting whitespace
while (state_map[c = lip->yyPeek()] == MY_LEX_SKIP) {
if (c == '\n') lip->yylineno++;
lip->yySkip();
}
/* Start of real token */
lip->restart_token();
c = lip->yyGet();
state = state_map[c];
break;
case MY_LEX_CHAR: // Unknown or single char token
case MY_LEX_SKIP: // This should not happen
// "--" followed by a whitespace or control character starts a comment
// that runs to the end of the line.
if (c == '-' && lip->yyPeek() == '-' &&
(my_isspace(cs, lip->yyPeekn(1)) ||
my_iscntrl(cs, lip->yyPeekn(1)))) {
state = MY_LEX_COMMENT;
break;
}
// JSON path operators: "->" and "->>".
if (c == '-' && lip->yyPeek() == '>') // '->'
{
lip->yySkip();
lip->next_state = MY_LEX_START;
if (lip->yyPeek() == '>') {
lip->yySkip();
return JSON_UNQUOTED_SEPARATOR_SYM;
}
return JSON_SEPARATOR_SYM;
}
if (c != ')') lip->next_state = MY_LEX_START; // Allow signed numbers
/*
Check for a placeholder: it should not precede a possible identifier
because of binlogging: when a placeholder is replaced with its value
in a query for the binlog, the query must stay grammatically correct.
*/
if (c == '?' && lip->stmt_prepare_mode && !ident_map[lip->yyPeek()])
return (PARAM_MARKER);
return ((int)c);
case MY_LEX_IDENT_OR_NCHAR:
// Only a directly following quote makes this a national-character
// string; anything else is scanned as an ordinary identifier.
if (lip->yyPeek() != '\'') {
state = MY_LEX_IDENT;
break;
}
/* Found N'string' */
lip->yySkip(); // Skip '
// A null result from get_text() means the string could not be read;
// fall back to returning the input one character at a time.
if (!(yylval->lex_str.str = get_text(lip, 2, 1))) {
state = MY_LEX_CHAR; // Read char by char
break;
}
yylval->lex_str.length = lip->yytoklen;
return (NCHAR_STRING);
case MY_LEX_IDENT_OR_DOLLAR_QUOTED_TEXT: {
// The dollar-quote branch is compiled only when the token
// DOLLAR_QUOTED_STRING_SYM is defined; otherwise '$' always starts an
// identifier (with a deprecation warning).
#ifdef DOLLAR_QUOTED_STRING_SYM
int len = 0; /* Length of the tag of the dollar quote */
uchar p = lip->yyPeek(); /* Character succeeding first $ */
// Find $ character after the tag
while (p != '$' && ident_map[p] &&
lip->get_ptr() + len <= lip->get_end_of_query()) {
if (use_mb(cs)) {
// Step over the trailing bytes of a multi-byte character so that
// only its first byte is tested against ident_map.
int l =
my_ismbchar(cs, lip->get_ptr() + len, lip->get_end_of_query());
if (l > 1) len += l - 1;
}
p = lip->yyPeekn(++len);
}
if (p != '$') { /* Not a dollar quote, could be an identifier */
push_deprecated_warn_no_replacement(
lip->m_thd, "$ as the first character of an unquoted identifier");
state = MY_LEX_IDENT;
break;
} else {
LEX_CSTRING text = get_dollar_quoted_text(lip, len);
if (text.length == NULL_CSTR.length)
return ABORT_SYM; // error: unterminated text
else {
yylval->lex_str.str = const_cast<char *>(text.str);
yylval->lex_str.length = text.length;
lip->body_utf8_append(text.str);
lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
text.str + text.length);
return DOLLAR_QUOTED_STRING_SYM; // $$ ... $$
}
}
#else
state = MY_LEX_IDENT;
push_deprecated_warn_no_replacement(
lip->m_thd, "$ as the first character of an unquoted identifier");
break;
#endif
}
// The next three cases form one fallthrough chain: when no quote follows,
// IDENT_OR_HEX drops through IDENT_OR_BIN (whose test then fails for the
// same reason) into the plain identifier scanner.
case MY_LEX_IDENT_OR_HEX:
if (lip->yyPeek() == '\'') { // Found x'hex-number'
state = MY_LEX_HEX_NUMBER;
break;
}
[[fallthrough]];
case MY_LEX_IDENT_OR_BIN:
if (lip->yyPeek() == '\'') { // Found b'bin-number'
state = MY_LEX_BIN_NUMBER;
break;
}
[[fallthrough]];
case MY_LEX_IDENT:
const char *start;
if (use_mb(cs)) {
result_state = IDENT_QUOTED;
// Validate the first character, which has already been consumed.
switch (my_mbcharlen(cs, lip->yyGetLast())) {
case 1:
break;
case 0:
if (my_mbmaxlenlen(cs) < 2) break;
[[fallthrough]];
default:
int l =
my_ismbchar(cs, lip->get_ptr() - 1, lip->get_end_of_query());
if (l == 0) {
// Not a valid multi-byte character. 'continue' applies to the
// enclosing for(;;), so the character is re-dispatched through
// MY_LEX_CHAR.
state = MY_LEX_CHAR;
continue;
}
lip->skip_binary(l - 1);
}
// Consume the rest of the identifier, skipping the trailing bytes of
// each multi-byte character.
while (ident_map[c = lip->yyGet()]) {
switch (my_mbcharlen(cs, c)) {
case 1:
break;
case 0:
if (my_mbmaxlenlen(cs) < 2) break;
[[fallthrough]];
default:
int l;
// This 'break' leaves only the inner switch; scanning goes on.
if ((l = my_ismbchar(cs, lip->get_ptr() - 1,
lip->get_end_of_query())) == 0)
break;
lip->skip_binary(l - 1);
}
}
} else {
// Single-byte character set: OR all bytes together so that bit 0x80
// tells whether any non-ASCII byte was seen.
for (result_state = c; ident_map[c = lip->yyGet()]; result_state |= c)
;
/* If there were non-ASCII characters, mark that we must convert */
result_state = result_state & 0x80 ? IDENT_QUOTED : IDENT;
}
// yyLength() and the current position are recorded before any optional
// whitespace skipping below.
length = lip->yyLength();
start = lip->get_ptr();
if (lip->ignore_space) {
/*
If we find a space then this can't be an identifier. We notice this
below by comparing start with lip->get_ptr().
*/
for (; state_map[c] == MY_LEX_SKIP; c = lip->yyGet()) {
if (c == '\n') lip->yylineno++;
}
}
// "ident.ident" with no whitespace in between: the word is a qualifier,
// so skip the keyword lookup and deliver the '.' on the next call.
if (start == lip->get_ptr() && c == '.' && ident_map[lip->yyPeek()])
lip->next_state = MY_LEX_IDENT_SEP;
else { // '(' must follow directly if function
lip->yyUnget();
if ((tokval = find_keyword(lip, length, c == '('))) {
lip->next_state = MY_LEX_START; // Allow signed numbers
return (tokval); // Was keyword
}
lip->yySkip(); // next state does a unget
}
yylval->lex_str = get_token(lip, 0, length);
/*
Note: "SELECT _bla AS 'alias'"
_bla should be considered as a IDENT if charset haven't been found.
So we don't use MYF(MY_WME) with get_charset_by_csname to avoid
producing an error.
*/
if (yylval->lex_str.str[0] == '_') {
// Candidate character set introducer: the name is the text after '_'.
auto charset_name = yylval->lex_str.str + 1;
const CHARSET_INFO *underscore_cs =
get_charset_by_csname(charset_name, MY_CS_PRIMARY, MYF(0));
if (underscore_cs) {
lip->warn_on_deprecated_charset(underscore_cs, charset_name);
// 255 is my_charset_utf8mb4_0900_ai_ci.number
// Function-local static: the lookup runs only the first time control
// reaches this declaration.
static const auto *utf8mb4_0900_ai_ci = get_charset(255, 0);
if (underscore_cs == utf8mb4_0900_ai_ci) {
/*
If underscore_cs is utf8mb4, and the collation of underscore_cs
is the default collation of utf8mb4, then update underscore_cs
with a value of the default_collation_for_utf8mb4 system
variable:
*/
underscore_cs = thd->variables.default_collation_for_utf8mb4;
}
yylval->charset = underscore_cs;
// Kept on the stream so that MY_LEX_STRING can use it for the next
// string literal it scans.
lip->m_underscore_cs = underscore_cs;
lip->body_utf8_append(lip->m_cpp_text_start,
lip->get_cpp_tok_start() + length);
return (UNDERSCORE_CHARSET);
}
}
lip->body_utf8_append(lip->m_cpp_text_start);
lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
lip->m_cpp_text_end);
return (result_state); // IDENT or IDENT_QUOTED
case MY_LEX_IDENT_SEP: // Found ident and now '.'
yylval->lex_str.str = const_cast<char *>(lip->get_ptr());
yylval->lex_str.length = 1;
c = lip->yyGet(); // should be '.'
if (uchar next_c = lip->yyPeek(); ident_map[next_c]) {
lip->next_state =
MY_LEX_IDENT_START; // Next is an ident (not a keyword)
if (next_c == '$') // We got .$ident
push_deprecated_warn_no_replacement(
lip->m_thd,
"$ as the first character of an unquoted identifier");
} else // Probably ` or "
lip->next_state = MY_LEX_START;
return ((int)c);
case MY_LEX_NUMBER_IDENT: // number or ident which num-start
// A leading '0' may introduce a 0x... or 0b... literal. If the prefix is
// not followed by at least one digit, or an identifier character comes
// directly after the digits, the text is scanned as an identifier.
if (lip->yyGetLast() == '0') {
c = lip->yyGet();
if (c == 'x') {
while (my_isxdigit(cs, (c = lip->yyGet())))
;
if ((lip->yyLength() >= 3) && !ident_map[c]) {
/* skip '0x' */
yylval->lex_str = get_token(lip, 2, lip->yyLength() - 2);
return (HEX_NUM);
}
lip->yyUnget();
state = MY_LEX_IDENT_START;
break;
} else if (c == 'b') {
while ((c = lip->yyGet()) == '0' || c == '1')
;
if ((lip->yyLength() >= 3) && !ident_map[c]) {
/* Skip '0b' */
yylval->lex_str = get_token(lip, 2, lip->yyLength() - 2);
return (BIN_NUM);
}
lip->yyUnget();
state = MY_LEX_IDENT_START;
break;
}
lip->yyUnget();
}
while (my_isdigit(cs, (c = lip->yyGet())))
;
if (!ident_map[c]) { // Can't be identifier
state = MY_LEX_INT_OR_REAL;
break;
}
if (c == 'e' || c == 'E') {
// The following test is written this way to allow numbers of type 1e1
// Note the short-circuit: a character is consumed by yyGet() only when
// the one after 'e'/'E' is not a digit.
if (my_isdigit(cs, lip->yyPeek()) || (c = (lip->yyGet())) == '+' ||
c == '-') { // Allow 1E+10
if (my_isdigit(cs,
lip->yyPeek())) // Number must have digit after sign
{
lip->yySkip();
while (my_isdigit(cs, lip->yyGet()))
;
yylval->lex_str = get_token(lip, 0, lip->yyLength());
return (FLOAT_NUM);
}
}
lip->yyUnget();
}
// Digits followed by identifier characters: scan as an identifier.
[[fallthrough]];
case MY_LEX_IDENT_START: // We come here after '.'
// Same scanning loops as MY_LEX_IDENT, but with no keyword lookup and no
// character set introducer handling.
result_state = IDENT;
if (use_mb(cs)) {
result_state = IDENT_QUOTED;
while (ident_map[c = lip->yyGet()]) {
switch (my_mbcharlen(cs, c)) {
case 1:
break;
case 0:
if (my_mbmaxlenlen(cs) < 2) break;
[[fallthrough]];
default:
int l;
if ((l = my_ismbchar(cs, lip->get_ptr() - 1,
lip->get_end_of_query())) == 0)
break;
lip->skip_binary(l - 1);
}
}
} else {
for (result_state = 0; ident_map[c = lip->yyGet()]; result_state |= c)
;
/* If there were non-ASCII characters, mark that we must convert */
result_state = result_state & 0x80 ? IDENT_QUOTED : IDENT;
}
if (c == '.' && ident_map[lip->yyPeek()])
lip->next_state = MY_LEX_IDENT_SEP; // Next is '.'
yylval->lex_str = get_token(lip, 0, lip->yyLength());
lip->body_utf8_append(lip->m_cpp_text_start);
lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
lip->m_cpp_text_end);
return (result_state);
case MY_LEX_USER_VARIABLE_DELIMITER: // Found quote char
{
// Quoted identifier. A doubled quote character inside the identifier
// stands for one literal quote; the number of such pairs is counted so
// that the token length can be corrected below.
uint double_quotes = 0;
char quote_char = c; // Used char
for (;;) {
c = lip->yyGet();
if (c == 0) {
lip->yyUnget();
return ABORT_SYM; // Unmatched quotes
}
int var_length;
if ((var_length = my_mbcharlen(cs, c)) == 1) {
if (c == quote_char) {
// A single quote char ends the identifier; a pair is an escape.
if (lip->yyPeek() != quote_char) break;
c = lip->yyGet();
double_quotes++;
continue;
}
} else if (use_mb(cs)) {
if ((var_length = my_ismbchar(cs, lip->get_ptr() - 1,
lip->get_end_of_query())))
lip->skip_binary(var_length - 1);
}
}
if (double_quotes)
yylval->lex_str = get_quoted_token(
lip, 1, lip->yyLength() - double_quotes - 1, quote_char);
else
yylval->lex_str = get_token(lip, 1, lip->yyLength() - 1);
if (c == quote_char) lip->yySkip(); // Skip end `
lip->next_state = MY_LEX_START;
lip->body_utf8_append(lip->m_cpp_text_start);
lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
lip->m_cpp_text_end);
return (IDENT_QUOTED);
}
case MY_LEX_INT_OR_REAL: // Complete int or incomplete real
// c is the character that ended the digit run in MY_LEX_NUMBER_IDENT.
if (c != '.') { // Found complete integer number.
yylval->lex_str = get_token(lip, 0, lip->yyLength());
return int_token(yylval->lex_str.str, (uint)yylval->lex_str.length);
}
[[fallthrough]];
case MY_LEX_REAL: // Incomplete real number
// Fraction digits, then an optional exponent. With an exponent the token
// is FLOAT_NUM, without one it is DECIMAL_NUM.
while (my_isdigit(cs, c = lip->yyGet()))
;
if (c == 'e' || c == 'E') {
c = lip->yyGet();
if (c == '-' || c == '+') c = lip->yyGet(); // Skip sign
if (!my_isdigit(cs, c)) { // No digit after sign
state = MY_LEX_CHAR;
break;
}
while (my_isdigit(cs, lip->yyGet()))
;
yylval->lex_str = get_token(lip, 0, lip->yyLength());
return (FLOAT_NUM);
}
yylval->lex_str = get_token(lip, 0, lip->yyLength());
return (DECIMAL_NUM);
case MY_LEX_HEX_NUMBER: // Found x'hexstring'
lip->yySkip(); // Accept opening '
while (my_isxdigit(cs, (c = lip->yyGet())))
;
if (c != '\'') return (ABORT_SYM); // Illegal hex constant
lip->yySkip(); // Accept closing '
length = lip->yyLength(); // Length of hexnum+3
// length counts the digits plus three other characters (x and the two
// quotes), so an even length means an odd number of digits.
if ((length % 2) == 0) return (ABORT_SYM); // odd number of hex digits
yylval->lex_str = get_token(lip,
2, // skip x'
length - 3); // don't count x' and last '
return (HEX_NUM);
case MY_LEX_BIN_NUMBER: // Found b'bin-string'
lip->yySkip(); // Accept opening '
while ((c = lip->yyGet()) == '0' || c == '1')
;
if (c != '\'') return (ABORT_SYM); // Illegal binary constant
lip->yySkip(); // Accept closing '
length = lip->yyLength(); // Length of bin-num + 3
yylval->lex_str = get_token(lip,
2, // skip b'
length - 3); // don't count b' and last '
return (BIN_NUM);
case MY_LEX_CMP_OP: // Incomplete comparison operator
// Take at most one more operator character, then look the operator text
// up with find_keyword().
if (state_map[lip->yyPeek()] == MY_LEX_CMP_OP ||
state_map[lip->yyPeek()] == MY_LEX_LONG_CMP_OP)
lip->yySkip();
if ((tokval = find_keyword(lip, lip->yyLength() + 1, false))) {
lip->next_state = MY_LEX_START; // Allow signed numbers
return (tokval);
}
state = MY_LEX_CHAR; // Something fishy found
break;
case MY_LEX_LONG_CMP_OP: // Incomplete comparison operator
// As MY_LEX_CMP_OP, but the operator may be up to three characters long.
if (state_map[lip->yyPeek()] == MY_LEX_CMP_OP ||
state_map[lip->yyPeek()] == MY_LEX_LONG_CMP_OP) {
lip->yySkip();
if (state_map[lip->yyPeek()] == MY_LEX_CMP_OP) lip->yySkip();
}
if ((tokval = find_keyword(lip, lip->yyLength() + 1, false))) {
lip->next_state = MY_LEX_START; // Found long op
return (tokval);
}
state = MY_LEX_CHAR; // Something fishy found
break;
case MY_LEX_BOOL:
// Only the doubled character forms an operator here; a single one is
// returned as a plain character.
if (c != lip->yyPeek()) {
state = MY_LEX_CHAR;
break;
}
lip->yySkip();
tokval = find_keyword(lip, 2, false); // Is a bool operator
lip->next_state = MY_LEX_START; // Allow signed numbers
return (tokval);
case MY_LEX_STRING_OR_DELIMITER:
// With sql_mode ANSI_QUOTES the quote delimits an identifier, not a
// string literal.
if (thd->variables.sql_mode & MODE_ANSI_QUOTES) {
state = MY_LEX_USER_VARIABLE_DELIMITER;
break;
}
/* " used for strings */
[[fallthrough]];
case MY_LEX_STRING: // Incomplete text string
if (!(yylval->lex_str.str = get_text(lip, 1, 1))) {
state = MY_LEX_CHAR; // Read char by char
break;
}
yylval->lex_str.length = lip->yytoklen;
lip->body_utf8_append(lip->m_cpp_text_start);
// A character set introducer seen just before (see MY_LEX_IDENT) takes
// precedence over the session character set; it applies to one literal
// only and is cleared afterwards.
lip->body_utf8_append_literal(
thd, &yylval->lex_str,
lip->m_underscore_cs ? lip->m_underscore_cs : cs,
lip->m_cpp_text_end);
lip->m_underscore_cs = nullptr;
return (TEXT_STRING);
case MY_LEX_COMMENT: // Comment
// Comment to end of line: consume up to and including '\n' or the
// terminating 0, then step back one character.
thd->m_parser_state->add_comment();
while ((c = lip->yyGet()) != '\n' && c)
;
lip->yyUnget(); // Safety against eof
state = MY_LEX_START; // Try again
break;
case MY_LEX_LONG_COMMENT: /* Long C comment? */
if (lip->yyPeek() != '*') {
state = MY_LEX_CHAR; // Probable division
break;
}
thd->m_parser_state->add_comment();
/* Reject '/' '*', since we might need to turn off the echo */
lip->yyUnget();
lip->save_in_comment_state();
// After the unget the stream is positioned on '/', so offset 2 is the
// character following "/*".
if (lip->yyPeekn(2) == '!') {
lip->in_comment = DISCARD_COMMENT;
/* Accept '/' '*' '!', but do not keep this marker. */
lip->set_echo(false);
lip->yySkip();
lip->yySkip();
lip->yySkip();
/*
The special comment format is very strict:
'/' '*' '!', followed by exactly
1 digit (major), 2 digits (minor), then 2 digits (dot).
32302 -> 3.23.02
50032 -> 5.0.32
50114 -> 5.1.14
*/
char version_str[6];
if (my_isdigit(cs, (version_str[0] = lip->yyPeekn(0))) &&
my_isdigit(cs, (version_str[1] = lip->yyPeekn(1))) &&
my_isdigit(cs, (version_str[2] = lip->yyPeekn(2))) &&
my_isdigit(cs, (version_str[3] = lip->yyPeekn(3))) &&
my_isdigit(cs, (version_str[4] = lip->yyPeekn(4)))) {
version_str[5] = 0;
ulong version;
version = strtol(version_str, nullptr, 10);
if (version <= MYSQL_VERSION_ID) {
/* Accept 'M' 'm' 'm' 'd' 'd' */
lip->yySkipn(5);
/* Expand the content of the special comment as real code */
lip->set_echo(true);
state = MY_LEX_START;
break; /* Do not treat contents as a comment. */
} else {
/*
Patch and skip the conditional comment to avoid it
being propagated infinitely (eg. to a slave).
*/
// The '!' is overwritten with a space through yyUnput(); it is
// written back only when no closing marker was found.
char *pcom = lip->yyUnput(' ');
comment_closed = !consume_comment(lip, 1);
if (!comment_closed) {
*pcom = '!';
}
/* version allowed to have one level of comment inside. */
}
} else {
/* Not a version comment. */
// "/*!" without a five-digit version: the content is scanned as code.
state = MY_LEX_START;
lip->set_echo(true);
break;
}
} else {
// Opening a regular comment while already inside a comment is
// deprecated; the warning is compiled in only with WITH_PUSH_WARNING.
if (lip->in_comment != NO_COMMENT) {
#ifdef WITH_PUSH_WARNING
push_warning(
lip->m_thd, Sql_condition::SL_WARNING,
ER_WARN_DEPRECATED_SYNTAX_NO_REPLACEMENT,
ER_THD(lip->m_thd, ER_WARN_DEPRECATED_NESTED_COMMENT_SYNTAX));
#endif
}
lip->in_comment = PRESERVE_COMMENT;
lip->yySkip(); // Accept /
lip->yySkip(); // Accept *
comment_closed = !consume_comment(lip, 0);
/* regular comments can have zero comments inside. */
}
/*
Discard:
- regular '/' '*' comments,
- special comments '/' '*' '!' for a future version,
by scanning until we find a closing '*' '/' marker.
Nesting regular comments isn't allowed. The first
'*' '/' returns the parser to the previous state.
/#!VERSI oned containing /# regular #/ is allowed #/
Inside one versioned comment, another versioned comment
is treated as a regular discardable comment. It gets
no special parsing.
*/
/* Unbalanced comments with a missing '*' '/' are a syntax error */
if (!comment_closed) return (ABORT_SYM);
state = MY_LEX_START; // Try again
lip->restore_in_comment_state();
break;
case MY_LEX_END_LONG_COMMENT:
// '*' seen. Together with a following '/' it closes a comment only if
// one is open (an executed version comment); otherwise it is returned
// as an ordinary '*'.
if ((lip->in_comment != NO_COMMENT) && lip->yyPeek() == '/') {
/* Reject '*' '/' */
lip->yyUnget();
/* Accept '*' '/', with the proper echo */
lip->set_echo(lip->in_comment == PRESERVE_COMMENT);
lip->yySkipn(2);
/* And start recording the tokens again */
lip->set_echo(true);
/*
C-style comments are replaced with a single space (as it
is in C and C++). If there is already a whitespace
character at this point in the stream, the space is
not inserted.
See also ISO/IEC 9899:1999 §5.1.1.2
("Programming languages — C")
*/
if (!my_isspace(cs, lip->yyPeek()) &&
lip->get_cpp_ptr() != lip->get_cpp_buf() &&
!my_isspace(cs, *(lip->get_cpp_ptr() - 1)))
lip->cpp_inject(' ');
lip->in_comment = NO_COMMENT;
state = MY_LEX_START;
} else
state = MY_LEX_CHAR; // Return '*'
break;
case MY_LEX_SET_VAR: // Check if ':='
if (lip->yyPeek() != '=') {
state = MY_LEX_CHAR; // Return ':'
break;
}
lip->yySkip();
return (SET_VAR);
case MY_LEX_SEMICOLON: // optional line terminator
state = MY_LEX_CHAR; // Return ';'
break;
case MY_LEX_EOL:
// This state is left as END_OF_INPUT only when lip->eof() reports the
// real end of the text; otherwise the character is handled by
// MY_LEX_CHAR.
if (lip->eof()) {
lip->yyUnget(); // Reject the last '\0'
lip->set_echo(false);
lip->yySkip();
lip->set_echo(true);
/* Unbalanced comments with a missing '*' '/' are a syntax error */
if (lip->in_comment != NO_COMMENT) return (ABORT_SYM);
lip->next_state = MY_LEX_END; // Mark for next loop
return (END_OF_INPUT);
}
state = MY_LEX_CHAR;
break;
case MY_LEX_END:
// Sticky: every further call keeps returning 0.
lip->next_state = MY_LEX_END;
return (0); // We found end of input last time
/* Actually real shouldn't start with . but allow them anyhow */
case MY_LEX_REAL_OR_POINT:
if (my_isdigit(cs, lip->yyPeek()))
state = MY_LEX_REAL; // Real
else {
state = MY_LEX_IDENT_SEP; // return '.'
lip->yyUnget(); // Put back '.'
}
break;
case MY_LEX_USER_END: // end '@' of user@hostname
// The character after '@' selects how the next token is scanned: a
// quote keeps the default state, a second '@' leads to the '@@' handling
// in MY_LEX_SYSTEM_VAR, anything else is read as a hostname.
switch (state_map[lip->yyPeek()]) {
case MY_LEX_STRING:
case MY_LEX_USER_VARIABLE_DELIMITER:
case MY_LEX_STRING_OR_DELIMITER:
break;
case MY_LEX_USER_END:
lip->next_state = MY_LEX_SYSTEM_VAR;
break;
default:
lip->next_state = MY_LEX_HOSTNAME;
break;
}
yylval->lex_str.str = const_cast<char *>(lip->get_ptr());
yylval->lex_str.length = 1;
return ((int)'@');
case MY_LEX_HOSTNAME: // hostname part of user@hostname
// Accepted characters: alphanumerics, '.', '_' and '$'.
for (c = lip->yyGet();
my_isalnum(cs, c) || c == '.' || c == '_' || c == '$';
c = lip->yyGet())
;
yylval->lex_str = get_token(lip, 0, lip->yyLength());
return (LEX_HOSTNAME);
case MY_LEX_SYSTEM_VAR:
// Second '@' of "@@". It is returned as its own '@' token; the name that
// follows is scanned by MY_LEX_IDENT_OR_KEYWORD unless it is quoted.
yylval->lex_str.str = const_cast<char *>(lip->get_ptr());
yylval->lex_str.length = 1;
lip->yySkip(); // Skip '@'
lip->next_state =
(state_map[lip->yyPeek()] == MY_LEX_USER_VARIABLE_DELIMITER
? MY_LEX_START
: MY_LEX_IDENT_OR_KEYWORD);
return ((int)'@');
case MY_LEX_IDENT_OR_KEYWORD:
/*
We come here when we have found two '@' in a row.
We should now be able to handle:
[(global | local | session) .]variable_name
*/
for (result_state = 0; ident_map[c = lip->yyGet()]; result_state |= c)
;
/* If there were non-ASCII characters, mark that we must convert */
result_state = result_state & 0x80 ? IDENT_QUOTED : IDENT;
if (c == '.') lip->next_state = MY_LEX_IDENT_SEP;
length = lip->yyLength();
if (length == 0) return (ABORT_SYM); // Names must be nonempty.
if ((tokval = find_keyword(lip, length, false))) {
lip->yyUnget(); // Put back 'c'
return (tokval); // Was keyword
}
yylval->lex_str = get_token(lip, 0, length);
lip->body_utf8_append(lip->m_cpp_text_start);
lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
lip->m_cpp_text_end);
return (result_state);
}
}
}