in library/sql.parser/source/myx_unicode_scanner.cpp [714:1399]
/**
  Lexical scanner entry point: reads the next token from the query text
  tracked by the LEX state and returns its token code.

  The scanner is a state machine. The starting state is taken from
  lex->next_state (which is then reset to MY_LEX_OPERATOR_OR_IDENT); each
  case either returns a token or selects another state and loops.

  For tokens that carry text, a terminal AST node is created through
  new_ast_terminal_node(). Strings obtained from get_token(),
  get_quoted_token() or get_text() are either passed as the last argument
  of new_ast_terminal_node() (presumably transferring ownership - helper is
  not visible here, TODO confirm) or released in this function with free().

  @param arg  Cast to SqlAstNode** and stored in lex->yylval.
  @param yyl  Cast to LEX*; provides the input cursor (ptr, tok_start,
              tok_end, end_of_query), the charset maps and next_state.

  @return A grammar token code (IDENT, FLOAT_NUM, TEXT_STRING, ...), the
          character value itself for single-character tokens, END_OF_INPUT
          at the end of the query, ABORT_SYM for malformed quoted constants
          or an empty variable name, and 0 after MY_LEX_END was reached or
          when parser_is_stopped is set.
*/
int MYSQLlex(void **arg, void *yyl)
{
  reg1 uchar c = 0;
  int tokval, result_state;
  uint length;
  enum my_lex_states state;
  LEX_STRING tmp_lex_string;
  //LEX *lex= ((THD *)yythd)->lex;
  LEX *lex= (LEX *)yyl;
  SqlAstNode **yylval= (SqlAstNode **) arg;
  //CHARSET_INFO *cs= ((THD *) yythd)->charset();
  CHARSET_INFO *cs= lex->charset;
  uchar *state_map= cs->state_map;
  uchar *ident_map= cs->ident_map;

  lex->yylval=yylval;                   // The global state
  token_start_lineno= lex->yylineno;
  // Remember the previous token bounds before starting a new token.
  lex->tok_end_prev= lex->tok_end;
  lex->tok_start_prev= lex->tok_start;
  lex->tok_start=lex->tok_end=lex->ptr;
  state=lex->next_state;
  lex->next_state=MY_LEX_OPERATOR_OR_IDENT;
  for (;;)
  {
    // The only way out of this loop other than returning a token.
    if (parser_is_stopped)
      break;
    switch (state) {
    case MY_LEX_OPERATOR_OR_IDENT:      // Next is operator or keyword
    case MY_LEX_START:                  // Start of token
      // Skip startspace
      for (c=yyGet() ; state_map[c] == MY_LEX_SKIP ; c= yyGet()) ;
      lex->tok_start=lex->ptr-1;        // Start of real token
      state= (enum my_lex_states) state_map[c];
      token_start_lineno= lex->yylineno;
      break;
    case MY_LEX_ESCAPE:
      if (yyGet() == 'N')
      {                                 // Allow \N as shortcut for NULL
        new_ast_terminal_node(lex, /*"\\N", */2, 0);
#if 0
        yylval->lex_str.str=(char*) "\\N";
        yylval->lex_str.length=2;
#endif
        return NULL_SYM;
      }
      // fall through: treat the backslash as a single char token
    case MY_LEX_CHAR:                   // Unknown or single char token
    case MY_LEX_SKIP:                   // This should not happen
      // "--" followed by a space or control character starts a line comment.
      if (c == '-' && yyPeek() == '-' &&
          (my_isspace(cs,yyPeek2()) ||
           my_iscntrl(cs,yyPeek2())))
      {
        state=MY_LEX_COMMENT;
        break;
      }
      lex->ptr= lex->tok_start;         // Rewind to the first char of the token
      new_ast_terminal_node(lex, /*(const char*) (lex->ptr=lex->tok_start), */1, 0);
#if 0
      yylval->lex_str.str=(char*) (lex->ptr=lex->tok_start);// Set to first chr
      yylval->lex_str.length=1;
#endif
      c=yyGet();
      if (c != ')')
        lex->next_state= MY_LEX_START;  // Allow signed numbers
      if (c == ',')
        lex->tok_start=lex->ptr;        // Let tok_start point at next item
      /*
        Check for a placeholder: it should not precede a possible identifier
        because of binlogging: when a placeholder is replaced with
        its value in a query for the binlog, the query must stay
        grammatically correct.
      */
      else if (c == '?' && lex->stmt_prepare_mode && !ident_map[yyPeek()])
        return(PARAM_MARKER);
      return((int) c);

    case MY_LEX_IDENT_OR_NCHAR:
      if (yyPeek() != '\'')
      {                                 // No quote follows: plain identifier
        state= MY_LEX_IDENT;
        break;
      }
      /* Found N'string' */
      (void) yyGet();                   // Skip '
      // Scan to the closing quote, or to a zero byte if it is missing.
      while ((c = yyGet()) && (c !='\'')) ;
      length=(uint)(lex->ptr - lex->tok_start); // Length of N'...' incl. quotes
      if (c != '\'')
      {
        return(ABORT_SYM);              // Unterminated N'...' constant
      }
      (void) yyGet();                   // get_token makes an unget
      tmp_lex_string= get_token(lex,length);
      new_ast_terminal_node(lex, /*tmp_lex_string.str+2, */tmp_lex_string.length-3, tmp_lex_string.str);
#if 0
      yylval->lex_str=get_token(lex,length);
      yylval->lex_str.str+=2;           // Skip x'
      yylval->lex_str.length-=3;        // Don't count x' and last '
#endif
      lex->yytoklen-=3;
      return (NCHAR_STRING);

    case MY_LEX_IDENT_OR_HEX:
      if (yyPeek() == '\'')
      {                                 // Found x'hex-number'
        state= MY_LEX_HEX_NUMBER;
        break;
      }
      // fall through
    case MY_LEX_IDENT_OR_BIN:
      if (yyPeek() == '\'')
      {                                 // Found b'bin-number'
        state= MY_LEX_BIN_NUMBER;
        break;
      }
      // fall through
    case MY_LEX_IDENT:
      const uchar *start;
#if defined(USE_MB) && defined(USE_MB_IDENT)
      if (use_mb(cs))
      {
        result_state= IDENT_QUOTED;
        //result_state= QUOTED;
        if (my_mbcharlen(cs, yyGetLast()) > 1)
        {
          int l = my_ismbchar(cs,
                              (const char *)lex->ptr-1,
                              (const char *)lex->end_of_query);
          if (l == 0) {
            state = MY_LEX_CHAR;
            continue;
          }
          lex->ptr += l - 1;
        }
        while (ident_map[c=yyGet()])
        {
          if (my_mbcharlen(cs, c) > 1)
          {
            int l;
            if ((l = my_ismbchar(cs,
                                 (const char *)lex->ptr-1,
                                 (const char *)lex->end_of_query)) == 0)
              break;
            lex->ptr += l-1;
          }
        }
      }
      else
#endif
      {
        // OR all identifier bytes together so the high bit reveals non-ASCII.
        for (result_state= c; ident_map[c= yyGet()]; result_state|= c) ;
        /* If there were non-ASCII characters, mark that we must convert */
        result_state= result_state & 0x80 ? IDENT_QUOTED : IDENT;
      }
      length= (uint) (lex->ptr - lex->tok_start)-1;
      start= lex->ptr;
      if (lex->ignore_space)
      {
        /*
          If we find a space then this can't be an identifier. We notice this
          below by checking start != lex->ptr.
        */
        for (; state_map[c] == MY_LEX_SKIP ; c= yyGet()) ;
      }
      if ((start == lex->ptr) && (c == '.') && (ident_map[yyPeek()]))
        lex->next_state=MY_LEX_IDENT_SEP;
      else
      {                                 // '(' must follow directly if function
        yyUnget();
        if ((tokval = find_keyword(lex,length,c == '(')))
        {
          lex->next_state= MY_LEX_START; // Allow signed numbers
          return(tokval);               // Was keyword
        }
        yySkip();                       // next state does a unget
      }
      tmp_lex_string= get_token(lex,length);
      new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, 0);
#if 0
      yylval->lex_str=get_token(lex,length);
#endif
      /*
        Note: "SELECT _bla AS 'alias'"
        _bla should be considered as a IDENT if charset haven't been found.
        So we don't use MYF(MY_WME) with get_charset_by_csname to avoid
        producing an error.
      */
      //if ((yylval->lex_str.str[0]=='_') &&
      //    (lex->charset=get_charset_by_csname(yylval->lex_str.str+1,
      //                                        MY_CS_PRIMARY,MYF(0))))
      //  return(UNDERSCORE_CHARSET);
      if (tmp_lex_string.str[0]=='_')
      {
        CHARSET_INFO *charset= get_charset_by_csname(tmp_lex_string.str+1, MY_CS_PRIMARY, MYF(0));
        if (charset)
        {
          /*
            serg (WB rev-eng context):
            don't change charset, because nobody will set it back. rev-eng staff works only with utf8 charset.
          */
          //lex->charset= charset;
          free(tmp_lex_string.str);
          return (UNDERSCORE_CHARSET);
        }
      }
      // The copy was only needed for the charset-introducer check above.
      free(tmp_lex_string.str);
      return(result_state);             // IDENT or IDENT_QUOTED

    case MY_LEX_IDENT_SEP:              // Found ident and now '.'
      new_ast_terminal_node(lex, /*(char*)lex->ptr, */1, 0);
#if 0
      yylval->lex_str.str=(char*) lex->ptr;
      yylval->lex_str.length=1;
#endif
      c=yyGet();                        // should be '.'
      lex->next_state= MY_LEX_IDENT_START;// Next is an ident (not a keyword)
      if (!ident_map[yyPeek()])         // Probably ` or "
        lex->next_state= MY_LEX_START;
      return((int) c);

    case MY_LEX_NUMBER_IDENT:           // number or ident which num-start
      while (my_isdigit(cs,(c = yyGet()))) ;
      if (!ident_map[c])
      {                                 // Can't be identifier
        state=MY_LEX_INT_OR_REAL;
        break;
      }
      if (c == 'e' || c == 'E')
      {
        // The following test is written this way to allow numbers of type 1e1
        if (my_isdigit(cs,yyPeek()) ||
            (c=(yyGet())) == '+' || c == '-')
        {                               // Allow 1E+10
          if (my_isdigit(cs,yyPeek()))  // Number must have digit after sign
          {
            yySkip();
            while (my_isdigit(cs,yyGet()))
              ;
            tmp_lex_string=get_token(lex,yyLength());
            new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, 0);
            /*
              The string is not handed to new_ast_terminal_node() here, so it
              must be released explicitly, exactly as the integer path in
              MY_LEX_INT_OR_REAL does; otherwise every such literal leaks it.
            */
            free(tmp_lex_string.str);
#if 0
            yylval->lex_str=get_token(lex,yyLength());
#endif
            return(FLOAT_NUM);
          }
        }
        yyUnget(); /* purecov: inspected */
      }
      else if (c == 'x' && (lex->ptr - lex->tok_start) == 2 &&
               lex->tok_start[0] == '0' )
      {                                 // Varbinary
        while (my_isxdigit(cs,(c = yyGet()))) ;
        if ((lex->ptr - lex->tok_start) >= 4 && !ident_map[c])
        {
          tmp_lex_string= get_token(lex,yyLength());
          // NOTE(review): unlike the 0b branch below, the "0x" prefix is not
          // subtracted from the length here (the yytoklen adjustment is also
          // commented out) - confirm this asymmetry is intended.
          new_ast_terminal_node(lex, /*tmp_lex_string.str + 2, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
          yylval->lex_str=get_token(lex,yyLength());
          yylval->lex_str.str+=2;       // Skip 0x
          yylval->lex_str.length-=2;
#endif
          //lex->yytoklen-=2;
          return (HEX_NUM);
        }
        yyUnget();
      }
      else if (c == 'b' && (lex->ptr - lex->tok_start) == 2 &&
               lex->tok_start[0] == '0' )
      {                                 // 0b binary literal
        // NOTE(review): scans hex digits rather than only '0'/'1' - confirm
        // whether this is intended.
        while (my_isxdigit(cs,(c = yyGet()))) ;
        if ((lex->ptr - lex->tok_start) >= 4 && !ident_map[c])
        {
          tmp_lex_string= get_token(lex,yyLength());
          new_ast_terminal_node(lex, /*tmp_lex_string.str + 2, */tmp_lex_string.length - 2, tmp_lex_string.str);
#if 0
          yylval->lex_str= get_token(lex, yyLength());
          yylval->lex_str.str+= 2;      // Skip 0x
          yylval->lex_str.length-= 2;
#endif
          lex->yytoklen-= 2;
          return (BIN_NUM);
        }
        yyUnget();
      }
      // fall through
    case MY_LEX_IDENT_START:            // We come here after '.'
      result_state= IDENT;
#if defined(USE_MB) && defined(USE_MB_IDENT)
      if (use_mb(cs))
      {
        result_state= IDENT_QUOTED;
        while (ident_map[c=yyGet()])
        {
          if (my_mbcharlen(cs, c) > 1)
          {
            int l;
            if ((l = my_ismbchar(cs,
                                 (const char *)lex->ptr-1,
                                 (const char *)lex->end_of_query)) == 0)
              break;
            lex->ptr += l-1;
          }
        }
      }
      else
#endif
      {
        for (result_state=0; ident_map[c= yyGet()]; result_state|= c) ;
        /* If there were non-ASCII characters, mark that we must convert */
        result_state= result_state & 0x80 ? IDENT_QUOTED : IDENT;
      }
      if (c == '.' && ident_map[yyPeek()])
        lex->next_state=MY_LEX_IDENT_SEP;// Next is '.'
      tmp_lex_string= get_token(lex,yyLength());
      new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
      yylval->lex_str= get_token(lex,yyLength());
#endif
      return(result_state);

    case MY_LEX_USER_VARIABLE_DELIMITER: // Found quote char
    {
      uint double_quotes= 0;
      char quote_char= c;               // Used char
      lex->tok_start=lex->ptr;          // Skip first `
      while ((c=yyGet()))
      {
        int char_len;                   // Byte length of the current character
        if ((char_len= my_mbcharlen(cs, c)) == 1)
        {
          if (c == (uchar) NAMES_SEP_CHAR)
            break; /* Old .frm format can't handle this char */
          if (c == quote_char)
          {
            if (yyPeek() != quote_char)
              break;
            // A doubled quote char stands for one literal quote char.
            c=yyGet();
            double_quotes++;
            continue;
          }
        }
#ifdef USE_MB
        else if (char_len < 1)
          break;                        // Error
        lex->ptr+= char_len-1;
#endif
      }
      if (double_quotes)
      {
        tmp_lex_string= get_quoted_token(lex,yyLength() - double_quotes, quote_char);
        new_ast_terminal_node(lex, tmp_lex_string.str, lex->yytoklen, tmp_lex_string.str);
#if 0
        yylval->lex_str=get_quoted_token(lex,yyLength() - double_quotes,
                                         quote_char);
#endif
      }
      else
      {
        tmp_lex_string= get_token(lex,yyLength());
        new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
        yylval->lex_str=get_token(lex,yyLength());
#endif
      }
      if (c == quote_char)
        yySkip();                       // Skip end `
      lex->next_state= MY_LEX_START;
      return(IDENT_QUOTED);
    }

    case MY_LEX_INT_OR_REAL:            // Complete int or incomplete real
      if (c != '.')
      {                                 // Found complete integer number.
        tmp_lex_string= get_token(lex,yyLength());
        new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, 0);
        uint tok= int_token(tmp_lex_string.str, tmp_lex_string.length);
        free(tmp_lex_string.str);
        return tok;
#if 0
        yylval->lex_str=get_token(lex,yyLength());
        return int_token(yylval->lex_str.str,yylval->lex_str.length);
#endif
      }
      // fall through
    case MY_LEX_REAL:                   // Incomplete real number
      while (my_isdigit(cs,c = yyGet())) ;
      if (c == 'e' || c == 'E')
      {
        c = yyGet();
        if (c == '-' || c == '+')
          c = yyGet();                  // Skip sign
        if (!my_isdigit(cs,c))
        {                               // No digit after sign
          state= MY_LEX_CHAR;
          break;
        }
        while (my_isdigit(cs,yyGet()))
          ;
        tmp_lex_string= get_token(lex,yyLength());
        new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
        yylval->lex_str=get_token(lex,yyLength());
#endif
        return(FLOAT_NUM);
      }
      tmp_lex_string= get_token(lex,yyLength());
      new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
      yylval->lex_str=get_token(lex,yyLength());
#endif
      return(DECIMAL_NUM);

    case MY_LEX_HEX_NUMBER:             // Found x'hexstring'
      (void) yyGet();                   // Skip '
      while (my_isxdigit(cs,(c = yyGet()))) ;
      length=(uint)(lex->ptr - lex->tok_start); // Length of hexnum+3
      // length counts x and both quotes, so an even number of hex digits
      // gives an odd length.
      if (!(length & 1) || c != '\'')
      {
        return(ABORT_SYM);              // Illegal hex constant
      }
      (void) yyGet();                   // get_token makes an unget
      tmp_lex_string= get_token(lex,yyLength());
      new_ast_terminal_node(lex, /*tmp_lex_string.str + 2, */tmp_lex_string.length - 3, tmp_lex_string.str);
#if 0
      yylval->lex_str=get_token(lex,length);
      yylval->lex_str.str+=2;           // Skip x'
      yylval->lex_str.length-=3;        // Don't count x' and last '
#endif
      lex->yytoklen-=3;
      return (HEX_NUM);

    case MY_LEX_BIN_NUMBER:             // Found b'bin-string'
      (void) yyGet();                   // Skip '
      while ((c= yyGet()) == '0' || c == '1') ;
      length= (uint)(lex->ptr - lex->tok_start); // Length of bin-num + 3
      if (c != '\'')
        return(ABORT_SYM);              // Illegal binary constant
      (void) yyGet();                   // get_token makes an unget
      tmp_lex_string= get_token(lex,yyLength());
      new_ast_terminal_node(lex, /*tmp_lex_string.str + 2, */tmp_lex_string.length - 3, tmp_lex_string.str);
#if 0
      yylval->lex_str= get_token(lex, length);
      yylval->lex_str.str+= 2;          // Skip b'
      yylval->lex_str.length-= 3;       // Don't count b' and last '
#endif
      lex->yytoklen-= 3;
      return (BIN_NUM);

    case MY_LEX_CMP_OP:                 // Incomplete comparison operator
      if (state_map[yyPeek()] == MY_LEX_CMP_OP || state_map[yyPeek()] == MY_LEX_LONG_CMP_OP)
        yySkip();
      if ((tokval = find_keyword(lex,(uint) (lex->ptr - lex->tok_start),0)))
      {
        lex->next_state= MY_LEX_START;  // Allow signed numbers
        return(tokval);
      }
      state = MY_LEX_CHAR;              // Something fishy found
      break;

    case MY_LEX_LONG_CMP_OP:            // Incomplete comparison operator
      if (state_map[yyPeek()] == MY_LEX_CMP_OP || state_map[yyPeek()] == MY_LEX_LONG_CMP_OP)
      {
        yySkip();
        if (state_map[yyPeek()] == MY_LEX_CMP_OP)
          yySkip();
      }
      if ((tokval = find_keyword(lex,(uint) (lex->ptr - lex->tok_start),0)))
      {
        lex->next_state= MY_LEX_START;  // Found long op
        return(tokval);
      }
      state = MY_LEX_CHAR;              // Something fishy found
      break;

    case MY_LEX_BOOL:
      // Only a doubled character forms a bool operator here.
      if (c != yyPeek())
      {
        state=MY_LEX_CHAR;
        break;
      }
      yySkip();
      tokval = find_keyword(lex,2,0);   // Is a bool operator
      lex->next_state= MY_LEX_START;    // Allow signed numbers
      return(tokval);

    case MY_LEX_STRING_OR_DELIMITER:
      //if (((THD *) yythd)->variables.sql_mode & MODE_ANSI_QUOTES)
      if (lex->sql_mode.MODE_ANSI_QUOTES)
      {
        state= MY_LEX_USER_VARIABLE_DELIMITER;
        break;
      }
      /* " used for strings */
      // fall through
    case MY_LEX_STRING:                 // Incomplete text string
      tmp_lex_string.str= get_text(lex);
      if(!tmp_lex_string.str)
      {
        new_ast_terminal_node(lex, /*0, */0, tmp_lex_string.str);
        state= MY_LEX_CHAR;             // Read char by char
        break;
      }
      new_ast_terminal_node(lex, tmp_lex_string.str, lex->yytoklen, tmp_lex_string.str);
#if 0
      if (!(yylval->lex_str.str = get_text(lex)))
      {
        state= MY_LEX_CHAR;             // Read char by char
        break;
      }
      yylval->lex_str.length=lex->yytoklen;
#endif
      return(TEXT_STRING);

    case MY_LEX_COMMENT:                // Comment
      //lex->select_lex.options|= OPTION_FOUND_COMMENT;
      // Stop at a zero byte, at '\n', or at a '\r' not followed by '\n'.
      while ((c = yyGet()) && (c != '\n') && !((c == '\r') && (yyPeek() != '\n'))) ;
      yyUnget();                        // Safety against eof
      state = MY_LEX_START;             // Try again
      break;

    case MY_LEX_LONG_COMMENT:           /* Long C comment? */
      if (yyPeek() != '*')
      {
        state=MY_LEX_CHAR;              // Probable division
        break;
      }
      yySkip();                         // Skip '*'
      //lex->select_lex.options|= OPTION_FOUND_COMMENT;
      if (yyPeek() == '!')              // MySQL command in comment
      {
        ulong version=MYSQL_VERSION_ID;
        yySkip();
        state=MY_LEX_START;
        if (my_isdigit(cs,yyPeek()))
        {                               // Version number
          // strtol also advances lex->ptr past the digits it consumed.
          version=strtol((char*) lex->ptr,(char**) &lex->ptr,10);
        }
        if (version <= MYSQL_VERSION_ID)
        {
          // Lex the comment body as ordinary SQL; the closing "*/" is
          // consumed later in MY_LEX_END_LONG_COMMENT.
          lex->in_comment=1;
          break;
        }
      }
      while (lex->ptr != lex->end_of_query &&
             ((c=yyGet()) != '*' || yyPeek() != '/')) ;
      if (lex->ptr != lex->end_of_query)
        yySkip();                       // remove last '/'
      state = MY_LEX_START;             // Try again
      break;

    case MY_LEX_END_LONG_COMMENT:
      if (lex->in_comment && yyPeek() == '/')
      {
        yySkip();
        lex->in_comment=0;
        state=MY_LEX_START;
      }
      else
        state=MY_LEX_CHAR;              // Return '*'
      break;

    case MY_LEX_SET_VAR:                // Check if ':='
      if (yyPeek() != '=')
      {
        state=MY_LEX_CHAR;              // Return ':'
        break;
      }
      yySkip();
      new_ast_terminal_node(lex, /*":=", */2, 0);
      return (SET_VAR);

    case MY_LEX_SEMICOLON:              // optional line terminator
      if (yyPeek())
      {
        //if ((thd->client_capabilities & CLIENT_MULTI_STATEMENTS) &&
        //    !lex->stmt_prepare_mode)
        if(0) //serg: disabled so procedure body (with ';') could be parsed successfully
        {
          lex->safe_to_cache_query= 0;
          lex->found_semicolon=(char*) lex->ptr;
          //thd->server_status|= SERVER_MORE_RESULTS_EXISTS;
          lex->next_state= MY_LEX_END;
          return (END_OF_INPUT);
        }
        state= MY_LEX_CHAR;             // Return ';'
        break;
      }
      /* fall through */
    case MY_LEX_EOL:
      if (lex->ptr >= lex->end_of_query)
      {
        lex->next_state=MY_LEX_END;     // Mark for next loop
        return(END_OF_INPUT);
      }
      state=MY_LEX_CHAR;
      break;

    case MY_LEX_END:
      lex->next_state=MY_LEX_END;
      return(0);                        // We found end of input last time

    /* Actually real shouldn't start with . but allow them anyhow */
    case MY_LEX_REAL_OR_POINT:
      if (my_isdigit(cs,yyPeek()))
        state = MY_LEX_REAL;            // Real
      else
      {
        state= MY_LEX_IDENT_SEP;        // return '.'
        yyUnget();                      // Put back '.'
      }
      break;

    case MY_LEX_USER_END:               // end '@' of user@hostname
      // Choose the next state from the character that follows the '@'.
      switch (state_map[yyPeek()]) {
      case MY_LEX_STRING:
      case MY_LEX_USER_VARIABLE_DELIMITER:
      case MY_LEX_STRING_OR_DELIMITER:
        break;
      case MY_LEX_USER_END:
        lex->next_state=MY_LEX_SYSTEM_VAR;
        break;
      default:
        lex->next_state=MY_LEX_HOSTNAME;
        break;
      }
      new_ast_terminal_node(lex, /*"@", */1, 0);
#if 0
      yylval->lex_str.str=(char*) lex->ptr;
      yylval->lex_str.length=1;
#endif
      return((int) '@');

    case MY_LEX_HOSTNAME:               // end '@' of user@hostname
      for (c=yyGet() ;
           my_isalnum(cs,c) || c == '.' || c == '_' ||  c == '$';
           c= yyGet()) ;
      tmp_lex_string= get_token(lex,yyLength());
      new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
      yylval->lex_str=get_token(lex,yyLength());
#endif
      return(LEX_HOSTNAME);

    case MY_LEX_SYSTEM_VAR:
      new_ast_terminal_node(lex, /*(char*) lex->ptr, */1, 0);
#if 0
      yylval->lex_str.str=(char*) lex->ptr;
      yylval->lex_str.length=1;
#endif
      yySkip();                         // Skip '@'
      lex->next_state= (state_map[yyPeek()] ==
                        MY_LEX_USER_VARIABLE_DELIMITER ?
                        MY_LEX_OPERATOR_OR_IDENT :
                        MY_LEX_IDENT_OR_KEYWORD);
      return((int) '@');

    case MY_LEX_IDENT_OR_KEYWORD:
      /*
        We come here when we have found two '@' in a row.
        We should now be able to handle:
        [(global | local | session) .]variable_name
      */
      for (result_state= 0; ident_map[c= yyGet()]; result_state|= c) ;
      /* If there were non-ASCII characters, mark that we must convert */
      result_state= result_state & 0x80 ? IDENT_QUOTED : IDENT;
      if (c == '.')
        lex->next_state=MY_LEX_IDENT_SEP;
      length= (uint) (lex->ptr - lex->tok_start)-1;
      if (length == 0)
        return(ABORT_SYM);              // Names must be nonempty.
      if ((tokval= find_keyword(lex,length,0)))
      {
        yyUnget();                      // Put back 'c'
        return(tokval);                 // Was keyword
      }
      tmp_lex_string= get_token(lex, length);
      new_ast_terminal_node(lex, /*tmp_lex_string.str, */tmp_lex_string.length, tmp_lex_string.str);
#if 0
      yylval->lex_str=get_token(lex,length);
#endif
      return(result_state);
    }
  }
  // Reached only when parser_is_stopped ended the loop.
  return 0;
}