in apache2/libinjection/libinjection_sqli.c [1371:1889]
/*
 * Tokenize the input described by `sf` and "fold" the token stream into
 * sf->tokenvec by repeatedly applying two-token and three-token rewrite
 * rules (dropping, merging or re-typing tokens).
 *
 * Two cursors drive the loop:
 *   pos  - index in sf->tokenvec where the NEXT token will be written
 *   left - number of leading tokens already accepted as final
 * The rules always inspect tokenvec[left], tokenvec[left+1] and, for the
 * three-token rules, tokenvec[left+2].
 *
 * Comment tokens are never stored in the vector while folding; the most
 * recent one is remembered in `last_comment` and appended at the end if
 * there is still room.
 *
 * Some (not all) rules also add to sf->stats_folds.
 *
 * Returns the number of tokens left in sf->tokenvec. On the normal exit
 * path the value is capped at LIBINJECTION_SQLI_MAX_TOKENS. Returns 0 when
 * the tokenizer reports no more input during the initial skip phase.
 *
 * NOTE(review): the early return in the "{ ``" rule returns left+2 without
 * applying that cap -- confirm callers tolerate a larger value.
 */
int libinjection_sqli_fold(struct libinjection_sqli_state * sf)
{
stoken_t last_comment;
/* POS is the position of where the NEXT token goes */
size_t pos = 0;
/* LEFT is a count of how many tokens that are already
 * folded or processed (i.e. part of the fingerprint)
 */
size_t left = 0;
/* return value of the last tokenizer call; 0 means input is exhausted */
int more = 1;
st_clear(&last_comment);
/* Skip all initial comments, left-parens "(", SQL type tokens and
 * unary operators. Each skipped token is overwritten in slot 0 by
 * the next one.
 */
sf->current = &(sf->tokenvec[0]);
while (more) {
more = libinjection_sqli_tokenize(sf);
if ( ! (sf->current->type == TYPE_COMMENT ||
sf->current->type == TYPE_LEFTPARENS ||
sf->current->type == TYPE_SQLTYPE ||
st_is_unary_op(sf->current))) {
break;
}
}
if (! more) {
/* If input was only comments, unary or (, then exit */
return 0;
} else {
/* it's some other token: keep it in slot 0 */
pos += 1;
}
while (1) {
FOLD_DEBUG;
/* do we have all the max number of tokens? if so do
 * some special cases for 5 tokens
 *
 * NOTE(review): the patterns below index slots 0..4 directly, so they
 * presumably rely on LIBINJECTION_SQLI_MAX_TOKENS being 5 -- confirm.
 */
if (pos >= LIBINJECTION_SQLI_MAX_TOKENS) {
if (
(
/* number (operator|comma) ( number ) */
sf->tokenvec[0].type == TYPE_NUMBER &&
(sf->tokenvec[1].type == TYPE_OPERATOR || sf->tokenvec[1].type == TYPE_COMMA) &&
sf->tokenvec[2].type == TYPE_LEFTPARENS &&
sf->tokenvec[3].type == TYPE_NUMBER &&
sf->tokenvec[4].type == TYPE_RIGHTPARENS
) ||
(
/* bareword operator ( (bareword|number) ) */
sf->tokenvec[0].type == TYPE_BAREWORD &&
sf->tokenvec[1].type == TYPE_OPERATOR &&
sf->tokenvec[2].type == TYPE_LEFTPARENS &&
(sf->tokenvec[3].type == TYPE_BAREWORD || sf->tokenvec[3].type == TYPE_NUMBER) &&
sf->tokenvec[4].type == TYPE_RIGHTPARENS
) ||
(
/* number ) , ( number */
sf->tokenvec[0].type == TYPE_NUMBER &&
sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
sf->tokenvec[2].type == TYPE_COMMA &&
sf->tokenvec[3].type == TYPE_LEFTPARENS &&
sf->tokenvec[4].type == TYPE_NUMBER
) ||
(
/* bareword ) operator ( bareword */
sf->tokenvec[0].type == TYPE_BAREWORD &&
sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
sf->tokenvec[2].type == TYPE_OPERATOR &&
sf->tokenvec[3].type == TYPE_LEFTPARENS &&
sf->tokenvec[4].type == TYPE_BAREWORD
)
)
{
/* Collapse the matched run down to its first token and restart
 * folding from the beginning. If a look-ahead token was already
 * fetched past the limit, keep it as the second token.
 */
if (pos > LIBINJECTION_SQLI_MAX_TOKENS) {
st_copy(&(sf->tokenvec[1]), &(sf->tokenvec[LIBINJECTION_SQLI_MAX_TOKENS]));
pos = 2;
left = 0;
} else {
pos = 1;
left = 0;
}
}
}
/* Exit test: input exhausted, or enough tokens have been accepted.
 * Everything currently in the vector is counted as final.
 */
if (! more || left >= LIBINJECTION_SQLI_MAX_TOKENS) {
left = pos;
break;
}
/* get up to two tokens */
while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && (pos - left) < 2) {
sf->current = &(sf->tokenvec[pos]);
more = libinjection_sqli_tokenize(sf);
if (more) {
if (sf->current->type == TYPE_COMMENT) {
/* remember the comment but do not advance pos, so the
 * next token overwrites it in the vector
 */
st_copy(&last_comment, sf->current);
} else {
/* a non-comment token invalidates the remembered comment */
last_comment.type = CHAR_NULL;
pos += 1;
}
}
}
FOLD_DEBUG;
/* did we get 2 tokens? if not, accept what we have and go back
 * to the top of the loop, where the exit test runs
 */
if (pos - left < 2) {
left = pos;
continue;
}
/* FOLD: "ss" -> "s"
 * "foo" "bar" is valid SQL
 * just ignore second string
 */
if (sf->tokenvec[left].type == TYPE_STRING && sf->tokenvec[left+1].type == TYPE_STRING) {
pos -= 1;
sf->stats_folds += 1;
continue;
} else if (sf->tokenvec[left].type == TYPE_SEMICOLON && sf->tokenvec[left+1].type == TYPE_SEMICOLON) {
/* not sure how various engines handle
 * 'select 1;;drop table foo' or
 * 'select 1; /x foo x/; drop table foo'
 * to prevent surprises, just fold away repeated semicolons
 */
pos -= 1;
sf->stats_folds += 1;
continue;
} else if ((sf->tokenvec[left].type == TYPE_OPERATOR ||
sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR) &&
(st_is_unary_op(&sf->tokenvec[left+1]) ||
sf->tokenvec[left+1].type == TYPE_SQLTYPE)) {
/* (operator | logic operator) followed by a unary operator or a
 * SQL type: drop the second token and restart from the beginning
 */
pos -= 1;
sf->stats_folds += 1;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
st_is_unary_op(&sf->tokenvec[left+1])) {
/* "(" followed by a unary operator: drop the unary operator and
 * back up one token (if possible) so earlier rules can re-run
 */
pos -= 1;
sf->stats_folds += 1;
if (left > 0) {
left -= 1;
}
continue;
} else if (syntax_merge_words(sf, &sf->tokenvec[left], &sf->tokenvec[left+1])) {
/* the helper reported a merge (presumably combining the two words
 * into the left slot -- defined elsewhere, confirm); drop the second
 * slot and back up one token if possible
 */
pos -= 1;
sf->stats_folds += 1;
if (left > 0) {
left -= 1;
}
continue;
} else if (sf->tokenvec[left].type == TYPE_SEMICOLON &&
sf->tokenvec[left+1].type == TYPE_FUNCTION &&
(sf->tokenvec[left+1].val[0] == 'I' ||
sf->tokenvec[left+1].val[0] == 'i' ) &&
(sf->tokenvec[left+1].val[1] == 'F' ||
sf->tokenvec[left+1].val[1] == 'f' )) {
/* IF is normally a function, except in Transact-SQL where it can be used as a
 * standalone control flow operator, e.g. ; IF 1=1 ...
 * if found after a semicolon, convert from 'f' type to 'T' type
 *
 * NOTE(review): only the first two characters are compared, the token
 * length is not checked, so any function name starting with "if"
 * matches here -- confirm this is intended.
 */
sf->tokenvec[left+1].type = TYPE_TSQL;
/* left += 2; */
continue; /* reparse everything, but we probably can advance left, and pos */
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD || sf->tokenvec[left].type == TYPE_VARIABLE) &&
sf->tokenvec[left+1].type == TYPE_LEFTPARENS && (
/* TSQL functions but common enough to be column names */
cstrcasecmp("USER_ID", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("USER_NAME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
/* Function in MYSQL */
cstrcasecmp("DATABASE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("PASSWORD", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
/* Mysql words that act as a variable and are a function */
/* TSQL current_users is fake-variable */
/* http://msdn.microsoft.com/en-us/library/ms176050.aspx */
cstrcasecmp("CURRENT_USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_DATE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_TIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("CURRENT_TIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("LOCALTIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("LOCALTIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
)) {
/* One of the listed names directly followed by "(": re-type it as
 * a function.
 * pos is the same
 * other conversions need to go here... for instance
 * password CAN be a function, coalesce CAN be a function
 */
sf->tokenvec[left].type = TYPE_FUNCTION;
continue;
} else if (sf->tokenvec[left].type == TYPE_KEYWORD && (
cstrcasecmp("IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("NOT IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
)) {
if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
/* got .... IN ( ... (or 'NOT IN')
 * it's an operator
 */
sf->tokenvec[left].type = TYPE_OPERATOR;
} else {
/*
 * it's a nothing
 */
sf->tokenvec[left].type = TYPE_BAREWORD;
}
/* "IN" can be used as "IN BOOLEAN MODE" for mysql
 * in which case merging of words can be done later
 * otherwise it acts as an equality operator __ IN (values..)
 *
 * here we got "IN" "(" so it's an operator.
 * also back track to handle "NOT IN"
 * might need to do the same with like
 * two use cases "foo" LIKE "BAR" (normal operator)
 * "foo" = LIKE(1,2)
 */
continue;
} else if ((sf->tokenvec[left].type == TYPE_OPERATOR) && (
cstrcasecmp("LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
cstrcasecmp("NOT LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0)) {
if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
/* SELECT LIKE(...
 * it's a function
 */
sf->tokenvec[left].type = TYPE_FUNCTION;
}
/* no "continue" in this branch: execution falls through to the
 * three-token stage below
 */
} else if (sf->tokenvec[left].type == TYPE_SQLTYPE &&
(sf->tokenvec[left+1].type == TYPE_BAREWORD ||
sf->tokenvec[left+1].type == TYPE_NUMBER ||
sf->tokenvec[left+1].type == TYPE_SQLTYPE ||
sf->tokenvec[left+1].type == TYPE_LEFTPARENS ||
sf->tokenvec[left+1].type == TYPE_FUNCTION ||
sf->tokenvec[left+1].type == TYPE_VARIABLE ||
sf->tokenvec[left+1].type == TYPE_STRING)) {
/* SQL type followed by one of the listed token types: drop the
 * type token by moving the following token into its slot, then
 * restart from the beginning
 */
st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
pos -= 1;
sf->stats_folds += 1;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_COLLATE &&
sf->tokenvec[left+1].type == TYPE_BAREWORD) {
/*
 * there are too many collation types.. so if the bareword has a "_"
 * then it's TYPE_SQLTYPE
 */
if (strchr(sf->tokenvec[left+1].val, '_') != NULL) {
sf->tokenvec[left+1].type = TYPE_SQLTYPE;
left = 0;
}
/* no "continue" in this branch: execution falls through to the
 * three-token stage below
 */
} else if (sf->tokenvec[left].type == TYPE_BACKSLASH) {
if (st_is_arithmetic_op(&(sf->tokenvec[left+1]))) {
/* very weird case in TSQL where '\%1' is parsed as '0 % 1', etc */
sf->tokenvec[left].type = TYPE_NUMBER;
} else {
/* just ignore it.. Again T-SQL seems to parse \1 as "1" */
st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
pos -= 1;
sf->stats_folds += 1;
}
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
/* "((" -> "(" */
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
} else if (sf->tokenvec[left].type == TYPE_RIGHTPARENS &&
sf->tokenvec[left+1].type == TYPE_RIGHTPARENS) {
/* "))" -> ")" */
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
} else if (sf->tokenvec[left].type == TYPE_LEFTBRACE &&
sf->tokenvec[left+1].type == TYPE_BAREWORD) {
/*
 * MySQL Degenerate case --
 *
 * select { ``.``.id }; -- valid !!!
 * select { ``.``.``.id }; -- invalid
 * select ``.``.id; -- invalid
 * select { ``.id }; -- invalid
 *
 * so it appears {``.``.id} is a magic case
 * I suspect this is "current database, current table, field id"
 *
 * The folding code can't look at more than 3 tokens, and
 * I don't want to make two passes.
 *
 * Since "{ ``" so rare, we are just going to blacklist it.
 *
 * Highly likely this will need revisiting!
 *
 * CREDIT @rsalgado 2013-11-25
 */
if (sf->tokenvec[left+1].len == 0) {
/* zero-length bareword after "{": mark it and stop folding
 * immediately. This return bypasses the comment re-append and
 * the LIBINJECTION_SQLI_MAX_TOKENS cap applied on the normal
 * exit path.
 */
sf->tokenvec[left+1].type = TYPE_EVIL;
return (int)(left+2);
}
/* weird ODBC / MYSQL {foo expr} --> expr
 * but for this rule we just strip away the "{ foo" part
 */
left = 0;
pos -= 2;
sf->stats_folds += 2;
continue;
} else if (sf->tokenvec[left+1].type == TYPE_RIGHTBRACE) {
/* any token followed by "}": drop the "}" */
pos -= 1;
left = 0;
sf->stats_folds += 1;
continue;
}
/* all cases of handling 2 tokens are done
 * and nothing matched. Get one more token
 */
FOLD_DEBUG;
while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && pos - left < 3) {
sf->current = &(sf->tokenvec[pos]);
more = libinjection_sqli_tokenize(sf);
if (more) {
if (sf->current->type == TYPE_COMMENT) {
/* same comment handling as in the two-token fetch above */
st_copy(&last_comment, sf->current);
} else {
last_comment.type = CHAR_NULL;
pos += 1;
}
}
}
/* do we have three tokens? If not, accept what we have and go
 * back to the top of the loop, where the exit test runs
 */
if (pos -left < 3) {
left = pos;
continue;
}
/*
 * now look for three token folding
 *
 * In the rules below "pos -= 2; left = 0" keeps only the first of the
 * three tokens and restarts folding from the beginning of the vector.
 */
if (sf->tokenvec[left].type == TYPE_NUMBER &&
sf->tokenvec[left+1].type == TYPE_OPERATOR &&
sf->tokenvec[left+2].type == TYPE_NUMBER) {
/* number operator number -> number */
pos -= 2;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_OPERATOR &&
sf->tokenvec[left+1].type != TYPE_LEFTPARENS &&
sf->tokenvec[left+2].type == TYPE_OPERATOR) {
/* operator X operator (X is not "(") -> operator */
left = 0;
pos -= 2;
continue;
} else if (sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR &&
sf->tokenvec[left+2].type == TYPE_LOGIC_OPERATOR) {
/* logic-operator X logic-operator -> logic-operator
 * (the middle token is not inspected)
 */
pos -= 2;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_VARIABLE &&
sf->tokenvec[left+1].type == TYPE_OPERATOR &&
(sf->tokenvec[left+2].type == TYPE_VARIABLE ||
sf->tokenvec[left+2].type == TYPE_NUMBER ||
sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
/* variable operator (variable|number|bareword) -> variable */
pos -= 2;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
sf->tokenvec[left].type == TYPE_NUMBER ) &&
sf->tokenvec[left+1].type == TYPE_OPERATOR &&
(sf->tokenvec[left+2].type == TYPE_NUMBER ||
sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
/* (bareword|number) operator (number|bareword) -> first token */
pos -= 2;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
sf->tokenvec[left].type == TYPE_NUMBER ||
sf->tokenvec[left].type == TYPE_VARIABLE ||
sf->tokenvec[left].type == TYPE_STRING) &&
sf->tokenvec[left+1].type == TYPE_OPERATOR &&
streq(sf->tokenvec[left+1].val, "::") &&
sf->tokenvec[left+2].type == TYPE_SQLTYPE) {
/* value "::" sqltype -> value (drops the "::" and the type) */
pos -= 2;
left = 0;
sf->stats_folds += 2;
continue;
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
sf->tokenvec[left].type == TYPE_NUMBER ||
sf->tokenvec[left].type == TYPE_STRING ||
sf->tokenvec[left].type == TYPE_VARIABLE) &&
sf->tokenvec[left+1].type == TYPE_COMMA &&
(sf->tokenvec[left+2].type == TYPE_NUMBER ||
sf->tokenvec[left+2].type == TYPE_BAREWORD ||
sf->tokenvec[left+2].type == TYPE_STRING ||
sf->tokenvec[left+2].type == TYPE_VARIABLE)) {
/* value , value -> first value */
pos -= 2;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_EXPRESSION ||
sf->tokenvec[left].type == TYPE_GROUP ||
sf->tokenvec[left].type == TYPE_COMMA) &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
sf->tokenvec[left+2].type == TYPE_LEFTPARENS) {
/* got something like SELECT + (, LIMIT + (
 * remove unary operator
 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_KEYWORD ||
sf->tokenvec[left].type == TYPE_EXPRESSION ||
sf->tokenvec[left].type == TYPE_GROUP ) &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
(sf->tokenvec[left+2].type == TYPE_NUMBER ||
sf->tokenvec[left+2].type == TYPE_BAREWORD ||
sf->tokenvec[left+2].type == TYPE_VARIABLE ||
sf->tokenvec[left+2].type == TYPE_STRING ||
sf->tokenvec[left+2].type == TYPE_FUNCTION )) {
/* remove unary operators
 * select - 1
 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if (sf->tokenvec[left].type == TYPE_COMMA &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
(sf->tokenvec[left+2].type == TYPE_NUMBER ||
sf->tokenvec[left+2].type == TYPE_BAREWORD ||
sf->tokenvec[left+2].type == TYPE_VARIABLE ||
sf->tokenvec[left+2].type == TYPE_STRING)) {
/*
 * interesting case turn ", -1" ->> ",1" PLUS we need to back up
 * one token if possible to see if more folding can be done
 * "1,-1" --> "1"
 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
left = 0;
/* pos is >= 3 so this is safe (pos - left >= 3 was checked above) */
assert(pos >= 3);
pos -= 3;
continue;
} else if (sf->tokenvec[left].type == TYPE_COMMA &&
st_is_unary_op(&sf->tokenvec[left+1]) &&
sf->tokenvec[left+2].type == TYPE_FUNCTION) {
/* Separate case from above since you end up with
 * 1,-sin(1) --> 1 (1)
 * Here, just do
 * 1,-sin(1) --> 1,sin(1)
 * just remove unary operator
 */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_BAREWORD) &&
(sf->tokenvec[left+1].type == TYPE_DOT) &&
(sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
/* ignore the '.n'
 * typically this is databasename.table
 */
assert(pos >= 3);
pos -= 2;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_EXPRESSION) &&
(sf->tokenvec[left+1].type == TYPE_DOT) &&
(sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
/* select . `foo` --> select `foo` */
st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
pos -= 1;
left = 0;
continue;
} else if ((sf->tokenvec[left].type == TYPE_FUNCTION) &&
(sf->tokenvec[left+1].type == TYPE_LEFTPARENS) &&
(sf->tokenvec[left+2].type != TYPE_RIGHTPARENS)) {
/*
 * what's going on here
 * Some SQL functions like USER() have 0 args
 * if we get User(foo), then User is not a function
 * This should be expanded since it eliminated a lot of false
 * positives.
 */
if (cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0) {
sf->tokenvec[left].type = TYPE_BAREWORD;
}
/* no "continue" in this branch: falls through to "left += 1" */
}
/* no folding -- assume left-most token is
 * good, now use the existing 2 tokens --
 * do not get another
 */
left += 1;
} /* while(1) */
/* if we have 4 or less tokens, and we had a comment token
 * at the end, add it back
 */
if (left < LIBINJECTION_SQLI_MAX_TOKENS && last_comment.type == TYPE_COMMENT) {
st_copy(&sf->tokenvec[left], &last_comment);
left += 1;
}
/* sometimes we grab a 6th token to help
 * determine the type of token 5, so cap the reported count.
 */
if (left > LIBINJECTION_SQLI_MAX_TOKENS) {
left = LIBINJECTION_SQLI_MAX_TOKENS;
}
return (int)left;
}