int libinjection_sqli_fold()

in apache2/libinjection/libinjection_sqli.c [1371:1889]


/*
 * Tokenize the input attached to SF and fold the resulting token stream
 * in place inside sf->tokenvec.
 *
 * Tokens are pulled one at a time with libinjection_sqli_tokenize() and
 * written into sf->tokenvec.  Windows of two, then three, adjacent tokens
 * starting at index LEFT are matched against a fixed, ordered list of
 * rules; a matching rule drops, overwrites or retypes tokens and usually
 * restarts the scan.  Some rules also bump sf->stats_folds.
 *
 * Comment tokens are never kept in tokenvec while folding (POS is not
 * advanced for them, so the next token overwrites the slot); the most
 * recent trailing comment is remembered and appended at the end if there
 * is room.
 *
 * Returns:
 *   - 0 if the tokenizer reported no more input while the leading
 *     comments / left-parens / SQL types / unary operators were skipped;
 *   - left+2 from the "{ ``" blacklist rule (token marked TYPE_EVIL);
 *   - otherwise the number of tokens kept, clamped to
 *     LIBINJECTION_SQLI_MAX_TOKENS.
 */
int libinjection_sqli_fold(struct libinjection_sqli_state * sf)
{
    /* Copy of the last comment token read, or cleared/CHAR_NULL if the
     * most recently read token was not a comment.
     */
    stoken_t last_comment;

    /* POS is the position of where the NEXT token goes */
    size_t pos = 0;

    /* LEFT is a count of how many tokens that are already
       folded or processed (i.e. part of the fingerprint) */
    size_t left =  0;

    /* return value of the last libinjection_sqli_tokenize() call;
     * zero means stop reading tokens
     */
    int more = 1;

    st_clear(&last_comment);

    /* Skip all initial comments, left-parens '(', SQL types and
     * unary operators.  Each skipped token is overwritten in
     * tokenvec[0] by the next one.
     */
    sf->current = &(sf->tokenvec[0]);
    while (more) {
        more = libinjection_sqli_tokenize(sf);
        if ( ! (sf->current->type == TYPE_COMMENT ||
                sf->current->type == TYPE_LEFTPARENS ||
                sf->current->type == TYPE_SQLTYPE ||
                st_is_unary_op(sf->current))) {
            break;
        }
    }

    if (! more) {
        /* If input was only comments, SQL types, unary or (, then exit */
        return 0;
    } else {
        /* it's some other token: keep it as tokenvec[0] */
        pos += 1;
    }

    while (1) {
        FOLD_DEBUG;

        /* do we have all the max number of tokens?  if so do
         * some special cases for 5 tokens
         *
         * The first five tokens are checked against four patterns:
         *   NUMBER   (OPERATOR|COMMA) (  NUMBER             )
         *   BAREWORD OPERATOR         (  (BAREWORD|NUMBER)  )
         *   NUMBER   )                ,  (                  NUMBER
         *   BAREWORD )                OPERATOR (            BAREWORD
         * On a match everything after tokenvec[0] is discarded and
         * folding restarts from the beginning.
         */
        if (pos >= LIBINJECTION_SQLI_MAX_TOKENS) {
            if (
                (
                    sf->tokenvec[0].type == TYPE_NUMBER &&
                    (sf->tokenvec[1].type == TYPE_OPERATOR || sf->tokenvec[1].type == TYPE_COMMA) &&
                    sf->tokenvec[2].type == TYPE_LEFTPARENS &&
                    sf->tokenvec[3].type == TYPE_NUMBER &&
                    sf->tokenvec[4].type == TYPE_RIGHTPARENS
                    ) ||
                (
                    sf->tokenvec[0].type == TYPE_BAREWORD &&
                    sf->tokenvec[1].type == TYPE_OPERATOR &&
                    sf->tokenvec[2].type == TYPE_LEFTPARENS &&
                    (sf->tokenvec[3].type == TYPE_BAREWORD || sf->tokenvec[3].type == TYPE_NUMBER) &&
                    sf->tokenvec[4].type == TYPE_RIGHTPARENS
                    ) ||
                (
                    sf->tokenvec[0].type == TYPE_NUMBER &&
                    sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
                    sf->tokenvec[2].type == TYPE_COMMA &&
                    sf->tokenvec[3].type == TYPE_LEFTPARENS &&
                    sf->tokenvec[4].type == TYPE_NUMBER
                    ) ||
                (
                    sf->tokenvec[0].type == TYPE_BAREWORD &&
                    sf->tokenvec[1].type == TYPE_RIGHTPARENS &&
                    sf->tokenvec[2].type == TYPE_OPERATOR &&
                    sf->tokenvec[3].type == TYPE_LEFTPARENS &&
                    sf->tokenvec[4].type == TYPE_BAREWORD
                    )
                )
            {
                if (pos > LIBINJECTION_SQLI_MAX_TOKENS) {
                    /* an extra look-ahead token was already read into
                     * slot MAX_TOKENS: keep it as the second token
                     */
		    st_copy(&(sf->tokenvec[1]), &(sf->tokenvec[LIBINJECTION_SQLI_MAX_TOKENS]));
                    pos = 2;
                    left = 0;
                } else {
                    pos = 1;
                    left = 0;
                }
            }
        }

        /* Stop when the tokenizer is exhausted or enough tokens have
         * been accepted; everything read so far counts as processed.
         */
        if (! more || left >= LIBINJECTION_SQLI_MAX_TOKENS) {
            left = pos;
            break;
        }

        /* get up to two tokens */
        while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && (pos - left) < 2) {
            sf->current = &(sf->tokenvec[pos]);
            more = libinjection_sqli_tokenize(sf);
            if (more) {
                if (sf->current->type == TYPE_COMMENT) {
                    /* remember the comment but do not advance pos:
                     * the slot is reused by the next token
                     */
                    st_copy(&last_comment, sf->current);
                } else {
                    last_comment.type = CHAR_NULL;
                    pos += 1;
                }
            }
        }
        FOLD_DEBUG;
        /* did we get 2 tokens? if not then we are done
         * (the check at the top of the loop decides whether to exit)
         */
        if (pos - left < 2) {
            left = pos;
            continue;
        }

        /* FOLD: "ss" -> "s"
         * "foo" "bar" is valid SQL
         * just ignore second string
         */
        if (sf->tokenvec[left].type == TYPE_STRING && sf->tokenvec[left+1].type == TYPE_STRING) {
            pos -= 1;
            sf->stats_folds += 1;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_SEMICOLON && sf->tokenvec[left+1].type == TYPE_SEMICOLON) {
            /* not sure how various engines handle
             * 'select 1;;drop table foo' or
             * 'select 1; /x foo x/; drop table foo'
             * to prevent surprises, just fold away repeated semicolons
             */
            pos -= 1;
            sf->stats_folds += 1;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_OPERATOR ||
                    sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR) &&
                   (st_is_unary_op(&sf->tokenvec[left+1]) ||
                    sf->tokenvec[left+1].type == TYPE_SQLTYPE)) {
            /* operator followed by a unary operator or SQL type:
             * drop the second token and rescan from the start
             */
            pos -= 1;
            sf->stats_folds += 1;
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
                   st_is_unary_op(&sf->tokenvec[left+1])) {
            /* "(" followed by a unary operator: drop the operator and
             * back up one token if possible
             */
            pos -= 1;
            sf->stats_folds += 1;
            if (left > 0) {
                left -= 1;
            }
            continue;
        } else if (syntax_merge_words(sf, &sf->tokenvec[left], &sf->tokenvec[left+1])) {
            /* syntax_merge_words() reported a merge of the two tokens:
             * the second slot is released and we back up one token if
             * possible
             */
            pos -= 1;
            sf->stats_folds += 1;
            if (left > 0) {
                left -= 1;
            }
            continue;
        } else if (sf->tokenvec[left].type == TYPE_SEMICOLON &&
                   sf->tokenvec[left+1].type == TYPE_FUNCTION &&
		   (sf->tokenvec[left+1].val[0] == 'I' ||
		    sf->tokenvec[left+1].val[0] == 'i' ) &&
		   (sf->tokenvec[left+1].val[1] == 'F' ||
                    sf->tokenvec[left+1].val[1] == 'f' )) {
            /* IF is normally a function, except in Transact-SQL where it can be used as a
             * standalone control flow operator, e.g. ; IF 1=1 ...
             * if found after a semicolon, convert from 'f' type to 'T' type
             *
             * NOTE(review): only the first two characters of val are
             * compared here; the token length is not checked.
             */
            sf->tokenvec[left+1].type = TYPE_TSQL;
            /* left += 2; */
            continue; /* reparse everything, but we probably can advance left, and pos */
        } else if ((sf->tokenvec[left].type == TYPE_BAREWORD || sf->tokenvec[left].type == TYPE_VARIABLE) &&
                   sf->tokenvec[left+1].type == TYPE_LEFTPARENS && (
                       /* TSQL functions but common enough to be column names */
                       cstrcasecmp("USER_ID", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("USER_NAME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||

                       /* Function in MYSQL */
                       cstrcasecmp("DATABASE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("PASSWORD", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||

                       /* Mysql words that act as a variable and are a function */

                       /* TSQL current_users is fake-variable */
                       /* http://msdn.microsoft.com/en-us/library/ms176050.aspx */
                       cstrcasecmp("CURRENT_USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("CURRENT_DATE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("CURRENT_TIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("CURRENT_TIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("LOCALTIME", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("LOCALTIMESTAMP", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
                       )) {

            /* One of the listed names directly followed by "(": retype
             * it as a function.
             *
             * pos is the same
             * other conversions need to go here... for instance
             * password CAN be a function, coalesce CAN be a function
             */
            sf->tokenvec[left].type = TYPE_FUNCTION;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_KEYWORD && (
                       cstrcasecmp("IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("NOT IN", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0
                       )) {

            if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
                /* got .... IN ( ...  (or 'NOT IN')
                 * it's an operator
                 */
                sf->tokenvec[left].type = TYPE_OPERATOR;
            } else {
                /*
                 * it's a nothing
                 */
                sf->tokenvec[left].type = TYPE_BAREWORD;
            }

            /* "IN" can be used as "IN BOOLEAN MODE" for mysql
             *  in which case merging of words can be done later
             * otherwise it acts as an equality operator __ IN (values..)
             *
             * here we got "IN" "(" so it's an operator.
             * also back track to handle "NOT IN"
             * might need to do the same with like
             * two use cases   "foo" LIKE "BAR" (normal operator)
             *  "foo" = LIKE(1,2)
             */
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_OPERATOR) && (
                       cstrcasecmp("LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0 ||
                       cstrcasecmp("NOT LIKE", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0)) {
            if (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
                /* SELECT LIKE(...
                 * it's a function
                 */
                sf->tokenvec[left].type = TYPE_FUNCTION;
            }
            /* no "continue": falls through to the three-token rules */
        } else if (sf->tokenvec[left].type == TYPE_SQLTYPE &&
                   (sf->tokenvec[left+1].type == TYPE_BAREWORD ||
                    sf->tokenvec[left+1].type == TYPE_NUMBER ||
                    sf->tokenvec[left+1].type == TYPE_SQLTYPE ||
                    sf->tokenvec[left+1].type == TYPE_LEFTPARENS ||
                    sf->tokenvec[left+1].type == TYPE_FUNCTION ||
                    sf->tokenvec[left+1].type == TYPE_VARIABLE ||
                    sf->tokenvec[left+1].type == TYPE_STRING))  {
            /* SQL type followed by a value-like token: drop the type by
             * overwriting it with the following token
             */
            st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
            pos -= 1;
            sf->stats_folds += 1;
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_COLLATE &&
                   sf->tokenvec[left+1].type == TYPE_BAREWORD) {
            /*
             * there are too many collation types.. so if the bareword has a "_"
             * then it's TYPE_SQLTYPE
             *
             * strchr() assumes val is NUL-terminated -- TODO confirm
             * against the tokenizer.
             * no "continue": falls through to the three-token rules
             * (with left reset to 0 when the bareword was retyped).
             */
            if (strchr(sf->tokenvec[left+1].val, '_') != NULL) {
                sf->tokenvec[left+1].type = TYPE_SQLTYPE;
                left = 0;
            }
        } else if (sf->tokenvec[left].type == TYPE_BACKSLASH) {
            if (st_is_arithmetic_op(&(sf->tokenvec[left+1]))) {
                /* very weird case in TSQL where '\%1' is parsed as '0 % 1', etc */
                sf->tokenvec[left].type = TYPE_NUMBER;
            } else {
                /* just ignore it.. Again T-SQL seems to parse \1 as "1" */
                st_copy(&sf->tokenvec[left], &sf->tokenvec[left+1]);
                pos -= 1;
                sf->stats_folds += 1;
            }
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_LEFTPARENS &&
                   sf->tokenvec[left+1].type == TYPE_LEFTPARENS) {
            /* "((" -> "(" */
            pos -= 1;
            left = 0;
            sf->stats_folds += 1;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_RIGHTPARENS &&
                   sf->tokenvec[left+1].type == TYPE_RIGHTPARENS) {
            /* "))" -> ")" */
            pos -= 1;
            left = 0;
            sf->stats_folds += 1;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_LEFTBRACE &&
                   sf->tokenvec[left+1].type == TYPE_BAREWORD) {

            /*
             * MySQL Degenerate case --
             *
             *   select { ``.``.id };  -- valid !!!
             *   select { ``.``.``.id };  -- invalid
             *   select ``.``.id; -- invalid
             *   select { ``.id }; -- invalid
             *
             * so it appears {``.``.id} is a magic case
             * I suspect this is "current database, current table, field id"
             *
             * The folding code can't look at more than 3 tokens, and
             * I don't want to make two passes.
             *
             * Since "{ ``" so rare, we are just going to blacklist it.
             *
             * Highly likely this will need revisiting!
             *
             * CREDIT @rsalgado 2013-11-25
             */
            if (sf->tokenvec[left+1].len == 0) {
                /* zero-length bareword after "{": mark it evil and
                 * return immediately, bypassing the clamp to
                 * LIBINJECTION_SQLI_MAX_TOKENS done on the normal
                 * return path
                 */
                sf->tokenvec[left+1].type = TYPE_EVIL;
                return (int)(left+2);
            }
            /* weird ODBC / MYSQL  {foo expr} --> expr
             * but for this rule we just strip away the "{ foo" part
             */
            left = 0;
            pos -= 2;
            sf->stats_folds += 2;
            continue;
        } else if (sf->tokenvec[left+1].type == TYPE_RIGHTBRACE) {
            /* drop any "}" in second position */
            pos -= 1;
            left = 0;
            sf->stats_folds += 1;
            continue;
        }

        /* all cases of handling 2 tokens are done
           and nothing matched.  Get one more token
        */
        FOLD_DEBUG;
        while (more && pos <= LIBINJECTION_SQLI_MAX_TOKENS && pos - left < 3) {
            sf->current = &(sf->tokenvec[pos]);
            more = libinjection_sqli_tokenize(sf);
            if (more) {
                if (sf->current->type == TYPE_COMMENT) {
                    /* same comment bookkeeping as in the two-token read */
                    st_copy(&last_comment, sf->current);
                } else {
                    last_comment.type = CHAR_NULL;
                    pos += 1;
                }
            }
        }

        /* do we have three tokens? If not then we are done */
        if (pos -left < 3) {
            left = pos;
            continue;
        }

        /*
         * now look for three token folding
         *
         * Note: of the rules below only the "::" cast rule updates
         * sf->stats_folds.
         */
        if (sf->tokenvec[left].type == TYPE_NUMBER &&
            sf->tokenvec[left+1].type == TYPE_OPERATOR &&
            sf->tokenvec[left+2].type == TYPE_NUMBER) {
            /* number operator number -> number (first token kept) */
            pos -= 2;
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_OPERATOR &&
                   sf->tokenvec[left+1].type != TYPE_LEFTPARENS &&
                   sf->tokenvec[left+2].type == TYPE_OPERATOR) {
            /* operator X operator (X not "(") -> first operator only */
            left = 0;
            pos -= 2;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_LOGIC_OPERATOR &&
                   sf->tokenvec[left+2].type == TYPE_LOGIC_OPERATOR) {
            /* logic-op <anything> logic-op -> first logic-op only */
            pos -= 2;
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_VARIABLE &&
                   sf->tokenvec[left+1].type == TYPE_OPERATOR &&
                   (sf->tokenvec[left+2].type == TYPE_VARIABLE ||
                    sf->tokenvec[left+2].type == TYPE_NUMBER ||
                    sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
            /* variable operator (variable|number|bareword) -> variable */
            pos -= 2;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
                    sf->tokenvec[left].type == TYPE_NUMBER ) &&
                   sf->tokenvec[left+1].type == TYPE_OPERATOR &&
                   (sf->tokenvec[left+2].type == TYPE_NUMBER ||
                    sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
            /* (bareword|number) operator (number|bareword) -> first token */
            pos -= 2;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
                    sf->tokenvec[left].type == TYPE_NUMBER ||
                    sf->tokenvec[left].type == TYPE_VARIABLE ||
                    sf->tokenvec[left].type == TYPE_STRING) &&
                   sf->tokenvec[left+1].type == TYPE_OPERATOR &&
                   streq(sf->tokenvec[left+1].val, "::") &&
                   sf->tokenvec[left+2].type == TYPE_SQLTYPE) {
            /* value :: sqltype -> value
             * (streq presumably needs val NUL-terminated -- TODO confirm)
             */
            pos -= 2;
            left = 0;
            sf->stats_folds += 2;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_BAREWORD ||
                    sf->tokenvec[left].type == TYPE_NUMBER ||
                    sf->tokenvec[left].type == TYPE_STRING ||
                    sf->tokenvec[left].type == TYPE_VARIABLE) &&
                   sf->tokenvec[left+1].type == TYPE_COMMA &&
                   (sf->tokenvec[left+2].type == TYPE_NUMBER ||
                    sf->tokenvec[left+2].type == TYPE_BAREWORD ||
                    sf->tokenvec[left+2].type == TYPE_STRING ||
                    sf->tokenvec[left+2].type == TYPE_VARIABLE)) {
            /* value , value -> first value */
            pos -= 2;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_EXPRESSION ||
                    sf->tokenvec[left].type == TYPE_GROUP ||
                    sf->tokenvec[left].type == TYPE_COMMA) &&
                   st_is_unary_op(&sf->tokenvec[left+1]) &&
                   sf->tokenvec[left+2].type == TYPE_LEFTPARENS) {
            /* got something like SELECT + (, LIMIT + (
             * remove unary operator
             */
            st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
            pos -= 1;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_KEYWORD ||
                    sf->tokenvec[left].type == TYPE_EXPRESSION ||
                    sf->tokenvec[left].type == TYPE_GROUP )  &&
                   st_is_unary_op(&sf->tokenvec[left+1]) &&
                   (sf->tokenvec[left+2].type == TYPE_NUMBER ||
                    sf->tokenvec[left+2].type == TYPE_BAREWORD ||
                    sf->tokenvec[left+2].type == TYPE_VARIABLE ||
                    sf->tokenvec[left+2].type == TYPE_STRING ||
                    sf->tokenvec[left+2].type == TYPE_FUNCTION )) {
            /* remove unary operators
             * select - 1
             */
            st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
            pos -= 1;
            left = 0;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_COMMA &&
                   st_is_unary_op(&sf->tokenvec[left+1]) &&
                   (sf->tokenvec[left+2].type == TYPE_NUMBER ||
                    sf->tokenvec[left+2].type == TYPE_BAREWORD ||
                    sf->tokenvec[left+2].type == TYPE_VARIABLE ||
                    sf->tokenvec[left+2].type == TYPE_STRING)) {
            /*
             * interesting case    turn ", -1"  ->> ",1" PLUS we need to back up
             * one token if possible to see if more folding can be done
             * "1,-1" --> "1"
             */
            st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
            left = 0;
            /* pos is >= 3 so this is safe
             * (we only get here when pos - left >= 3)
             */
            assert(pos >= 3);
            pos -= 3;
            continue;
        } else if (sf->tokenvec[left].type == TYPE_COMMA &&
                   st_is_unary_op(&sf->tokenvec[left+1]) &&
                   sf->tokenvec[left+2].type == TYPE_FUNCTION) {

            /* Separate case from above since you end up with
             * 1,-sin(1) --> 1 (1)
             * Here, just do
             * 1,-sin(1) --> 1,sin(1)
             * just remove unary operator
             */
            st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
            pos -= 1;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_BAREWORD) &&
                   (sf->tokenvec[left+1].type == TYPE_DOT) &&
                   (sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
            /* ignore the '.n'
             * typically this is databasename.table
             */
            assert(pos >= 3);
            pos -= 2;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_EXPRESSION) &&
                   (sf->tokenvec[left+1].type == TYPE_DOT) &&
                   (sf->tokenvec[left+2].type == TYPE_BAREWORD)) {
            /* select . `foo` --> select `foo` */
            st_copy(&sf->tokenvec[left+1], &sf->tokenvec[left+2]);
            pos -= 1;
            left = 0;
            continue;
        } else if ((sf->tokenvec[left].type == TYPE_FUNCTION) &&
                   (sf->tokenvec[left+1].type == TYPE_LEFTPARENS) &&
                   (sf->tokenvec[left+2].type != TYPE_RIGHTPARENS)) {
            /*
             * What is going on here:
             * Some SQL functions like USER() have 0 args
             * if we get User(foo), then User is not a function
             * This should be expanded since it eliminated a lot of false
             * positives.
             *
             * no "continue": the token is retyped and then accepted
             * by the "left += 1" below.
             */
            if  (cstrcasecmp("USER", sf->tokenvec[left].val, sf->tokenvec[left].len) == 0) {
                sf->tokenvec[left].type = TYPE_BAREWORD;
            }
        }

        /* no folding -- assume left-most token is
           good, now use the existing 2 tokens --
           do not get another
        */

        left += 1;

    } /* while(1) */

    /* if we have 4 or less tokens, and we had a comment token
     * at the end, add it back
     */

    if (left < LIBINJECTION_SQLI_MAX_TOKENS && last_comment.type == TYPE_COMMENT) {
        st_copy(&sf->tokenvec[left], &last_comment);
        left += 1;
    }

    /* sometimes we grab a 6th token to help
       determine the type of token 5.
       Clamp so the caller never sees more than the maximum.
    */
    if (left > LIBINJECTION_SQLI_MAX_TOKENS) {
        left = LIBINJECTION_SQLI_MAX_TOKENS;
    }

    return (int)left;
}