static inline void agtype_lex()

in src/backend/utils/adt/agtype_parser.c [528:730]


/*
 * agtype_lex
 *
 * Lex one token from the input described by lex, starting at
 * lex->token_terminator (where the previous token ended).
 *
 * Leading whitespace (space, tab, newline, carriage return) is skipped and
 * lex->line_number is incremented for each newline skipped.  All scanning is
 * bounded by lex->input_length; the input is not assumed to be
 * NUL-terminated.
 *
 * On return the following fields of lex describe the token:
 *   token_start           - first byte of the token, or NULL at end of input
 *   token_terminator      - first byte after the token
 *   prev_token_terminator - value token_terminator had on entry
 *   token_type            - one of the AGTYPE_TOKEN_* values
 *
 * Strings and numbers are handed off to agtype_lex_string() and
 * agtype_lex_number(); for those two the terminator fields are not touched
 * here, so they are presumably maintained by the helpers (confirm against
 * their definitions).  Unrecognized input is passed to
 * report_invalid_token(), which is presumably an error exit that does not
 * return (confirm against its definition).
 */
static inline void agtype_lex(agtype_lex_context *lex)
{
    char *s;
    int len;

    /*
     * Skip leading whitespace.  len tracks the offset of s from the start of
     * the input so it can be compared against input_length.
     */
    s = lex->token_terminator;
    len = s - lex->input;
    while (len < lex->input_length &&
           (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
    {
        if (*s == '\n')
            ++lex->line_number;
        ++s;
        ++len;
    }
    lex->token_start = s;

    /* Determine token type. */
    if (len >= lex->input_length)
    {
        /* Nothing left but (possibly) whitespace: end of input. */
        lex->token_start = NULL;
        lex->prev_token_terminator = lex->token_terminator;
        lex->token_terminator = s;
        lex->token_type = AGTYPE_TOKEN_END;
    }
    else
    {
        switch (*s)
        {
            /* Single-character token, some kind of punctuation mark. */
        case '{':
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = s + 1;
            lex->token_type = AGTYPE_TOKEN_OBJECT_START;
            break;
        case '}':
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = s + 1;
            lex->token_type = AGTYPE_TOKEN_OBJECT_END;
            break;
        case '[':
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = s + 1;
            lex->token_type = AGTYPE_TOKEN_ARRAY_START;
            break;
        case ']':
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = s + 1;
            lex->token_type = AGTYPE_TOKEN_ARRAY_END;
            break;
        case ',':
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = s + 1;
            lex->token_type = AGTYPE_TOKEN_COMMA;
            break;
        case ':':
            /*
             * If this is an annotation '::'.  Only look at the next byte
             * when there is one.
             */
            if ((len < lex->input_length - 1) && *(s + 1) == ':')
            {
                s += 2;
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s;
                lex->token_type = AGTYPE_TOKEN_ANNOTATION;
            }
            else
            {
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_COLON;
            }
            break;
        case '"':
            /* string */
            agtype_lex_string(lex);
            lex->token_type = AGTYPE_TOKEN_STRING;
            break;
        case '-':
            /*
             * Negative numbers and special float values.
             *
             * The look-ahead is guarded by the remaining input length, the
             * same way the '::' check above is, so that a '-' that is the
             * very last input byte never causes a read past the end of the
             * buffer.  In that case we fall through to agtype_lex_number(),
             * exactly as for any '-' not followed by 'i'/'I'.
             */
            if ((len < lex->input_length - 1) &&
                (*(s + 1) == 'i' || *(s + 1) == 'I'))
            {
                char *s1 = s + 1;
                char *p = s1;

                /* advance p to the end of the token (ASCII letters only) */
                while (p - s < lex->input_length - len &&
                       ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')))
                    p++;

                /* update the terminators */
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = p;

                /*
                 * Invalid unless the word after '-' is "inf" or "Infinity"
                 * (case-insensitive).  From here on len is reused as the
                 * length of that word rather than an input offset.
                 */
                lex->token_type = AGTYPE_TOKEN_INVALID;
                len = p - s1;
                switch (len)
                {
                case 3:
                    if (pg_strncasecmp(s1, "inf", len) == 0)
                        lex->token_type = AGTYPE_TOKEN_FLOAT;
                    break;
                case 8:
                    if (pg_strncasecmp(s1, "Infinity", len) == 0)
                        lex->token_type = AGTYPE_TOKEN_FLOAT;
                    break;
                }
                if (lex->token_type == AGTYPE_TOKEN_INVALID)
                    report_invalid_token(lex);
            }
            else
            {
                agtype_lex_number(lex, s + 1, NULL, NULL);
            }
            /* token is assigned in agtype_lex_number */
            break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            /* Positive number. */
            agtype_lex_number(lex, s, NULL, NULL);
            /* token is assigned in agtype_lex_number */
            break;
        default:
        {
            char *p;

            /*
             * We're not dealing with a string, number, legal
             * punctuation mark, or end of string.  The only legal
             * tokens we might find here are true, false, null, the
             * special float words, and identifiers, but for error
             * reporting purposes we scan until we see a
             * non-alphanumeric character.  That way, we can report
             * the whole word as an unexpected token, rather than just
             * some unintuitive prefix thereof.
             */
            for (p = s; p - s < lex->input_length - len &&
                        AGTYPE_ALPHANUMERIC_CHAR(*p);
                 p++)
                /* skip */;

            /*
             * We got some sort of unexpected punctuation or an
             * otherwise unexpected character, so just complain about
             * that one character.
             */
            if (p == s)
            {
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                report_invalid_token(lex);
            }

            /*
             * We've got a real alphanumeric token here.  If it
             * happens to be true, false, null, or a special float
             * word, it gets that token type; anything else is left
             * as an identifier.
             */
            lex->prev_token_terminator = lex->token_terminator;
            lex->token_terminator = p;

            /* it is an identifier, unless proven otherwise */
            lex->token_type = AGTYPE_TOKEN_IDENTIFIER;

            /* len is reused here as the length of the word, not an offset */
            len = p - s;
            switch (len)
            {
            /* A note about the mixture of case and case insensitivity -
             * The original code adheres to the JSON spec where true,
             * false, and null are strictly lower case. The Postgres float
             * logic, on the other hand, is case insensitive, allowing for
             * possibly many different input sources for float values. Hence,
             * the mixture of the two.
             */
            case 3:
                if ((pg_strncasecmp(s, "NaN", len) == 0) ||
                    (pg_strncasecmp(s, "inf", len) == 0))
                    lex->token_type = AGTYPE_TOKEN_FLOAT;
                break;
            case 4:
                if (memcmp(s, "true", len) == 0)
                    lex->token_type = AGTYPE_TOKEN_TRUE;
                else if (memcmp(s, "null", len) == 0)
                    lex->token_type = AGTYPE_TOKEN_NULL;
                break;
            case 5:
                if (memcmp(s, "false", len) == 0)
                    lex->token_type = AGTYPE_TOKEN_FALSE;
                break;
            case 8:
                if (pg_strncasecmp(s, "Infinity", len) == 0)
                    lex->token_type = AGTYPE_TOKEN_FLOAT;
                break;
            }
        } /* end of default case */
        } /* end of switch */
    }
}