in src/backend/utils/adt/agtype_parser.c [528:730]
/*
 * agtype_lex
 *
 * Scan the next token of the input described by 'lex'.
 *
 * Scanning resumes at lex->token_terminator. Leading blanks, tabs, carriage
 * returns and newlines are skipped (each newline bumps lex->line_number).
 * For the tokens recognized directly in this function the following fields
 * are updated:
 *
 *   token_start            first byte of the token (NULL at end of input)
 *   prev_token_terminator  the previous value of token_terminator
 *   token_terminator       one past the last byte of the token
 *   token_type             the kind of token found
 *
 * Strings and numbers are handed off to agtype_lex_string() and
 * agtype_lex_number(); those helpers are presumably responsible for the
 * terminator bookkeeping of their tokens -- confirm against their
 * definitions. Malformed words are reported through report_invalid_token().
 *
 * Every read of the input is bounded by lex->input_length, so the input is
 * not required to be NUL-terminated for the code in this function.
 */
static inline void agtype_lex(agtype_lex_context *lex)
{
    char *s;
    int len;

    /* Skip leading whitespace; 'len' tracks the offset of 's' in the input. */
    s = lex->token_terminator;
    len = s - lex->input;
    while (len < lex->input_length &&
           (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
    {
        if (*s == '\n')
            ++lex->line_number;
        ++s;
        ++len;
    }
    lex->token_start = s;

    /* Determine token type. */
    if (len >= lex->input_length)
    {
        /* Nothing but whitespace was left: signal end of input. */
        lex->token_start = NULL;
        lex->prev_token_terminator = lex->token_terminator;
        lex->token_terminator = s;
        lex->token_type = AGTYPE_TOKEN_END;
    }
    else
    {
        switch (*s)
        {
            /* Single-character token, some kind of punctuation mark. */
            case '{':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_OBJECT_START;
                break;
            case '}':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_OBJECT_END;
                break;
            case '[':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_ARRAY_START;
                break;
            case ']':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_ARRAY_END;
                break;
            case ',':
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = s + 1;
                lex->token_type = AGTYPE_TOKEN_COMMA;
                break;
            case ':':
                /*
                 * Either an annotation '::' or a plain colon. Only look at
                 * the following byte if it is still inside the input.
                 */
                if ((len < lex->input_length - 1) && *(s + 1) == ':')
                {
                    s += 2;
                    lex->prev_token_terminator = lex->token_terminator;
                    lex->token_terminator = s;
                    lex->token_type = AGTYPE_TOKEN_ANNOTATION;
                }
                else
                {
                    lex->prev_token_terminator = lex->token_terminator;
                    lex->token_terminator = s + 1;
                    lex->token_type = AGTYPE_TOKEN_COLON;
                }
                break;
            case '"':
                /* string */
                agtype_lex_string(lex);
                lex->token_type = AGTYPE_TOKEN_STRING;
                break;
            case '-':
                /*
                 * Negative numbers and special float values.
                 *
                 * The byte after '-' is inspected only when it lies inside
                 * the input, mirroring the look-ahead guard used for '::'.
                 * Without the guard a trailing '-' would cause a read past
                 * the end of a length-delimited input.
                 */
                if ((len < lex->input_length - 1) &&
                    (*(s + 1) == 'i' || *(s + 1) == 'I'))
                {
                    char *s1 = s + 1;
                    char *p = s1;
                    int word_len;

                    /* advance p to the end of the alphabetic word */
                    while (p - s < lex->input_length - len &&
                           ((*p >= 'a' && *p <= 'z') ||
                            (*p >= 'A' && *p <= 'Z')))
                        p++;

                    /* update the terminators */
                    lex->prev_token_terminator = lex->token_terminator;
                    lex->token_terminator = p;

                    /* invalid unless the word is "inf" or "Infinity" */
                    lex->token_type = AGTYPE_TOKEN_INVALID;
                    word_len = p - s1;
                    switch (word_len)
                    {
                        case 3:
                            if (pg_strncasecmp(s1, "inf", word_len) == 0)
                                lex->token_type = AGTYPE_TOKEN_FLOAT;
                            break;
                        case 8:
                            if (pg_strncasecmp(s1, "Infinity", word_len) == 0)
                                lex->token_type = AGTYPE_TOKEN_FLOAT;
                            break;
                        default:
                            break;
                    }

                    if (lex->token_type == AGTYPE_TOKEN_INVALID)
                        report_invalid_token(lex);
                }
                else
                {
                    /* token is assigned in agtype_lex_number */
                    agtype_lex_number(lex, s + 1, NULL, NULL);
                }
                break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                /* Positive number; token is assigned in agtype_lex_number. */
                agtype_lex_number(lex, s, NULL, NULL);
                break;
            default:
            {
                char *p;
                int word_len;

                /*
                 * We're not dealing with a string, number, legal
                 * punctuation mark, or end of string. For error reporting
                 * purposes we scan until we see a non-alphanumeric
                 * character. That way, we can report the whole word as an
                 * unexpected token, rather than just some unintuitive
                 * prefix thereof.
                 */
                for (p = s;
                     p - s < lex->input_length - len &&
                     AGTYPE_ALPHANUMERIC_CHAR(*p);
                     p++)
                    /* skip */;

                /*
                 * Nothing alphanumeric was consumed, so this is some sort
                 * of unexpected punctuation or otherwise unexpected
                 * character: complain about just that one character.
                 *
                 * NOTE(review): the code below assumes
                 * report_invalid_token() does not return -- confirm.
                 */
                if (p == s)
                {
                    lex->prev_token_terminator = lex->token_terminator;
                    lex->token_terminator = s + 1;
                    report_invalid_token(lex);
                }

                /* We've got a real alphanumeric word; record its extent. */
                lex->prev_token_terminator = lex->token_terminator;
                lex->token_terminator = p;

                /* it is an identifier, unless proven otherwise */
                lex->token_type = AGTYPE_TOKEN_IDENTIFIER;
                word_len = p - s;
                switch (word_len)
                {
                    /*
                     * A note about the mixture of case sensitivity and
                     * case insensitivity - the original code adheres to
                     * the JSON spec where true, false, and null are
                     * strictly lower case. The Postgres float logic, on
                     * the other hand, is case insensitive, allowing for
                     * possibly many different input sources for float
                     * values. Hence, the mixture of the two.
                     */
                    case 3:
                        if ((pg_strncasecmp(s, "NaN", word_len) == 0) ||
                            (pg_strncasecmp(s, "inf", word_len) == 0))
                            lex->token_type = AGTYPE_TOKEN_FLOAT;
                        break;
                    case 4:
                        if (memcmp(s, "true", word_len) == 0)
                            lex->token_type = AGTYPE_TOKEN_TRUE;
                        else if (memcmp(s, "null", word_len) == 0)
                            lex->token_type = AGTYPE_TOKEN_NULL;
                        break;
                    case 5:
                        if (memcmp(s, "false", word_len) == 0)
                            lex->token_type = AGTYPE_TOKEN_FALSE;
                        break;
                    case 8:
                        if (pg_strncasecmp(s, "Infinity", word_len) == 0)
                            lex->token_type = AGTYPE_TOKEN_FLOAT;
                        break;
                    default:
                        break;
                }
            } /* end of default case */
        } /* end of switch */
    }
}