in bigquery_etl/format_sql/tokenizer.py
def tokenize(query, token_priority=BIGQUERY_TOKEN_PRIORITY) -> Iterator[Token]:
    """Split query into a series of tokens."""
    open_blocks: list[BlockStartKeyword] = []
    open_angle_brackets = 0
    angle_bracket_is_operator = True
    reserved_keyword_is_identifier = False
    while query:
        # try token types in priority order; the first pattern that matches wins
        for token_type in token_priority:
            match = token_type.pattern.match(query)
            if not match:
                continue
            token = token_type(match.group())
            # handle stateful matches
            if isinstance(token, MaybeCaseSubclause):
                if open_blocks and open_blocks[-1].value.upper() == "CASE":
                    token = CaseSubclause(token.value)
                else:
                    token = BlockMiddleKeyword(token.value)
            elif isinstance(token, MaybeOpeningAngleBracket):
                if angle_bracket_is_operator:
                    continue  # prevent matching operator as opening bracket
                token = OpeningBracket(token.value)
                open_angle_brackets += 1
            elif isinstance(token, MaybeClosingAngleBracket):
                if angle_bracket_is_operator:
                    continue  # prevent matching operator as closing bracket
                token = ClosingBracket(token.value)
                open_angle_brackets -= 1
            elif (
                reserved_keyword_is_identifier
                and isinstance(token, ReservedKeyword)
                and Identifier.pattern.match(token.value) is not None
            ):
                continue  # prevent matching identifier as keyword
            yield token
            length = len(token.value)
            query = query[length:]
            # update stateful conditions for next token
            if isinstance(token, BlockEndKeyword) and open_blocks:
                open_blocks.pop()
            if isinstance(token, BlockStartKeyword):
                open_blocks.append(token)
            if not isinstance(token, (Comment, Whitespace)):
                # angle brackets are operators unless already in an angle bracket
                # block or preceded by an AngleBracketKeyword
                angle_bracket_is_operator = not (
                    open_angle_brackets > 0 or isinstance(token, AngleBracketKeyword)
                )
                # field access operator may be followed by an identifier that
                # would otherwise be a reserved keyword.
                reserved_keyword_is_identifier = isinstance(
                    token, (FieldAccessOperator, AliasSeparator)
                )
            break
        else:
            # for-else: no token type matched the head of the remaining query
            raise ValueError(f"Could not determine next token in {query!r}")
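
A minimal usage sketch (not part of the original file), assuming the module is
importable under the path shown in the header: tokenize() is a generator, so
draining it prints each matched token alongside the Token subclass that
classified it.

    from bigquery_etl.format_sql.tokenizer import tokenize

    for token in tokenize("SELECT ARRAY<INT64>[1, 2] AS nums"):
        # the subclass name identifies the token's role; value is the matched text
        print(f"{type(token).__name__:>24}  {token.value!r}")

Because tokens are yielded as soon as a pattern matches, the priority order in
BIGQUERY_TOKEN_PRIORITY, together with the stateful flags, is what resolves
ambiguities such as "<" being either a comparison operator or the opening
bracket of an ARRAY<...> or STRUCT<...> type.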