def tokenize()

in bigquery_etl/format_sql/tokenizer.py


def tokenize(query, token_priority=BIGQUERY_TOKEN_PRIORITY) -> Iterator[Token]:
    """Split query into a series of tokens."""
    open_blocks: list[BlockStartKeyword] = []
    open_angle_brackets = 0
    angle_bracket_is_operator = True
    reserved_keyword_is_identifier = False
    while query:
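        # try token types in priority order; the first matching pattern wins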
        for token_type in token_priority:
            match = token_type.pattern.match(query)
            if not match:
                continue
            token = token_type(match.group())
            # handle stateful matches
            if isinstance(token, MaybeCaseSubclause):
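                # a CASE subclause keyword (e.g. WHEN/THEN/ELSE) binds to an
                # enclosing CASE; otherwise it is a plain block-middle keyword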
                if open_blocks and open_blocks[-1].value.upper() == "CASE":
                    token = CaseSubclause(token.value)
                else:
                    token = BlockMiddleKeyword(token.value)
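            # "<" and ">" are comparison operators by default; they only open
            # or close brackets in type parameters such as ARRAY<INT64>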
            elif isinstance(token, MaybeOpeningAngleBracket):
                if angle_bracket_is_operator:
                    continue  # prevent matching operator as opening bracket
                token = OpeningBracket(token.value)
                open_angle_brackets += 1
            elif isinstance(token, MaybeClosingAngleBracket):
                if angle_bracket_is_operator:
                    continue  # prevent matching operator as closing bracket
                token = ClosingBracket(token.value)
                open_angle_brackets -= 1
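            # after a field access operator or alias separator, a reserved
            # keyword (e.g. the "select" in `tbl.select`) is really an identifier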
            elif (
                reserved_keyword_is_identifier
                and isinstance(token, ReservedKeyword)
                and Identifier.pattern.match(token.value) is not None
            ):
                continue  # prevent matching identifier as keyword
            yield token
            length = len(token.value)
            query = query[length:]
            # update stateful conditions for next token
            if isinstance(token, BlockEndKeyword) and open_blocks:
                open_blocks.pop()
            if isinstance(token, BlockStartKeyword):
                open_blocks.append(token)
            if not isinstance(token, (Comment, Whitespace)):
                # angle brackets are operators unless already in angle bracket
                # block or preceded by an AngleBracketKeyword
                angle_bracket_is_operator = not (
                    open_angle_brackets > 0 or isinstance(token, AngleBracketKeyword)
                )
                # field access operators and alias separators may be followed
                # by an identifier that would otherwise be a reserved keyword.
                reserved_keyword_is_identifier = isinstance(
                    token, (FieldAccessOperator, AliasSeparator)
                )
            break
        else:
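            # for-else: reached only when no token type matched the remaining query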
            raise ValueError(f"Could not determine next token in {query!r}")
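

A minimal usage sketch. The import path follows the file location above; the
token classes and BIGQUERY_TOKEN_PRIORITY referenced in the body are
module-level definitions in the same file. The query string is illustrative,
and it assumes ARRAY is registered as an AngleBracketKeyword (not confirmed
by this excerpt):

from bigquery_etl.format_sql.tokenizer import tokenize

# "ARRAY<INT64>[1, 2]" exercises the angle bracket state: if ARRAY is an
# AngleBracketKeyword, the "<" and ">" around INT64 tokenize as brackets
# rather than comparison operators
for token in tokenize("SELECT ARRAY<INT64>[1, 2] AS ints"):
    print(f"{type(token).__name__}: {token.value!r}")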