in sync/metrichub.py [0:0]
def _extract_table_references(sql: str) -> List[str]:
"""
Return a list of tables referenced in the given SQL. Adapted from bigquery-etl:
https://github.com/mozilla/bigquery-etl/blob/12c27464b1d5c41f15a6d3d9e2463547164e3518/bigquery_etl/dependency.py#L31
"""
# sqlglot cannot handle scripts with variables and control statements
if re.search(r"^\s*DECLARE\b", sql, flags=re.MULTILINE):
return []
# sqlglot parses UDFs with keyword names incorrectly:
# https://github.com/tobymao/sqlglot/issues/1535
sql = re.sub(
r"\.(range|true|false|null)\(",
r".\1_(",
sql,
flags=re.IGNORECASE,
)
# sqlglot doesn't suppport OPTIONS on UDFs
sql = re.sub(
r"""OPTIONS\s*\(("([^"]|\\")*"|'([^']|\\')*'|[^)])*\)""",
"",
sql,
flags=re.MULTILINE | re.IGNORECASE,
)
# sqlglot doesn't fully support byte strings
sql = re.sub(
r"""b(["'])""",
r"\1",
sql,
flags=re.IGNORECASE,
)
query_statements = sqlglot.parse(sql, read="bigquery")
# If there's only one statement, and it's a Column token, it's the table name:
if len(query_statements) == 1 and isinstance(
query_statements[0], sqlglot.exp.Column
):
return [sql.replace("`", "")]
creates, tables = set(), set()
for statement in query_statements:
if statement is None:
continue
creates |= {
_raw_table_name(expr.this)
for expr in statement.find_all(sqlglot.exp.Create)
}
tables |= (
{_raw_table_name(table) for table in statement.find_all(sqlglot.exp.Table)}
# ignore references created in this query
- creates
# ignore CTEs created in this statement
- {cte.alias_or_name for cte in statement.find_all(sqlglot.exp.CTE)}
)
return sorted(tables)