backfill/2024-05-07-telemetry-pings-os-distro/generate_statements.py (24 lines of code) (raw):

import subprocess # use bash to avoid python dependencies result = subprocess.check_output( [ "bq", "query", "--format=sparse", "--nouse_legacy_sql", "--max_rows=1000", "SELECT DISTINCT(_TABLE_SUFFIX) FROM `moz-fx-data-backfill-1.telemetry_os_distro_output.*` ORDER BY 1", ] ) tables = result.decode().split()[2:] CREATE_STATEMENT_TEMPLATE = """-- {table} CREATE TABLE `moz-fx-data-backfill-1.telemetry_os_distro_deduped.{table}` LIKE `moz-fx-data-backfill-1.telemetry_os_distro_output.{table}` AS ( WITH existing_doc_ids AS ( SELECT document_id FROM `moz-fx-data-shared-prod.telemetry_stable.{table}` WHERE DATE(submission_timestamp) BETWEEN '2024-01-16' AND '2024-05-02' ), new_rows AS ( SELECT * FROM `moz-fx-data-backfill-1.telemetry_os_distro_output.{table}` WHERE DATE(submission_timestamp) BETWEEN '2024-01-16' AND '2024-05-02' QUALIFY ROW_NUMBER() OVER (PARTITION BY document_id ORDER BY submission_timestamp) = 1 ) SELECT new_rows.* FROM new_rows LEFT JOIN existing_doc_ids USING (document_id) WHERE existing_doc_ids.document_id IS NULL ); """ creates = [] for table in tables: creates.append(CREATE_STATEMENT_TEMPLATE.format(table=table)) with open("dedupe_pings.sql", "w") as f: f.write("\n".join(creates)) INSERT_STATEMENT_TEMPLATE = """-- {table} INSERT INTO `moz-fx-data-shared-prod.telemetry_stable.{table}` SELECT * FROM `moz-fx-data-backfill-1.telemetry_os_distro_deduped.{table}` WHERE DATE(submission_timestamp) BETWEEN '2024-01-16' AND '2024-05-02'; """ inserts = [] for table in tables: inserts.append(INSERT_STATEMENT_TEMPLATE.format(table=table)) with open("final_insert.sql", "w") as f: f.write("\n".join(inserts))