backfill/2022-01-27-sanitize-search-engines/org_mozilla_fenix.sql (146 lines of code) (raw):

CREATE TEMP FUNCTION sanitize_labeled_counter(input ARRAY<STRUCT<key STRING, value INT64>>) AS ( ARRAY( WITH base AS ( SELECT *, REGEXP_EXTRACT(key, r"([^.]+[.]in-content[.][^.]+[.])[^.]*[.]?.*") AS prefix, REGEXP_EXTRACT(key, r"[^.]+[.]in-content[.][^.]+[.]([^.]*)[.]?.*") AS code, REGEXP_EXTRACT(key, r"[^.]+[.]in-content[.][^.]+[.][^.]*[.]?(.*)") AS channel, FROM UNNEST(input) ), validations AS ( SELECT *, code IN ( "none", "other", "hz", "h_", "moz2", "moz4", "moz5", "moza", "mozb", "mozd", "moze", "mozi", "mozm", "mozo", "mozt", "mozw", "mozsl01", "mozsl02", "mozsl03", "firefox-a", "firefox-b", "firefox-b-1", "firefox-b-ab", "firefox-b-1-ab", "firefox-b-d", "firefox-b-1-d", "firefox-b-e", "firefox-b-1-e", "firefox-b-m", "firefox-b-1-m", "firefox-b-o", "firefox-b-1-o", "firefox-b-lm", "firefox-b-1-lm", "firefox-b-lg", "firefox-b-huawei-h1611", "firefox-b-is-oem1", "firefox-b-oem1", "firefox-b-oem2", "firefox-b-tinno", "firefox-b-pn-wt", "firefox-b-pn-wt-us", "ubuntu", "ffab", "ffcm", "ffhp", "ffip", "ffit", "ffnt", "ffocus", "ffos", "ffsb", "fpas", "fpsa", "ftas", "ftsa", "newext", "monline_dg", "monline_3_dg", "monline_4_dg", "monline_7_dg", "_1000969a", "_1000969b" ) AS code_is_valid, channel IS NULL OR channel = "" OR channel = "ts" AS channel_is_valid, FROM base ), -- This logic should match https://github.com/mozilla/gcp-ingestion/blob/b88da84c017ee6251947b5223019bee90d20ea3b/ingestion-beam/src/main/java/com/mozilla/telemetry/decoder/MessageScrubber.java#L456-L489 scrubbed AS ( SELECT * REPLACE ( CASE -- If prefix is null, this didn't match the regex and we want to pass through unchanged WHEN prefix IS NULL THEN key WHEN code_is_valid AND channel_is_valid THEN key WHEN code_is_valid AND NOT channel_is_valid THEN FORMAT("%s%s", prefix, code) ELSE FORMAT("%s%s", prefix, "other-scrubbed") END AS key ) FROM validations ) SELECT AS STRUCT key, SUM(value) AS value FROM scrubbed GROUP BY key ) ); CREATE OR REPLACE TABLE `moz-fx-data-backfill-20.org_mozilla_fenix_stable.metrics_v1` LIKE `moz-fx-data-shared-prod.org_mozilla_fenix_stable.metrics_v1` AS SELECT * REPLACE ( ( SELECT AS STRUCT metrics.* REPLACE ( ( SELECT AS STRUCT metrics.labeled_counter.* REPLACE ( sanitize_labeled_counter( metrics.labeled_counter.browser_search_ad_clicks ) AS browser_search_ad_clicks, sanitize_labeled_counter( metrics.labeled_counter.browser_search_in_content ) AS browser_search_in_content ) ) AS labeled_counter ) ) AS metrics ) FROM `moz-fx-data-shared-prod.org_mozilla_fenix_stable.metrics_v1` WHERE -- all time DATE(submission_timestamp) >= "2010-01-01"