bigquery_etl/glam/templates/extract_probe_counts_v1.sql (131 lines of code) (raw):

{{ header }} WITH final_probe_extract AS ( SELECT channel, app_version AS version, ping_type, os, app_build_id AS build_id, IF( app_build_id = "*", NULL, SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING) ) AS build_date, metric, metric_type, -- BigQuery has some null unicode characters which Postgresql doesn't like, -- so we remove those here. Also limit string length to 200 to match column -- length. SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key, client_agg_type, MAX(total_users) AS total_users, MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram, MAX( IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL) ) AS non_norm_histogram FROM `{{ dataset }}.{{ prefix }}__view_probe_counts_v1` WHERE total_users > {{ total_users }} AND app_version NOT IN (2015815747, 2015819723, 2015828803, 2015829155, 3015815747) GROUP BY channel, app_version, ping_type, os, app_build_id, metric, metric_type, key, client_agg_type ), -- to populate total_sample for agg_type other than 'count' glam_sample_counts AS ( SELECT fsc1.os, fsc1.app_version, fsc1.app_build_id, fsc1.metric, fsc1.key, fsc1.ping_type, fsc1.agg_type, CASE WHEN fsc1.agg_type IN ('max', 'min', 'sum', 'avg') AND fsc2.agg_type = 'count' THEN fsc2.total_sample ELSE fsc1.total_sample END AS total_sample FROM `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1 INNER JOIN `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2 ON fsc1.os = fsc2.os AND fsc1.app_build_id = fsc2.app_build_id AND fsc1.app_version = fsc2.app_version AND fsc1.metric = fsc2.metric AND fsc1.key = fsc2.key AND fsc1.ping_type = fsc2.ping_type ), -- get all the rcords from view_probe_counts and the matching from view_sample_counts ranked_data AS ( SELECT cp.channel, cp.version, cp.os, cp.ping_type, cp.build_id, cp.build_date, cp.metric, cp.metric_key, cp.client_agg_type, cp.metric_type, total_users, histogram, non_norm_histogram, CASE WHEN client_agg_type = '' THEN 0 ELSE total_sample END AS total_sample, ROW_NUMBER() OVER ( PARTITION BY cp.version, cp.os, cp.build_id, cp.ping_type, cp.metric, cp.metric_key, cp.client_agg_type, cp.metric_type, histogram, non_norm_histogram ORDER BY total_users, total_sample DESC ) AS rnk FROM final_probe_extract cp LEFT JOIN glam_sample_counts sc ON sc.os = cp.os AND sc.app_build_id = cp.build_id AND sc.app_version = cp.version AND sc.metric = cp.metric AND sc.key = cp.metric_key AND total_sample IS NOT NULL AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type = '') ) --remove duplicates SELECT channel, version, ping_type, os, build_id, build_date, metric, metric_type, metric_key, client_agg_type, total_users, histogram, non_norm_histogram, CAST(total_sample AS INT) total_sample FROM ranked_data WHERE rnk = 1