-- bigquery_etl/glam/templates/probe_counts_v1.sql

{{ header }}
{% if not is_scalar %}
-- Reconstructs the bucket boundaries for a histogram metric from its type and
-- range metadata (see the inspection sketch in the comment block at the end of
-- this file).
CREATE TEMP FUNCTION udf_get_buckets(
  metric_type STRING,
  range_min INT64,
  range_max INT64,
  bucket_count INT64
)
RETURNS ARRAY<INT64> AS (
  (
    WITH buckets AS (
      SELECT
        CASE
          -- We use ENDS_WITH here to accommodate types prefixed with "labeled_".
          WHEN ENDS_WITH(metric_type, 'timing_distribution')
            -- https://mozilla.github.io/glean/book/user/metrics/timing_distribution.html
            THEN mozfun.glam.histogram_generate_functional_buckets(2, 8, range_max)
          WHEN ENDS_WITH(metric_type, 'memory_distribution')
            -- https://mozilla.github.io/glean/book/user/metrics/memory_distribution.html
            THEN mozfun.glam.histogram_generate_functional_buckets(2, 16, range_max)
          WHEN ENDS_WITH(metric_type, 'custom_distribution_exponential')
            THEN mozfun.glam.histogram_generate_exponential_buckets(
              range_min,
              range_max,
              bucket_count
            )
          WHEN ENDS_WITH(metric_type, 'custom_distribution_linear')
            THEN mozfun.glam.histogram_generate_linear_buckets(
              range_min,
              range_max,
              bucket_count
            )
          ELSE []
        END AS arr
    )
    SELECT
      ARRAY_AGG(CAST(item AS INT64))
    FROM
      buckets
    CROSS JOIN
      UNNEST(arr) AS item
  )
);
{% endif %}
WITH probe_counts AS (
  SELECT
    {{ attributes }},
    {{ aggregate_attributes }},
    {% if is_scalar %}
      client_agg_type,
      agg_type,
      {% if channel == "release" %}
        -- Count clients based on the sampled Windows release data, which
        -- started in v119. If you change this, you'll also need to change
        -- clients_daily_[scalar | histogram]_aggregates.
        IF(os = 'Windows' AND app_version >= 119, SUM(count) * 10, SUM(count)) AS total_users,
      {% else %}
        SUM(count) AS total_users,
      {% endif %}
      mozfun.glam.histogram_fill_buckets_dirichlet(
        mozfun.map.sum(ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, count))),
        CASE
          WHEN metric_type IN ({{ scalar_metric_types }})
            THEN ARRAY(
                SELECT
                  FORMAT("%.*f", 2, bucket)
                FROM
                  UNNEST(
                    mozfun.glam.histogram_generate_scalar_buckets(
                      range_min,
                      range_max,
                      bucket_count
                    )
                  ) AS bucket
                ORDER BY
                  bucket
              )
          WHEN metric_type IN ({{ boolean_metric_types }})
            THEN ['always', 'never', 'sometimes']
        END,
        SUM(count)
      ) AS aggregates
    {% else %}
      agg_type AS client_agg_type,
      'histogram' AS agg_type,
      CAST(ROUND(SUM(record.value)) AS INT64) AS total_users,
      mozfun.glam.histogram_fill_buckets_dirichlet(
        mozfun.map.sum(ARRAY_AGG(record)),
        mozfun.glam.histogram_buckets_cast_string_array(
          udf_get_buckets(metric_type, range_min, range_max, bucket_count)
        ),
        CAST(ROUND(SUM(record.value)) AS INT64)
      ) AS aggregates,
      mozfun.glam.histogram_fill_buckets(
        mozfun.map.sum(ARRAY_AGG(non_norm_record)),
        mozfun.glam.histogram_buckets_cast_string_array(
          udf_get_buckets(metric_type, range_min, range_max, bucket_count)
        )
      ) AS non_norm_aggregates
    {% endif %}
  FROM
    {{ source_table }}
  GROUP BY
    {{ attributes }},
    range_min,
    range_max,
    bucket_count,
    {{ aggregate_attributes }},
    {{ aggregate_grouping }}
)
{% if channel == "release" %}
,
-- Fix the All-OS client counts, which were originally calculated with only the
-- 10% Windows sample (a worked example appears in the comment block at the end
-- of this file).
windows_probe_counts AS (
  SELECT
    *
  FROM
    probe_counts
  WHERE
    os = "Windows"
)
SELECT
  pc.* REPLACE (
    IF(
      pc.os = "*",
      -- Add the remaining 90% of the Windows client count, if present, to the
      -- All-OS (*) client count.
      pc.total_users + CAST((COALESCE(wpc.total_users, 0) * 0.9) AS INT64),
      pc.total_users
    ) AS total_users
  )
  {% if is_scalar %}
    , pc.aggregates AS non_norm_aggregates
  {% endif %}
FROM
  probe_counts pc
LEFT JOIN
  windows_probe_counts wpc
USING (
  {{ attributes_no_os }},
  {{ aggregate_attributes }},
  {{ aggregate_grouping }}
)
{% else %}
SELECT
  *
  {% if is_scalar %}
    -- Scalars are always non-normalized; they are aliased here as
    -- non_norm_aggregates to comply with the histograms' schema.
    , aggregates AS non_norm_aggregates
  {% endif %}
FROM
  probe_counts
{% endif %}
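-- A minimal sketch for inspecting the bucketing used by udf_get_buckets above
-- (an illustrative ad-hoc query, assuming access to the public mozfun UDFs; the
-- literal arguments mirror the timing_distribution and custom_distribution_linear
-- branches, and 1000 / 0 / 100 / 10 are made-up inputs):
--
--   SELECT
--     mozfun.glam.histogram_generate_functional_buckets(2, 8, 1000) AS timing_buckets,
--     mozfun.glam.histogram_generate_linear_buckets(0, 100, 10) AS linear_buckets;
--
-- With log base 2 and 8 (or 16) buckets per magnitude, the functional bucket
-- boundaries grow roughly exponentially, about 8 (or 16) buckets per power of
-- two up to range_max, which is why the timing and memory branches need no
-- per-metric bucket_count.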
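-- A worked example of the release-channel All-OS correction, using the scalar
-- branch where the 10x factor is applied inline (illustrative numbers only):
-- suppose 1,000 sampled Windows clients and 5,000 clients on other OSes report
-- a probe. probe_counts yields total_users = 1,000 * 10 = 10,000 for the
-- os = "Windows" row, but the os = "*" row was aggregated from raw counts and
-- saw only 1,000 + 5,000 = 6,000. The REPLACE adds the missing 90% of the
-- Windows estimate: 6,000 + CAST(10,000 * 0.9 AS INT64) = 15,000, i.e. the
-- corrected Windows count plus the other OSes.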