sql/moz-fx-data-shared-prod/telemetry_derived/clients_scalar_probe_counts_v1/query.sql (349 lines of code) (raw):
CREATE TEMP FUNCTION udf_boolean_buckets(
scalar_aggs ARRAY<
STRUCT<
metric STRING,
metric_type STRING,
key STRING,
process STRING,
agg_type STRING,
value FLOAT64
>
>
)
RETURNS ARRAY<
STRUCT<
metric STRING,
metric_type STRING,
key STRING,
process STRING,
agg_type STRING,
bucket STRING
>
> AS (
(
WITH boolean_columns AS (
SELECT
metric,
metric_type,
key,
process,
agg_type,
CASE
agg_type
WHEN 'true'
THEN value
ELSE 0
END AS bool_true,
CASE
agg_type
WHEN 'false'
THEN value
ELSE 0
END AS bool_false
FROM
UNNEST(scalar_aggs)
WHERE
metric_type IN ("boolean", "keyed-scalar-boolean")
),
summed_bools AS (
SELECT
metric,
metric_type,
key,
process,
'' AS agg_type,
SUM(bool_true) AS bool_true,
SUM(bool_false) AS bool_false
FROM
boolean_columns
GROUP BY
1,
2,
3,
4
),
booleans AS (
SELECT
* EXCEPT (bool_true, bool_false),
CASE
WHEN bool_true > 0
AND bool_false > 0
THEN "sometimes"
WHEN bool_true > 0
AND bool_false = 0
THEN "always"
WHEN bool_true = 0
AND bool_false > 0
THEN "never"
END AS bucket
FROM
summed_bools
WHERE
bool_true > 0
OR bool_false > 0
)
SELECT
ARRAY_AGG((metric, metric_type, key, process, agg_type, bucket))
FROM
booleans
)
);
WITH flat_clients_scalar_aggregates AS (
SELECT
*,
os = 'Windows'
AND channel = 'release' AS sampled,
FROM
`moz-fx-data-shared-prod.telemetry_derived.clients_scalar_aggregates_v1`
WHERE
submission_date = @submission_date
AND (@app_version IS NULL OR app_version = @app_version)
),
log_min_max AS (
SELECT
metric,
key,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) log_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) log_max
FROM
flat_clients_scalar_aggregates
CROSS JOIN
UNNEST(scalar_aggregates)
WHERE
metric_type = 'scalar'
OR metric_type = 'keyed-scalar'
GROUP BY
1,
2
),
buckets_by_metric AS (
SELECT
metric,
key,
ARRAY(
SELECT
FORMAT("%.*f", 2, bucket)
FROM
UNNEST(mozfun.glam.histogram_generate_scalar_buckets(log_min, log_max, 100)) AS bucket
ORDER BY
bucket
) AS buckets
FROM
log_min_max
),
static_combos AS (
SELECT
NULL AS os,
NULL AS app_build_id
UNION ALL
SELECT
NULL AS os,
'*' AS app_build_id
UNION ALL
SELECT
'*' AS os,
NULL AS app_build_id
UNION ALL
SELECT
'*' AS os,
'*' AS app_build_id
),
all_combos AS (
SELECT
* EXCEPT (os, app_build_id),
COALESCE(combos.os, flat_table.os) AS os,
COALESCE(combos.app_build_id, flat_table.app_build_id) AS app_build_id
FROM
flat_clients_scalar_aggregates flat_table
CROSS JOIN
static_combos combos
),
user_aggregates AS (
SELECT
client_id,
IF(os = '*', NULL, os) AS os,
app_version,
IF(app_build_id = '*', NULL, app_build_id) AS app_build_id,
channel,
IF(MAX(sampled), 10, 1) AS user_count,
`moz-fx-data-shared-prod`.udf.merge_scalar_user_data(
ARRAY_CONCAT_AGG(scalar_aggregates)
) AS scalar_aggregates
FROM
all_combos
GROUP BY
client_id,
os,
app_version,
app_build_id,
channel
),
build_ids AS (
SELECT
app_build_id,
channel,
FROM
user_aggregates
GROUP BY
1,
2
HAVING
-- Filter out builds having less than 0.5% of WAU
-- for context see https://github.com/mozilla/glam/issues/1575#issuecomment-946880387
CASE
WHEN channel = 'release'
THEN SUM(user_count) > 625000
WHEN channel = 'beta'
THEN SUM(user_count) > 9000
WHEN channel = 'nightly'
THEN SUM(user_count) > 375
ELSE SUM(user_count) > 100
END
),
bucketed_booleans AS (
SELECT
client_id,
os,
app_version,
app_build_id,
channel,
user_count,
os = 'Windows'
AND channel = 'release' AS sampled,
udf_boolean_buckets(scalar_aggregates) AS scalar_aggregates
FROM
user_aggregates
),
bucketed_scalars AS (
SELECT
client_id,
os,
app_version,
app_build_id,
channel,
user_count,
os = 'Windows'
AND channel = 'release' AS sampled,
metric,
metric_type,
key,
process,
agg_type,
-- Keep two decimal places before converting bucket to a string
SAFE_CAST(
FORMAT(
"%.*f",
2,
mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)) + 0.0001
) AS STRING
) AS bucket
FROM
user_aggregates
CROSS JOIN
UNNEST(scalar_aggregates)
LEFT JOIN
buckets_by_metric
USING (metric, key)
WHERE
metric_type = 'scalar'
OR metric_type = 'keyed-scalar'
),
booleans_and_scalars AS (
SELECT
* EXCEPT (scalar_aggregates)
FROM
bucketed_booleans
CROSS JOIN
UNNEST(scalar_aggregates)
UNION ALL
SELECT
*
FROM
bucketed_scalars
),
valid_booleans_scalars AS (
SELECT
*
FROM
booleans_and_scalars
INNER JOIN
build_ids
USING (app_build_id, channel)
),
clients_scalar_bucket_counts AS (
SELECT
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
process,
agg_type AS client_agg_type,
'histogram' AS agg_type,
bucket,
SUM(user_count) AS user_count,
FROM
valid_booleans_scalars
GROUP BY
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
process,
client_agg_type,
bucket
),
aggregated AS (
SELECT
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
process,
-- empty columns to match clients_histogram_probe_counts_v1 schema
NULL AS first_bucket,
NULL AS last_bucket,
NULL AS num_buckets,
client_agg_type,
agg_type,
SUM(user_count) AS total_users,
CASE
WHEN metric_type = 'scalar'
OR metric_type = 'keyed-scalar'
THEN mozfun.glam.histogram_fill_buckets(
ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, user_count)),
ANY_VALUE(buckets)
)
WHEN metric_type = 'boolean'
OR metric_type = 'keyed-scalar-boolean'
THEN mozfun.glam.histogram_fill_buckets(
ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, user_count)),
['always', 'never', 'sometimes']
)
END AS aggregates
FROM
clients_scalar_bucket_counts
LEFT JOIN
buckets_by_metric
USING (metric, key)
GROUP BY
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
process,
client_agg_type,
agg_type
)
SELECT
*,
aggregates AS non_norm_aggregates
FROM
aggregated