sql_generators/glean_usage/templates/baseline_clients_daily_v1.query.sql (197 lines of code) (raw):
{{ header }}
WITH base AS (
SELECT
submission_timestamp,
DATE(submission_timestamp) AS submission_date,
LOWER(client_info.client_id) AS client_id,
sample_id,
SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10)) AS first_run_date,
mozfun.glean.parse_datetime(ping_info.end_time) AS parsed_end_time,
udf.glean_timespan_seconds(metrics.timespan.glean_baseline_duration) AS duration,
client_info.android_sdk_version,
client_info.app_build,
client_info.app_channel,
client_info.app_display_version,
client_info.architecture,
client_info.device_manufacturer,
client_info.device_model,
client_info.telemetry_sdk_build,
COALESCE(client_info.locale, metrics.string.glean_baseline_locale) AS locale,
metadata.geo.city,
metadata.geo.country,
metadata.isp.name AS isp,
normalized_channel,
normalized_os,
normalized_os_version,
metadata.geo.subdivision1 AS geo_subdivision,
{% if has_profile_group_id %}
metrics.uuid.legacy_telemetry_profile_group_id AS profile_group_id,
{% else %}
CAST(NULL AS STRING) AS profile_group_id,
{% endif %}
{% if app_name == "firefox_desktop" %}
client_info.windows_build_number AS windows_build_number,
metrics.counter.browser_engagement_uri_count as browser_engagement_uri_count,
metrics.counter.browser_engagement_active_ticks as browser_engagement_active_ticks,
metrics.uuid.legacy_telemetry_client_id as legacy_telemetry_client_id,
metrics.string.usage_distribution_id AS distribution_id,
metrics.boolean.usage_is_default_browser AS is_default_browser,
CAST(NULL AS STRING) AS install_source,
{% elif app_name == "fenix" %}
CAST(NULL AS INT64) AS windows_build_number,
CAST(NULL AS INT64) AS browser_engagement_uri_count,
CAST(NULL AS INT64) AS browser_engagement_active_ticks,
CAST(NULL AS STRING) AS legacy_telemetry_client_id,
metrics.string.metrics_distribution_id AS distribution_id,
CAST(NULL AS BOOLEAN) AS is_default_browser,
metrics.string.first_session_install_source AS install_source,
{% else %}
CAST(NULL AS INT64) AS windows_build_number,
CAST(NULL AS INT64) AS browser_engagement_uri_count,
CAST(NULL AS INT64) AS browser_engagement_active_ticks,
CAST(NULL AS STRING) AS legacy_telemetry_client_id,
CAST(NULL AS STRING) AS distribution_id,
CAST(NULL AS BOOLEAN) AS is_default_browser,
CAST(NULL AS STRING) AS install_source,
{% endif %}
client_info.attribution,
client_info.distribution,
FROM
`{{ baseline_table }}`
-- Baseline pings with 'foreground' reason were first introduced in early April 2020;
-- we initially excluded them from baseline_clients_daily so that we could measure
-- effects on KPIs. On 2020-08-25, we removed the filter on reason and backfilled. See:
-- https://bugzilla.mozilla.org/show_bug.cgi?id=1627286
-- https://jira.mozilla.com/browse/DS-1018
),
--
with_dates AS (
SELECT
*,
-- For explanation of session start time calculation, see Glean docs:
-- https://mozilla.github.io/glean/book/user/pings/baseline.html#contents
DATE(SAFE.TIMESTAMP_SUB(parsed_end_time, INTERVAL duration SECOND)) AS session_start_date,
DATE(parsed_end_time) AS session_end_date,
FROM
base
),
--
with_date_offsets AS (
SELECT
*,
DATE_DIFF(submission_date, session_start_date, DAY) AS session_start_date_offset,
DATE_DIFF(submission_date, session_end_date, DAY) AS session_end_date_offset,
FROM
with_dates
),
--
overactive AS (
-- Find client_ids with over 150 000 pings in a day,
-- which could cause errors in the next step due to aggregation overflows.
SELECT
submission_date,
client_id
FROM
with_date_offsets
WHERE
{% raw %}
{% if is_init() %}
{% endraw %}
submission_date >= '2018-01-01'
{% raw %}
{% else %}
{% endraw %}
submission_date = @submission_date
{% raw %}
{% endif %}
{% endraw %}
GROUP BY
submission_date,
client_id
HAVING
COUNT(*) > 150000
),
--
windowed AS (
SELECT
submission_date,
client_id,
sample_id,
ROW_NUMBER() OVER w1_unframed AS _n,
--
-- Take the earliest first_run_date if ambiguous.
MIN(first_run_date) OVER w1 AS first_run_date,
--
-- Sums over distinct baseline pings.
SUM(IF(duration BETWEEN 0 AND 100000, duration, 0)) OVER w1 AS durations,
--
-- Bit patterns capturing activity dates relative to the submission date.
BIT_OR(
1 << IF(session_start_date_offset BETWEEN 0 AND 27, session_start_date_offset, NULL)
) OVER w1 AS days_seen_session_start_bits,
BIT_OR(
1 << IF(session_end_date_offset BETWEEN 0 AND 27, session_end_date_offset, NULL)
) OVER w1 AS days_seen_session_end_bits,
--
-- For all other dimensions, we use the mode of observed values in the day.
udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
udf.mode_last(ARRAY_AGG(normalized_os) OVER w1) AS normalized_os,
udf.mode_last(ARRAY_AGG(normalized_os_version) OVER w1) AS normalized_os_version,
udf.mode_last(ARRAY_AGG(android_sdk_version) OVER w1) AS android_sdk_version,
udf.mode_last(ARRAY_AGG(locale) OVER w1) AS locale,
udf.mode_last(ARRAY_AGG(city) OVER w1) AS city,
udf.mode_last(ARRAY_AGG(country) OVER w1) AS country,
udf.mode_last(ARRAY_AGG(isp) OVER w1) AS isp,
udf.mode_last(ARRAY_AGG(app_build) OVER w1) AS app_build,
udf.mode_last(ARRAY_AGG(app_channel) OVER w1) AS app_channel,
udf.mode_last(ARRAY_AGG(app_display_version) OVER w1) AS app_display_version,
udf.mode_last(ARRAY_AGG(architecture) OVER w1) AS architecture,
udf.mode_last(ARRAY_AGG(device_manufacturer) OVER w1) AS device_manufacturer,
udf.mode_last(ARRAY_AGG(device_model) OVER w1) AS device_model,
udf.mode_last(ARRAY_AGG(telemetry_sdk_build) OVER w1) AS telemetry_sdk_build,
udf.mode_last(ARRAY_AGG(distribution_id) OVER w1) AS distribution_id,
udf.mode_last(ARRAY_AGG(install_source) OVER w1) AS install_source,
udf.mode_last(ARRAY_AGG(geo_subdivision) OVER w1) AS geo_subdivision,
udf.mode_last(ARRAY_AGG(profile_group_id) OVER w1) AS profile_group_id,
udf.mode_last(ARRAY_AGG(windows_build_number) OVER w1) AS windows_build_number,
SUM(COALESCE(browser_engagement_uri_count, 0)) OVER w1 AS browser_engagement_uri_count,
SUM(COALESCE(browser_engagement_active_ticks, 0)) OVER w1 AS browser_engagement_active_ticks,
udf.mode_last(ARRAY_AGG(legacy_telemetry_client_id) OVER w1) AS legacy_telemetry_client_id,
udf.mode_last(ARRAY_AGG(is_default_browser) OVER w1) AS is_default_browser,
udf.mode_last(ARRAY_AGG(attribution) OVER w1) AS attribution,
udf.mode_last(ARRAY_AGG(`distribution`) OVER w1) AS `distribution`,
FROM
with_date_offsets
LEFT JOIN
overactive
USING (submission_date, client_id)
WHERE
overactive.client_id IS NULL AND
{% raw %}
{% if is_init() %}
{% endraw %}
submission_date >= '2018-01-01'
{% raw %}
{% else %}
{% endraw %}
submission_date = @submission_date
{% raw %}
{% endif %}
{% endraw %}
WINDOW
w1 AS (
PARTITION BY
sample_id,
client_id,
submission_date
ORDER BY
submission_timestamp
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
),
-- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
w1_unframed AS (
PARTITION BY
sample_id,
client_id,
submission_date
ORDER BY
submission_timestamp
)
),
joined as (
SELECT
cd.* EXCEPT (_n),
cfs.first_seen_date,
-- the first seen date may be earlier than the submission date since it also
-- takes into account the migration ping
(cd.submission_date = cfs.first_seen_date) AS is_new_profile
FROM
windowed AS cd
LEFT JOIN
`{{ first_seen_table }}` AS cfs
USING (client_id)
WHERE
_n = 1
)
--
SELECT
*
FROM
joined