sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_sessions_v2/script.sql (562 lines of code) (raw):

MERGE INTO `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v2` T USING ( --get all the unique "GA Client ID", "GA Session ID" combinations with events between 3 days prior to the submission date and the submission date WITH all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days AS ( SELECT DISTINCT user_pseudo_id AS ga_client_id, CAST(e.value.int_value AS string) AS ga_session_id FROM `moz-fx-data-marketing-prod.analytics_313696158.events_*` a JOIN UNNEST(event_params) e WHERE e.key = 'ga_session_id' AND e.value.int_value IS NOT NULL AND _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@submission_date, INTERVAL 3 DAY)) AND FORMAT_DATE('%Y%m%d', @submission_date) ), --get the unique client IDs from the prior table distinct_ga_client_ids AS ( SELECT DISTINCT ga_client_id FROM all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days ), device_properties_at_session_start_event AS ( --get all session starts, from any date, associated with these client ID / session IDs from the last 3 days SELECT user_pseudo_id AS ga_client_id, CAST(e.value.int_value AS string) AS ga_session_id, ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'ga_session_number' LIMIT 1 ).int_value AS ga_session_number, geo.country AS country, geo.region AS region, geo.city AS city, collected_traffic_source.manual_campaign_id AS campaign_id, collected_traffic_source.manual_campaign_name AS campaign, collected_traffic_source.manual_source AS source, collected_traffic_source.manual_medium AS medium, collected_traffic_source.manual_term AS term, collected_traffic_source.manual_content AS content, device.category AS device_category, device.mobile_model_name AS mobile_device_model, device.mobile_marketing_name AS mobile_device_string, device.operating_system AS os, device.operating_system_version AS os_version, device.language AS `language`, device.web_info.browser AS browser, device.web_info.browser_version AS browser_version, PARSE_DATE('%Y%m%d', event_date) AS session_date FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days c ON a.user_pseudo_id = c.ga_client_id AND CAST(e.value.int_value AS string) = c.ga_session_id WHERE e.key = 'ga_session_id' AND e.value.int_value IS NOT NULL AND event_name = 'session_start' QUALIFY ROW_NUMBER() OVER ( PARTITION BY user_pseudo_id, e.value.int_value ORDER BY event_timestamp ASC ) = 1 ), --search all time for all the client IDs, and get all campaigns we see and the session ID we see them for all_campaigns_from_event_params_in_session_staging AS ( SELECT a.user_pseudo_id AS ga_client_id, a.event_timestamp, ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'ga_session_id' LIMIT 1 ).int_value AS ga_session_id, ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'campaign' LIMIT 1 ).string_value AS campaign_from_event_params FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN distinct_ga_client_ids b ON a.user_pseudo_id = b.ga_client_id ), all_campaigns_from_event_params_in_session AS ( SELECT ga_client_id, CAST(ga_session_id AS string) AS ga_session_id, campaign_from_event_params, event_timestamp FROM all_campaigns_from_event_params_in_session_staging WHERE ga_client_id IS NOT NULL AND ga_session_id IS NOT NULL AND campaign_from_event_params IS NOT NULL ), campaigns_by_session AS ( SELECT ga_client_id, ga_session_id, ARRAY_AGG(DISTINCT campaign_from_event_params) AS distinct_campaigns_from_event_params, ARRAY_AGG(campaign_from_event_params ORDER BY event_timestamp ASC)[ 0 ] AS first_campaign_from_event_params FROM all_campaigns_from_event_params_in_session GROUP BY ga_client_id, ga_session_id ), click_aggregate_stg AS ( SELECT DISTINCT user_pseudo_id AS ga_client_id, event_timestamp, CAST(e.value.int_value AS string) AS ga_session_id, collected_traffic_source.gclid AS gclid FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days c ON a.user_pseudo_id = c.ga_client_id AND CAST(e.value.int_value AS string) = c.ga_session_id WHERE e.key = 'ga_session_id' AND e.value.int_value IS NOT NULL AND collected_traffic_source.gclid IS NOT NULL ), click_aggregate AS ( SELECT ga_client_id, ga_session_id, ARRAY_AGG(DISTINCT gclid) AS gclid_array, ARRAY_AGG(gclid ORDER BY event_timestamp DESC)[ 0 ] AS gclid --this is really just the last reported gclid FROM click_aggregate_stg GROUP BY ga_client_id, ga_session_id ), --get all the page views and min/max event timestamp and whether there was a product download for these session/clients of interest event_aggregates AS ( SELECT user_pseudo_id AS ga_client_id, CAST(e.value.int_value AS string) AS ga_session_id, COUNTIF(event_name = 'page_view') AS pageviews, MIN(event_timestamp) AS min_event_timestamp, MAX(event_timestamp) AS max_event_timestamp, SUM( CASE WHEN (event_name = 'firefox_download' AND event_date >= '20240217') OR ( event_name = 'product_download' AND event_date < '20240217' AND ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'product' LIMIT 1 ).string_value = 'firefox' AND ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'platform' LIMIT 1 ).string_value IN ( 'win', 'win64', 'macos', 'linux64', 'win64-msi', 'linux', 'win-msi', 'win64-aarch64' ) --platform = desktop ) THEN 1 ELSE 0 END ) AS firefox_desktop_downloads, CAST( MAX( CASE WHEN event_name IN ( 'product_download', 'firefox_download', 'firefox_mobile_download', 'focus_download', 'klar_download' ) THEN 1 ELSE 0 END ) AS boolean ) AS had_download_event FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days c ON a.user_pseudo_id = c.ga_client_id AND CAST(e.value.int_value AS string) = c.ga_session_id WHERE e.key = 'ga_session_id' AND e.value.int_value IS NOT NULL GROUP BY user_pseudo_id, CAST(e.value.int_value AS string) ), --get all the stub session IDs for the clients of interest stub_session_ids_staging AS ( SELECT user_pseudo_id AS ga_client_id, event_timestamp, CAST( ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'ga_session_id' LIMIT 1 ).int_value AS string ) AS ga_session_id, CAST(e.value.int_value AS string) AS stub_session_id FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN distinct_ga_client_ids c ON a.user_pseudo_id = c.ga_client_id WHERE event_name = 'stub_session_set' AND e.key = 'id' AND e.value.int_value IS NOT NULL ), --put the stub session IDs into an array all_stub_session_ids AS ( SELECT ga_client_id, ga_session_id, ARRAY_AGG(stub_session_id) AS all_reported_stub_session_ids, ARRAY_AGG(stub_session_id ORDER BY event_timestamp DESC)[0] AS last_reported_stub_session_id FROM stub_session_ids_staging GROUP BY ga_client_id, ga_session_id ), --for each session, get the entrance page location landing_page_by_session_staging AS ( SELECT user_pseudo_id AS ga_client_id, CAST( ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'ga_session_id' LIMIT 1 ).int_value AS string ) AS ga_session_id, SPLIT( ( SELECT `value` FROM UNNEST(event_params) WHERE key = 'page_location' LIMIT 1 ).string_value, '?' )[OFFSET(0)] AS page_location, event_timestamp FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN distinct_ga_client_ids c ON a.user_pseudo_id = c.ga_client_id WHERE e.key = 'entrances' AND e.value.int_value = 1 ), --if there is ever for some reason more than 1 entrance in 1 session, just select the first one landing_page_by_session AS ( SELECT ga_client_id, ga_session_id, page_location, event_timestamp FROM landing_page_by_session_staging QUALIFY ROW_NUMBER() OVER ( PARTITION BY ga_client_id, ga_session_id ORDER BY event_timestamp ASC ) = 1 ), --search for any download events (used to be called product download, but then was broken out into the 4 more detailed options) install_targets_staging AS ( SELECT user_pseudo_id AS ga_client_id, CAST(e.value.int_value AS string) AS ga_session_id, event_timestamp, event_name AS install_event_name FROM `moz-fx-data-marketing-prod.analytics_313696158.events_2*` a JOIN UNNEST(event_params) AS e JOIN all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days c ON a.user_pseudo_id = c.ga_client_id AND CAST(e.value.int_value AS string) = c.ga_session_id WHERE e.key = 'ga_session_id' AND e.value.int_value IS NOT NULL AND event_name IN ( 'product_download', 'firefox_download', 'firefox_mobile_download', 'focus_download', 'klar_download' ) ), --put these into an array for the session all_install_targets AS ( SELECT ga_client_id, ga_session_id, ARRAY_AGG(install_event_name) AS all_reported_install_targets, ARRAY_AGG(install_event_name ORDER BY event_timestamp DESC)[ SAFE_OFFSET(0) ] AS last_reported_install_target FROM install_targets_staging GROUP BY ga_client_id, ga_session_id ) SELECT sess_strt.ga_client_id, sess_strt.ga_session_id, sess_strt.session_date, CASE WHEN sess_strt.ga_session_number = 1 THEN TRUE ELSE FALSE END AS is_first_session, sess_strt.ga_session_number AS session_number, CAST( (evnt.max_event_timestamp - evnt.min_event_timestamp) / 1000000 AS int64 ) AS time_on_site, evnt.pageviews, sess_strt.country, sess_strt.region, sess_strt.city, sess_strt.campaign_id, sess_strt.campaign, sess_strt.source, sess_strt.medium, sess_strt.term, sess_strt.content, clicks.gclid, clicks.gclid_array, sess_strt.device_category, sess_strt.mobile_device_model, sess_strt.mobile_device_string, sess_strt.os, sess_strt.os_version, sess_strt.language, sess_strt.browser, sess_strt.browser_version, evnt.had_download_event, evnt.firefox_desktop_downloads, installs.last_reported_install_target, installs.all_reported_install_targets, stub_sessn_ids.last_reported_stub_session_id, stub_sessn_ids.all_reported_stub_session_ids, lndg_pg.page_location AS landing_screen, campaigns.distinct_campaigns_from_event_params, campaigns.first_campaign_from_event_params FROM device_properties_at_session_start_event sess_strt JOIN all_ga_client_id_ga_session_ids_with_new_events_in_last_3_days sessions_to_update USING (ga_client_id, ga_session_id) LEFT JOIN event_aggregates evnt USING (ga_client_id, ga_session_id) LEFT JOIN all_stub_session_ids stub_sessn_ids USING (ga_client_id, ga_session_id) LEFT JOIN landing_page_by_session lndg_pg USING (ga_client_id, ga_session_id) LEFT JOIN all_install_targets installs USING (ga_client_id, ga_session_id) LEFT JOIN click_aggregate clicks USING (ga_client_id, ga_session_id) LEFT JOIN campaigns_by_session campaigns USING (ga_client_id, ga_session_id) ) S ON T.ga_client_id = S.ga_client_id AND T.ga_session_id = S.ga_session_id WHEN NOT MATCHED BY TARGET THEN INSERT ( ga_client_id, ga_session_id, session_date, is_first_session, session_number, time_on_site, pageviews, country, region, city, campaign_id, campaign, source, medium, term, content, gclid, gclid_array, device_category, mobile_device_model, mobile_device_string, os, os_version, `LANGUAGE`, browser, browser_version, had_download_event, last_reported_install_target, all_reported_install_targets, last_reported_stub_session_id, all_reported_stub_session_ids, landing_screen, distinct_campaigns_from_event_params, first_campaign_from_event_params ) VALUES ( S.ga_client_id, S.ga_session_id, S.session_date, S.is_first_session, S.session_number, S.time_on_site, S.pageviews, S.country, S.region, S.city, S.campaign_id, S.campaign, S.source, S.medium, S.term, S.content, S.gclid, S.gclid_array, S.device_category, S.mobile_device_model, S.mobile_device_string, S.os, S.os_version, S.language, S.browser, S.browser_version, S.had_download_event, S.last_reported_install_target, S.all_reported_install_targets, S.last_reported_stub_session_id, S.all_reported_stub_session_ids, S.landing_screen, S.distinct_campaigns_from_event_params, S.first_campaign_from_event_params ) WHEN MATCHED THEN UPDATE SET T.ga_client_id = S.ga_client_id, T.ga_session_id = S.ga_session_id, T.session_date = S.session_date, T.is_first_session = S.is_first_session, T.session_number = S.session_number, T.time_on_site = S.time_on_site, T.pageviews = S.pageviews, T.country = S.country, T.region = S.region, T.city = S.city, T.campaign_id = S.campaign_id, T.campaign = S.campaign, T.source = S.source, T.medium = S.medium, T.term = S.term, T.content = S.content, T.gclid = S.gclid, T.gclid_array = S.gclid_array, T.device_category = S.device_category, T.mobile_device_model = S.mobile_device_model, T.mobile_device_string = S.mobile_device_string, T.os = S.os, T.os_version = S.os_version, T.language = S.language, T.browser = S.browser, T.browser_version = S.browser_version, T.had_download_event = S.had_download_event, T.last_reported_install_target = S.last_reported_install_target, T.all_reported_install_targets = S.all_reported_install_targets, T.last_reported_stub_session_id = S.last_reported_stub_session_id, T.all_reported_stub_session_ids = S.all_reported_stub_session_ids, T.landing_screen = S.landing_screen, T.distinct_campaigns_from_event_params = S.distinct_campaigns_from_event_params, T.first_campaign_from_event_params = S.first_campaign_from_event_params