download.sql (114 lines of code) (raw):

declare oses ARRAY<STRING>; declare channels ARRAY<STRING>; declare process_types ARRAY<STRING>; declare utility_actors ARRAY<STRING>; set oses = ['Android', 'Linux', 'Mac', 'Windows']; set channels = ['nightly', 'beta', 'release']; set process_types = ['main', 'content', 'gpu', 'rdd', 'socket', 'gmplugin', 'utility']; set utility_actors = ['audio-decoder-generic', 'audio-decoder-applemedia', 'audio-decoder-wmf', 'mf-media-engine', 'js-oracle', 'windows-utils', 'windows-file-dialog']; -- We materialize a temp table of configurations because the generated row numbers must be consistent. -- This table contains all combinations of the oses, channels, process_types, and utility process + utility_actors. create temp table config as ( select (ROW_NUMBER() over ()) as id, -- Choose version based on channel (case channel when 'release' then @release_version when 'beta' then @release_version + 1 when 'nightly' then @release_version + 2 else ERROR("unknown channel") end) as version, -- Choose sample count based on channel (case channel when 'release' then 5000 -- Set very high counts for nightly and beta, to essentially always process all pings for these channels. else 50000 end) as target_sample_count, * from UNNEST(oses) as os cross join UNNEST(channels) as channel cross join ( select * from UNNEST(process_types) as process_type cross join (select STRING(null) as utility_actor) union all select * from (select 'utility' as process_type) cross join UNNEST(utility_actors) as utility_actor ) ); -- Because we want to output the config table later with the counts of pings -- which matched each configuration, we materialize temporary tables here -- rather than use CTEs (since CTEs would be recomputed and we need these -- tables for both the config query and final output query). create temp table pings as ( -- Desktop and Android have slightly different tables because their metrics -- differ, so we can't simply union them without first extracting our -- fields of interest. with desktop as ( select document_id, submission_timestamp, metrics.object.crash_stack_traces as stack_traces, metrics.string.crash_moz_crash_reason as moz_crash_reason, metrics.string.crash_ipc_channel_error as ipc_channel_error, metrics.quantity.memory_oom_allocation_size as oom_size, normalized_os as os, metrics.string.crash_app_channel as channel, metrics.string.crash_app_display_version as display_version, metrics.string.crash_process_type as process_type, metrics.string_list.crash_utility_actors_name as utility_actors_name from firefox_desktop.desktop_crashes ), android as ( select document_id, submission_timestamp, metrics.object.crash_stack_traces as stack_traces, metrics.object.crash_java_exception as java_exception, metrics.string.crash_moz_crash_reason as moz_crash_reason, metrics.string.crash_ipc_channel_error as ipc_channel_error, metrics.quantity.memory_oom_allocation_size as oom_size, normalized_os as os, metrics.string.crash_app_channel as channel, metrics.string.crash_app_display_version as display_version, metrics.string.crash_process_type as process_type from fenix.crash ) select config.id as config_id, target_sample_count, document_id, -- Explicitly format the timestamp for maximum precision because these values will be round-tripped into the ingest output table and joined. -- Otherwise the default result string only has millisecond precision and doesn't join correctly. FORMAT_TIMESTAMP("%FT%R:%E*S", submission_timestamp) as submission_timestamp, IF(stack_traces is null, null, TO_JSON_STRING(stack_traces)) as stack_traces, IF(java_exception is null, null, TO_JSON_STRING(java_exception)) as java_exception, moz_crash_reason, ipc_channel_error, oom_size, data.os, data.channel, from (select * from desktop outer union all by name select * from android) as data join config on config.os = data.os and config.channel = data.channel and (SAFE_CAST(REGEXP_SUBSTR(display_version, '[0-9]*') as INT64)) = version and config.process_type = data.process_type and ( config.process_type != 'utility' or ( (utility_actor is null and ARRAY_LENGTH(IFNULL(utility_actors_name, [])) = 0) or (utility_actor in UNNEST(utility_actors_name)) ) ) where DATE(submission_timestamp) = @date and (stack_traces is not null or java_exception is not null) ); create temp table config_counts as ( select config_id, COUNT(*) as total_config_count from pings group by config_id ); -- Output the configuration with the total counts to record how the data was selected. select STRING(@date) as date, total_config_count as count, config.* from config_counts join config on config_id = config.id; -- Just in case, select distinct (it's possible, though unlikely, that we might -- have duplicates from the way utility actors are selected: one ping may -- have more than one actor set). select distinct document_id, config_id, submission_timestamp, stack_traces, java_exception, moz_crash_reason, ipc_channel_error, oom_size, os, channel from pings join config_counts using (config_id) -- To ensure a random sample, we select based on the proportion of the total. where RAND() <= (target_sample_count / total_config_count); -- Clean up by dropping temp tables. drop table config; drop table pings; drop table config_counts;