etl/firefox_legacy_etl.py (61 lines of code) (raw):

import json
import os

import requests

from .search import create_metrics_search_js
from .utils import snake_case

PROBES_URL = os.getenv(
    "PROBES_URL", "https://probeinfo.telemetry.mozilla.org/firefox/all/main/all_probes"
)
PROBE_RECORDED_IN_PROCESSES_URL = os.getenv(
    "PROBE_RECORDED_IN_PROCESSES_URL",
    "https://public-data.telemetry.mozilla.org/api/v1/tables/telemetry_derived/client_probe_processes/v1/files/000000000000.json",  # noqa
)

# Passed as `timeout=` to requests.get so that a stalled connection fails
# instead of blocking the ETL run forever. requests applies this to the
# connection attempt and to each wait for data, not to the download as a whole.
_REQUEST_TIMEOUT_SECONDS = 60


def _fetch_json(url):
    """
    Download `url` and return its decoded JSON body.

    Raises a `requests` exception if the request times out or the server
    answers with an HTTP error status.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # fail loudly on 4xx/5xx rather than trying to parse an error page
    response.raise_for_status()
    return response.json()


def _write_text(path, content):
    """
    Write `content` to `path` as UTF-8, replacing any existing file.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)


def _get_legacy_firefox_metric_summary(probe_data, activity_mapping):
    """
    Get a summary of legacy firefox metrics, which we can use as a search index

    `probe_data` maps probe ids to probe dictionaries carrying "type", "name"
    and a per-channel "history". `activity_mapping` maps normalized probe
    names to the processes they were recorded in; a probe is reported as
    "active" only if its normalized name is a key of that mapping.

    Returns a dictionary keyed by normalized probe name.
    """
    probe_summary = {}
    for probe_id, probe in probe_data.items():
        if probe["type"] == "event":
            # let's just skip legacy firefox events, since we're just doing
            # this for GLAM's benefit (which doesn't display events)
            continue

        # prefer the newest nightly entry, falling back to beta when the probe
        # has no (or an empty) nightly history
        history = probe["history"]
        if history.get("nightly"):
            most_recent_metadata = history["nightly"][0]
        else:
            most_recent_metadata = history["beta"][0]

        normalized_probe_name = probe["name"].lower().replace(".", "_")
        if probe_id.startswith("scalar/"):
            # scalar names are camelCased, but we want snake_case
            # to match the convention used in bigquery-etl
            # see: https://github.com/mozilla/glam/issues/1956
            normalized_probe_name = (
                snake_case(probe_id.split("/")[1]).lower().replace(".", "_")
            )

        details = most_recent_metadata["details"]
        summary = {
            "name": normalized_probe_name,
            "id": probe_id,
            "type": probe["type"],
            "description": most_recent_metadata["description"],
            "bug_numbers": most_recent_metadata["bug_numbers"],
            "details": details,
            "optout": most_recent_metadata["optout"],
            "kind": details["kind"],
            # "versions" of the first history entry of every channel
            "versions": {
                channel: channel_data[0]["versions"]
                for (channel, channel_data) in history.items()
            },
            "active": normalized_probe_name in activity_mapping,
            "seen_in_processes": activity_mapping.get(normalized_probe_name, []),
        }
        # "labels" is only emitted when the probe actually defines it
        if details.get("labels") is not None:
            summary["labels"] = details["labels"]

        probe_summary[normalized_probe_name] = summary

    return probe_summary


def write_firefox_legacy_metadata(output_dir, functions_dir):
    """
    Fetch legacy firefox probe metadata and write it out to disk

    Writes one `data_<probe name>.json` file per probe under
    `<output_dir>/firefox_legacy/metrics` (created if missing) and two search
    index files into `functions_dir` (which must already exist).
    """
    # pull down the recorded in process information, which we use as the
    # authoritative guide on whether a legacy probe is still "active"
    recorded_in_process_data = _fetch_json(PROBE_RECORDED_IN_PROCESSES_URL)
    activity_mapping = {row["metric"]: row["processes"] for row in recorded_in_process_data}

    # get the actual probe data
    probe_data = _fetch_json(PROBES_URL)

    # then write it out
    probe_output_directory = os.path.join(output_dir, "firefox_legacy", "metrics")
    os.makedirs(probe_output_directory, exist_ok=True)
    probe_summary = _get_legacy_firefox_metric_summary(probe_data, activity_mapping)
    for probe_name, probe_metadata in probe_summary.items():
        probe_path = os.path.join(probe_output_directory, f"data_{probe_name}.json")
        with open(probe_path, "w", encoding="utf-8") as f:
            json.dump(probe_metadata, f)

    # Each index is generated before its file is opened, so a failure while
    # building it does not leave a truncated file behind.

    # write a search index for legacy telemetry data
    legacy_index = create_metrics_search_js(probe_summary.values(), legacy=True)
    _write_text(os.path.join(functions_dir, "metrics_search_firefox_legacy.js"), legacy_index)

    # write a search index for legacy telemetry + FOG data
    fog_and_legacy_index = create_metrics_search_js(
        probe_summary.values(), app_name="fog_and_legacy", legacy=True
    )
    _write_text(
        os.path.join(functions_dir, "metrics_search_fog_and_legacy.js"), fog_and_legacy_index
    )