def get_histogram_probes_and_buckets()

in sql/moz-fx-data-shared-prod/telemetry_derived/clients_daily_histogram_aggregates_v1.sql.py [0:0]


def _normalize_probe_name(key):
    """Turn a probe info service key (``histogram/A.B``) into ``a_b``."""
    return key.replace("histogram/", "").replace(".", "_").lower()


def _find_histogram_fields(schema, histogram_type):
    """Collect the schema fields named ``histogram_type`` together with their process.

    ``schema`` is a list of field dicts in API representation. Only the
    top-level ``payload`` field is inspected: a direct child named
    ``histogram_type`` is reported for the "parent" process, and the first
    child with that name under ``payload.processes.content`` / ``.gpu`` is
    reported for that process.
    """
    found = []
    for field in schema:
        if field["name"] != "payload":
            continue

        for payload_field in field["fields"]:
            if payload_field["name"] == histogram_type:
                found.append({"histograms": payload_field, "process": "parent"})
            elif payload_field["name"] == "processes":
                for processes_field in payload_field["fields"]:
                    if processes_field["name"] not in ["content", "gpu"]:
                        continue
                    # Only the first matching child per process is used.
                    type_field = next(
                        (
                            candidate
                            for candidate in processes_field["fields"]
                            if candidate["name"] == histogram_type
                        ),
                        None,
                    )
                    if type_field is not None:
                        found.append(
                            {
                                "histograms": type_field,
                                "process": processes_field["name"],
                            }
                        )
    return found


def get_histogram_probes_and_buckets(histogram_type, processes_to_output):
    """Return relevant histogram probes and their bucket definitions.

    The probe names are read from the schema of ``telemetry_stable.main_v5``
    and then intersected with the histograms listed by the probe info service
    (``PROBE_INFO_SERVICE``), minus the ETL-excluded probes for "desktop".

    Args:
        histogram_type: Name of the payload field to collect probes from.
            The value ``"histograms"`` is treated as the non-keyed type when
            filtering on the probe's ``keyed`` flag.
        processes_to_output: Container of process names ("parent", "content",
            "gpu") to keep, or ``None`` to keep every process.

    Returns:
        ``None`` when the schema has no field named ``histogram_type``.
        Otherwise a dict with two keys:

        * ``"probes"``: probe name -> ``{"processes": set, "type": kind}``
          (``"type"`` is only present for probes seen in the probe info
          service data that pass the keyed filter).
        * ``"buckets"``: probe name -> ``{"n_buckets", "min", "max"}`` as ints.
    """
    project = "moz-fx-data-shared-prod"

    client = bigquery.Client(project)
    table = client.get_table("telemetry_stable.main_v5")
    main_summary_schema = [field.to_api_repr() for field in table.schema]

    histograms_field = _find_histogram_fields(main_summary_schema, histogram_type)
    if not histograms_field:
        return

    # Map each histogram name to the set of requested processes it appears in.
    # A histogram is registered even when none of its processes are requested,
    # in which case its set stays empty.
    main_summary_histograms = {}
    for histograms_and_process in histograms_field:
        process = histograms_and_process["process"]
        process_wanted = (
            processes_to_output is None or process in processes_to_output
        )
        for histogram in histograms_and_process["histograms"].get("fields", {}):
            if "name" not in histogram:
                continue

            processes = main_summary_histograms.setdefault(histogram["name"], set())
            if process_wanted:
                processes.add(process)

    # Keep the HTTP response open only for as long as it takes to read it.
    with urllib.request.urlopen(PROBE_INFO_SERVICE) as url:
        data = json.loads(url.read())

    excluded_probes = probe_filters.get_etl_excluded_probes_quickfix("desktop")
    histogram_probes = {
        _normalize_probe_name(key) for key in data if key.startswith("histogram/")
    }

    relevant_probes = {
        histogram: {"processes": process}
        for histogram, process in main_summary_histograms.items()
        if histogram in histogram_probes and histogram not in excluded_probes
    }

    # Some keyed GPU metrics aren't correctly flagged as type
    # "keyed_histograms", so probes whose "keyed" flag disagrees with the
    # requested histogram type are dropped whenever GPU output is requested.
    check_keyed_flag = processes_to_output is None or "gpu" in processes_to_output
    wants_unkeyed = histogram_type == "histograms"

    bucket_details = {}
    for key, probe_info in data.items():
        if not key.startswith("histogram/"):
            continue

        # Prefer the nightly definition, then beta, then release.
        history = probe_info["history"]
        channel = next(
            (name for name in ("nightly", "beta") if name in history), "release"
        )
        data_details = history[channel][0]["details"]
        probe = _normalize_probe_name(key)

        if check_keyed_flag and data_details["keyed"] == wants_unkeyed:
            relevant_probes.pop(probe, None)
            continue

        if probe in relevant_probes:
            relevant_probes[probe]["type"] = data_details["kind"]

        # NOTE: some probes, (e.g. POPUP_NOTIFICATION_MAINACTION_TRIGGERED_MS) have values
        # in the probe info service like 80 * 25 for the value of n_buckets.
        # So they do need to be evaluated as expressions.
        # NOTE(review): eval() is executed on strings downloaded from the probe
        # info service, so a compromised or spoofed response could run
        # arbitrary code here. Consider a restricted arithmetic evaluator.
        bucket_details[probe] = {
            "n_buckets": int(eval(str(data_details["n_buckets"]))),
            "min": int(eval(str(data_details["low"]))),
            "max": int(eval(str(data_details["high"]))),
        }

    return {"probes": relevant_probes, "buckets": bucket_details}