generator/views/glean_ping_view.py (353 lines of code) (raw):
"""Class to describe a Glean Ping View."""
import logging
import re
from collections import Counter
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import click
from mozilla_schema_generator.glean_ping import GleanPing
from mozilla_schema_generator.probes import GleanProbe
from . import lookml_utils
from .lookml_utils import slug_to_title
from .ping_view import PingView
# Glean distribution metric types.
DISTRIBUTION_TYPES = {
    "custom_distribution",
    "memory_distribution",
    "timing_distribution",
}

# Metric types that are turned into dimensions: every distribution type plus
# the types listed here.
ALLOWED_TYPES = DISTRIBUTION_TYPES | {
    "boolean",
    "counter",
    "datetime",
    "jwe",
    "labeled_boolean",
    "labeled_counter",
    "labeled_quantity",
    "labeled_string",
    "quantity",
    "rate",
    "string",
    "text",
    "timespan",
    "url",
    "uuid",
}

# Bug 1737656 - some metric types are exposed under different names.
# Dimension names have to be built from the new name.
RENAMED_METRIC_TYPES = {
    "jwe": "jwe2",
    "text": "text2",
    "url": "url2",
}

# Names of pings for which no Glean ping view is generated.
DISALLOWED_PINGS = {"events"}

# Labeled counter names for which a suggest explore should be generated.
# Generating suggest explores for all labeled counters slows down Looker.
SUGGESTS_FOR_LABELED_COUNTERS = {
    "metrics__labeled_counter__glean_error_invalid_label",
}
class GleanPingView(PingView):
    """A view on a ping table for an application using the Glean SDK."""

    # Identifier of this kind of view.
    type: str = "glean_ping_view"
    # NOTE(review): not read anywhere in this file; presumably consulted by
    # PingView or the generator - confirm.
    allow_glean: bool = True
@classmethod
def from_db_views(klass, *args, **kwargs):
    """Generate GleanPingViews from db views.

    All arguments are forwarded unchanged to the parent implementation.
    Views whose name is listed in ``DISALLOWED_PINGS`` are skipped.
    """
    for candidate in super().from_db_views(*args, **kwargs):
        if candidate.name in DISALLOWED_PINGS:
            continue
        yield candidate
def to_lookml(self, v1_name: Optional[str], dryrun) -> Dict[str, Any]:
    """Generate LookML for this view.

    The Glean views include labeled metrics, which need to be joined
    against the view in the explore. For every labeled counter that has a
    matching dimension, a join view (plus a suggest view for the counters
    listed in ``SUGGESTS_FOR_LABELED_COUNTERS``) is appended after the base
    view, followed by the nested dimension views derived from the table
    schema.

    Args:
        v1_name: Name of the Glean application repository; forwarded to the
            parent implementation and used to look up the Glean metrics.
        dryrun: Factory whose ``create(project=..., dataset=..., table=...)``
            result provides ``get_table_schema()``.

    Returns:
        The LookML dictionary of the parent class, with its ``"views"`` list
        reduced to the base view and extended with the generated views.
    """
    lookml = super().to_lookml(v1_name, dryrun=dryrun)
    # ignore nested join views
    lookml["views"] = [lookml["views"][0]]

    # Prefer the release channel table, falling back to the first table.
    table = next(
        (entry for entry in self.tables if entry.get("channel") == "release"),
        self.tables[0],
    )["table"]

    dimensions = self.get_dimensions(table, v1_name, dryrun=dryrun)
    dimension_names = {dimension["name"] for dimension in dimensions}
    client_id_field = self.get_client_id(dimensions, table)

    # iterate over all of the glean metrics and generate views for unnested
    # fields as necessary.
    # Keying by view name deduplicates the definitions, because somehow a few
    # entries make it in twice e.g.
    # metrics__metrics__labeled_counter__media_audio_init_failure
    # (the last definition for a name wins).
    views_by_name: Dict[str, Dict[str, Any]] = {}
    for metric in self._get_glean_metrics(v1_name):
        looker_name = self._to_looker_name(metric)
        if looker_name not in dimension_names:
            continue  # skip metrics with no matching dimension
        if metric.type != "labeled_counter":
            continue
        for view in self._labeled_counter_views(
            metric, looker_name, table, client_id_field
        ):
            views_by_name[view["name"]] = view

    view_definitions = sorted(views_by_name.values(), key=lambda view: view["name"])

    # Keep ``table`` as the fully qualified id; unpack into separate names.
    project, dataset, table_id = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table_id,
    ).get_table_schema()
    nested_views = lookml_utils._generate_nested_dimension_views(
        table_schema, self.name
    )

    lookml["views"] += view_definitions + nested_views
    return lookml

def _labeled_counter_views(
    self,
    metric: GleanProbe,
    looker_name: str,
    table: str,
    client_id_field: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the join view, and the suggest view if enabled, for one labeled counter.

    Args:
        metric: The labeled counter probe.
        looker_name: Looker dimension name of the metric.
        table: Fully qualified table id queried by the suggest view.
        client_id_field: Name of the client id dimension, or ``None`` for
            pings with a minimal Glean schema.

    Returns:
        ``[join_view]``, or ``[join_view, suggest_view]`` when ``looker_name``
        is listed in ``SUGGESTS_FOR_LABELED_COUNTERS``.
    """
    view_name = f"{self.name}__{looker_name}"
    suggest_name = f"suggest__{view_name}"
    category, name = (
        slug_to_title(part) for part in self._get_category_and_name(metric)
    )
    metric_hidden = "no" if metric.is_in_source() else "yes"
    with_suggest = looker_name in SUGGESTS_FOR_LABELED_COUNTERS

    measures = [
        {
            "name": "count",
            "type": "sum",
            "sql": "${value}",
            "hidden": metric_hidden,
        }
    ]
    if client_id_field is not None:
        # client_id field is missing for pings with minimal Glean schema
        measures.append(
            {
                "name": "client_count",
                "type": "count_distinct",
                "sql": f"case when ${{value}} > 0 then ${{{self.name}.{client_id_field}}} end",
                "hidden": metric_hidden,
            }
        )

    # Only the suggest_* keys differ between the two variants of the label
    # dimension; "hidden" always comes last to keep the key order stable.
    label_dimension: Dict[str, Any] = {
        "name": "label",
        "type": "string",
        "sql": "${TABLE}.key",
    }
    if with_suggest:
        label_dimension["suggest_explore"] = suggest_name
        label_dimension["suggest_dimension"] = f"{suggest_name}.key"
    label_dimension["hidden"] = metric_hidden

    join_view: Dict[str, Any] = {
        "name": view_name,
        "label": f"{category} - {name}",
        "dimensions": [
            {
                "name": "document_id",
                "type": "string",
                "sql": f"${{{self.name}.document_id}}",
                "hidden": "yes",
            },
            # labeled counters need a primary key that incorporates
            # their labels, otherwise we get jumbled results:
            # https://github.com/mozilla/lookml-generator/issues/171
            {
                "name": "document_label_id",
                "type": "string",
                "sql": f"${{{self.name}.document_id}}-${{label}}",
                "primary_key": "yes",
                "hidden": "yes",
            },
            {
                "name": "value",
                "type": "number",
                "sql": "${TABLE}.value",
                "hidden": "yes",
            },
            label_dimension,
        ],
        "measures": measures,
    }
    if not with_suggest:
        return [join_view]

    column = metric.id.replace(".", "_")
    suggest_view = {
        "name": suggest_name,
        "derived_table": {
            "sql": dedent(
                f"""
                select
                m.key,
                count(*) as n
                from {table} as t,
                unnest(metrics.{metric.type}.{column}) as m
                where date(submission_timestamp) > date_sub(current_date, interval 30 day)
                and sample_id = 0
                group by key
                order by n desc
                """
            )
        },
        "dimensions": [{"name": "key", "type": "string", "sql": "${TABLE}.key"}],
    }
    return [join_view, suggest_view]
def _get_links(self, dimension: dict) -> List[Dict[str, str]]:
    """Build the Glean Dictionary link annotation for a dimension.

    The metric name is taken from the dimension via ``_get_name``; it is used
    verbatim in the URL and title-cased in the link label.
    """
    metric_name = self._get_name(dimension)
    readable_name = slug_to_title(metric_name)
    reference_url = (
        f"https://dictionary.telemetry.mozilla.org"
        f"/apps/{self.namespace}/metrics/{metric_name}"
    )
    link = {
        "label": f"Glean Dictionary reference for {readable_name}",
        "url": reference_url,
        "icon_url": "https://dictionary.telemetry.mozilla.org/favicon.png",
    }
    return [link]
def _get_name(self, dimension: dict) -> str:
return dimension["name"].split("__")[-1]
def _get_metric_type(self, dimension: dict) -> str:
return dimension["name"].split("__")[1]
def _is_metric(self, dimension) -> bool:
return dimension["name"].startswith("metrics__")
def _get_glean_metrics(self, v1_name: Optional[str]) -> List[GleanProbe]:
if v1_name is None:
logging.error(
f"Error: Missing v1 name for ping {self.name} in namespace {self.namespace}"
)
return []
repo = next((r for r in GleanPing.get_repos() if r["name"] == v1_name))
glean_app = GleanPing(repo)
ping_probes = []
probe_ids = set()
for probe in glean_app.get_probes():
send_in_pings_snakecase = [
ping.replace("-", "_") for ping in probe.definition["send_in_pings"]
]
if self.name not in send_in_pings_snakecase:
continue
if probe.id in probe_ids:
# Some ids are duplicated, ignore them
continue
ping_probes.append(probe)
probe_ids.add(probe.id)
return ping_probes
def _get_category_and_name(self, metric: GleanProbe) -> Tuple[str, str]:
*category, name = metric.id.split(".")
category = "_".join(category)
return category, name
def _to_looker_name(self, metric: GleanProbe, suffix: str = "") -> str:
"""Convert a glean probe into a looker name."""
category, name = self._get_category_and_name(metric)
sep = "" if not category else "_"
label = name
looker_name = f"metrics__{metric.type}__{category}{sep}{label}"
if suffix:
looker_name = f"{looker_name}__{suffix}"
return looker_name
def _make_dimension(
    self, metric: GleanProbe, suffix: str, sql_map: Dict[str, Dict[str, str]]
) -> Optional[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Build the LookML dimension for a metric, or ``None`` if it has no column.

    Args:
        metric: The Glean probe to describe.
        suffix: Optional sub-field of the metric (e.g. ``"sum"``); appended to
            the looker name with ``__`` and to the item label with ``_``.
        sql_map: Mapping of looker dimension name to its ``"sql"`` and
            ``"type"``.

    Returns:
        The dimension definition, or ``None`` when the looker name computed
        for the metric is not a key of ``sql_map``.
    """
    category, name = self._get_category_and_name(metric)
    joiner = "_" if category else ""
    metric_path = f"{category}{joiner}{name}"
    # Some metric types are exposed under a different name in the table.
    exposed_type = RENAMED_METRIC_TYPES.get(metric.type, metric.type)

    looker_name = f"metrics__{exposed_type}__{metric_path}"
    item_slug = name
    if suffix:
        item_slug = f"{name}_{suffix}"
        looker_name = f"{looker_name}__{suffix}"

    if looker_name not in sql_map:
        return None
    column = sql_map[looker_name]

    group_label = slug_to_title(category) or "Glean"
    group_item_label = slug_to_title(item_slug)
    friendly_name = f"{group_label} {group_item_label}"

    dictionary_link = {
        "label": (f"Glean Dictionary reference for {friendly_name}"),
        "url": (
            f"https://dictionary.telemetry.mozilla.org"
            f"/apps/{self.namespace}/metrics/{metric_path}"
        ),
        "icon_url": "https://dictionary.telemetry.mozilla.org/favicon.png",
    }
    lookml = {
        "name": looker_name,
        "label": friendly_name,
        # metrics that are no longer in the source are hidden by default
        "hidden": "no" if metric.is_in_source() else "yes",
        "sql": column["sql"],
        "type": column["type"],
        "group_label": group_label,
        "group_item_label": group_item_label,
        "links": [dictionary_link],
    }

    if lookml["type"] == "time":
        # Remove any _{type} suffix from the dimension group name because each timeframe
        # will add a _{type} suffix to its individual dimension name.
        lookml["name"] = re.sub("_(date|time(stamp)?)$", "", looker_name)
        lookml["timeframes"] = [
            "raw",
            "time",
            "date",
            "week",
            "month",
            "quarter",
            "year",
        ]
        # Dimension groups should not be nested (see issue #82), and links
        # are not supported for dimension groups.
        for unsupported_key in ("group_label", "group_item_label", "links"):
            del lookml[unsupported_key]

    # remove some elements from the definition if we're handling a labeled
    # counter, as an initial join dimension
    if metric.type == "labeled_counter":
        # this field is not used since labeled counters are maps
        del lookml["type"]
        lookml["hidden"] = "yes"

    if metric.description:
        lookml["description"] = metric.description

    return lookml
def _get_metric_dimensions(
self, metric: GleanProbe, sql_map: Dict[str, Dict[str, str]]
) -> Iterable[Optional[Dict[str, Union[str, List[Dict[str, str]]]]]]:
if metric.type == "rate":
for suffix in ("numerator", "denominator"):
yield self._make_dimension(metric, suffix, sql_map)
elif metric.type in DISTRIBUTION_TYPES:
yield self._make_dimension(metric, "sum", sql_map)
elif metric.type == "timespan":
yield self._make_dimension(metric, "value", sql_map)
elif metric.type in ALLOWED_TYPES:
yield self._make_dimension(metric, "", sql_map)
def _get_glean_metric_dimensions(
self, all_fields: List[dict], v1_name: Optional[str]
):
sql_map = {
f["name"]: {"sql": f["sql"], "type": f.get("type", "string")}
for f in all_fields
}
metrics = self._get_glean_metrics(v1_name)
return [
dimension
for metric in metrics
for dimension in self._get_metric_dimensions(metric, sql_map)
if dimension is not None
]
def _add_link(self, dimension):
annotations = {}
if self._is_metric(dimension) and not self._get_metric_type(
dimension
).startswith("labeled"):
annotations["links"] = self._get_links(dimension)
return dict(dimension, **annotations)
def get_dimensions(
    self, table, v1_name: Optional[str], dryrun
) -> List[Dict[str, Any]]:
    """Get the set of dimensions for this view.

    The Glean metric dimensions come first, followed by every parent-class
    field whose name does not start with ``metrics__`` (passed through
    ``_add_link``). Names are unique in the result: when a name occurs more
    than once, the later definition replaces the earlier one.
    """
    all_fields = super().get_dimensions(table, v1_name, dryrun=dryrun)

    metric_dimensions = self._get_glean_metric_dimensions(all_fields, v1_name)
    other_dimensions = [
        self._add_link(field)
        for field in all_fields
        if not field["name"].startswith("metrics__")
    ]

    # later entries will override earlier entries, if there are duplicates
    by_name = {}
    for field in metric_dimensions + other_dimensions:
        by_name[field["name"]] = field
    return list(by_name.values())
def get_measures(
    self, dimensions: List[dict], table: str, v1_name: Optional[str]
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Generate measures from a list of dimensions.

    Starts from the measures of the parent class and adds, for every counter
    metric dimension, a ``sum`` measure plus (when the view has a client id
    field) a ``<name>_client_count`` distinct count of clients with a
    positive value.

    Raise ClickException if dimensions result in duplicate measures.
    """
    measures = super().get_measures(dimensions, table, v1_name)
    client_id_field = self.get_client_id(dimensions, table)

    for dimension in dimensions:
        if not self._is_metric(dimension):
            continue
        if self._get_metric_type(dimension) != "counter":
            continue

        # handle the counters in the metric ping
        name = self._get_name(dimension)
        dimension_name = dimension["name"]
        measures.append(
            {
                "name": name,
                "type": "sum",
                "sql": f"${{{dimension_name}}}",
                "links": self._get_links(dimension),
            }
        )
        if client_id_field is None:
            continue
        measures.append(
            {
                "name": f"{name}_client_count",
                "type": "count_distinct",
                "filters": [{dimension_name: ">0"}],
                "sql": f"${{{client_id_field}}}",
                "links": self._get_links(dimension),
            }
        )

    # check if there are any duplicate values
    occurrences = Counter(measure["name"] for measure in measures)
    duplicates = [name for name, count in occurrences.items() if count > 1]
    if duplicates:
        raise click.ClickException(
            f"duplicate measures {duplicates!r} for table {table!r}"
        )

    return measures