generator/views/glean_ping_view.py (353 lines of code) (raw):

"""Class to describe a Glean Ping View."""

import logging
import re
from collections import Counter
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import click
from mozilla_schema_generator.glean_ping import GleanPing
from mozilla_schema_generator.probes import GleanProbe

from . import lookml_utils
from .lookml_utils import slug_to_title
from .ping_view import PingView

# Metric types whose dimension is looked up with a "sum" suffix
# (see `_get_metric_dimensions`).
DISTRIBUTION_TYPES = {
    "timing_distribution",
    "memory_distribution",
    "custom_distribution",
}

# Metric types for which `_get_metric_dimensions` yields a dimension.
ALLOWED_TYPES = DISTRIBUTION_TYPES | {
    "boolean",
    "labeled_boolean",
    "counter",
    "labeled_counter",
    "datetime",
    "jwe",
    "quantity",
    "string",
    "labeled_string",
    "rate",
    "timespan",
    "uuid",
    "url",
    "text",
    "labeled_quantity",
}

# Bug 1737656 - some metric types are exposed under different names
# We need to map to the new name when building dimensions.
RENAMED_METRIC_TYPES = {
    "jwe": "jwe2",
    "text": "text2",
    "url": "url2",
}

# View names that `GleanPingView.from_db_views` filters out.
DISALLOWED_PINGS = {"events"}

# List of labeled counter names for which a suggest explore should be generated.
# Generating suggest explores for all labeled counters slows down Looker.
SUGGESTS_FOR_LABELED_COUNTERS = {"metrics__labeled_counter__glean_error_invalid_label"}


class GleanPingView(PingView):
    """A view on a ping table for an application using the Glean SDK."""

    type: str = "glean_ping_view"
    allow_glean: bool = True

    @classmethod
    def from_db_views(klass, *args, **kwargs):
        """Generate GleanPingViews from db views.

        Yields every view produced by the parent implementation except those
        whose name is listed in `DISALLOWED_PINGS`.
        """
        for view in super().from_db_views(*args, **kwargs):
            if view.name not in DISALLOWED_PINGS:
                yield view

    def to_lookml(self, v1_name: Optional[str], dryrun) -> Dict[str, Any]:
        """Generate LookML for this view.

        The Glean views include labeled metrics, which need to be joined
        against the view in the explore.

        The returned dict is the parent's LookML with its "views" list reduced
        to the first (main) view, followed by one join view per labeled
        counter (plus a suggest view for the counters listed in
        `SUGGESTS_FOR_LABELED_COUNTERS`) and the nested dimension views
        generated from the table schema.
        """
        lookml = super().to_lookml(v1_name, dryrun=dryrun)
        # ignore nested join views
        lookml["views"] = [lookml["views"][0]]

        # iterate over all of the glean metrics and generate views for unnested
        # fields as necessary. Append them to the list of existing view
        # definitions.

        # Prefer the table of the "release" channel; fall back to the first one.
        table = next(
            (
                candidate
                for candidate in self.tables
                if candidate.get("channel") == "release"
            ),
            self.tables[0],
        )["table"]
        dimensions = self.get_dimensions(table, v1_name, dryrun=dryrun)
        dimension_names = {dimension["name"] for dimension in dimensions}

        client_id_field = self.get_client_id(dimensions, table)

        view_definitions: List[Dict[str, Any]] = []
        metrics = self._get_glean_metrics(v1_name)
        for metric in metrics:
            looker_name = self._to_looker_name(metric)
            if looker_name not in dimension_names:
                continue  # skip metrics with no matching dimension
            if metric.type != "labeled_counter":
                continue  # only labeled counters get a dedicated join view

            view_name = f"{self.name}__{looker_name}"
            suggest_name = f"suggest__{view_name}"

            category, name = [
                slug_to_title(v) for v in self._get_category_and_name(metric)
            ]
            view_label = f"{category} - {name}"
            metric_hidden = "no" if metric.is_in_source() else "yes"

            measures = [
                {
                    "name": "count",
                    "type": "sum",
                    "sql": "${value}",
                    "hidden": metric_hidden,
                }
            ]

            if client_id_field is not None:
                # client_id field is missing for pings with minimal Glean schema
                measures.append(
                    {
                        "name": "client_count",
                        "type": "count_distinct",
                        "sql": f"case when ${{value}} > 0 then ${{{self.name}.{client_id_field}}} end",
                        "hidden": metric_hidden,
                    }
                )

            join_view: Dict[str, Any] = {
                "name": view_name,
                "label": view_label,
                "dimensions": [
                    {
                        "name": "document_id",
                        "type": "string",
                        "sql": f"${{{self.name}.document_id}}",
                        "hidden": "yes",
                    },
                    # labeled counters need a primary key that incorporates
                    # their labels, otherwise we get jumbled results:
                    # https://github.com/mozilla/lookml-generator/issues/171
                    {
                        "name": "document_label_id",
                        "type": "string",
                        "sql": f"${{{self.name}.document_id}}-${{label}}",
                        "primary_key": "yes",
                        "hidden": "yes",
                    },
                    {
                        "name": "value",
                        "type": "number",
                        "sql": "${TABLE}.value",
                        "hidden": "yes",
                    },
                ],
                "measures": measures,
            }

            if looker_name in SUGGESTS_FOR_LABELED_COUNTERS:
                join_view["dimensions"].append(
                    {
                        "name": "label",
                        "type": "string",
                        "sql": "${TABLE}.key",
                        "suggest_explore": suggest_name,
                        "suggest_dimension": f"{suggest_name}.key",
                        "hidden": metric_hidden,
                    },
                )

                suggest_view = {
                    "name": suggest_name,
                    "derived_table": {
                        "sql": dedent(
                            f"""
                            select
                                m.key,
                                count(*) as n
                            from {table} as t,
                            unnest(metrics.{metric.type}.{metric.id.replace(".", "_")}) as m
                            where date(submission_timestamp) > date_sub(current_date, interval 30 day)
                                and sample_id = 0
                            group by key
                            order by n desc
                            """
                        )
                    },
                    "dimensions": [
                        {"name": "key", "type": "string", "sql": "${TABLE}.key"}
                    ],
                }
                view_definitions += [join_view, suggest_view]
            else:
                join_view["dimensions"].append(
                    {
                        "name": "label",
                        "type": "string",
                        "sql": "${TABLE}.key",
                        "hidden": metric_hidden,
                    },
                )
                view_definitions += [join_view]

        # deduplicate view definitions, because somehow a few entries make it in
        # twice e.g. metrics__metrics__labeled_counter__media_audio_init_failure
        view_definitions = sorted(
            {v["name"]: v for v in view_definitions}.values(),
            key=lambda x: x["name"],  # type: ignore
        )

        # `table` is expected to be a fully qualified "project.dataset.table" id;
        # any other number of dot-separated parts raises ValueError here.
        project, dataset, table_id = table.split(".")
        table_schema = dryrun.create(
            project=project,
            dataset=dataset,
            table=table_id,
        ).get_table_schema()
        nested_views = lookml_utils._generate_nested_dimension_views(
            table_schema, self.name
        )

        lookml["views"] += view_definitions + nested_views

        return lookml

    def _get_links(self, dimension: dict) -> List[Dict[str, str]]:
        """Get the Glean Dictionary link annotation for a dimension.

        The metric name is taken from the last `__`-separated segment of the
        dimension name (see `_get_name`).
        """
        name = self._get_name(dimension)
        title = slug_to_title(name)
        return [
            {
                "label": (f"Glean Dictionary reference for {title}"),
                "url": (
                    f"https://dictionary.telemetry.mozilla.org"
                    f"/apps/{self.namespace}/metrics/{name}"
                ),
                "icon_url": "https://dictionary.telemetry.mozilla.org/favicon.png",
            }
        ]

    def _get_name(self, dimension: dict) -> str:
        """Return the last `__`-separated segment of the dimension name."""
        return dimension["name"].split("__")[-1]

    def _get_metric_type(self, dimension: dict) -> str:
        """Return the second `__`-separated segment of the dimension name.

        For names of the form `metrics__<type>__<name>` this is the metric
        type. Raises IndexError if the name contains no `__` separator.
        """
        return dimension["name"].split("__")[1]

    def _is_metric(self, dimension) -> bool:
        """Return whether the dimension name starts with `metrics__`."""
        return dimension["name"].startswith("metrics__")

    def _get_glean_metrics(self, v1_name: Optional[str]) -> List[GleanProbe]:
        """Return the probes of the `v1_name` app that are sent in this ping.

        Logs an error and returns an empty list when `v1_name` is None.
        Probes are de-duplicated by id, keeping the first occurrence.

        NOTE(review): the repository lookup uses `next()` without a default,
        so a `v1_name` that matches no repository raises StopIteration.
        """
        if v1_name is None:
            logging.error(
                f"Error: Missing v1 name for ping {self.name} in namespace {self.namespace}"
            )
            return []

        # NOTE(review): raises StopIteration when no repo is named `v1_name`.
        repo = next(r for r in GleanPing.get_repos() if r["name"] == v1_name)
        glean_app = GleanPing(repo)

        ping_probes: List[GleanProbe] = []
        seen_probe_ids = set()
        for probe in glean_app.get_probes():
            # Ping names are compared in snake_case against this view's name.
            send_in_pings_snakecase = {
                ping.replace("-", "_") for ping in probe.definition["send_in_pings"]
            }
            if self.name not in send_in_pings_snakecase:
                continue
            if probe.id in seen_probe_ids:
                # Some ids are duplicated, ignore them
                continue

            ping_probes.append(probe)
            seen_probe_ids.add(probe.id)

        return ping_probes

    def _get_category_and_name(self, metric: GleanProbe) -> Tuple[str, str]:
        """Split a dotted metric id into `(category, name)`.

        The name is the last dot-separated part; the category is the remaining
        parts joined with `_` (an empty string when the id has no dot).
        """
        *category_parts, name = metric.id.split(".")
        return "_".join(category_parts), name

    def _to_looker_name(self, metric: GleanProbe, suffix: str = "") -> str:
        """Convert a glean probe into a looker name.

        The result is `metrics__<type>__<category>_<name>`, with `__<suffix>`
        appended when `suffix` is non-empty. The metric type is used as-is,
        without the `RENAMED_METRIC_TYPES` mapping.
        """
        category, name = self._get_category_and_name(metric)
        sep = "_" if category else ""
        looker_name = f"metrics__{metric.type}__{category}{sep}{name}"
        if suffix:
            looker_name = f"{looker_name}__{suffix}"
        return looker_name

    def _make_dimension(
        self, metric: GleanProbe, suffix: str, sql_map: Dict[str, Dict[str, str]]
    ) -> Optional[Dict[str, Union[str, List[Dict[str, str]]]]]:
        """Build the LookML dimension for a metric, or None if it has no field.

        `sql_map` maps looker field names to their `sql` and `type`; a metric
        whose looker name (after applying `RENAMED_METRIC_TYPES` and `suffix`)
        is not a key of `sql_map` yields None.
        """
        category, name = self._get_category_and_name(metric)
        sep = "_" if category else ""

        label = name
        metric_type = RENAMED_METRIC_TYPES.get(metric.type, metric.type)
        looker_name = f"metrics__{metric_type}__{category}{sep}{name}"
        if suffix:
            label = f"{name}_{suffix}"
            looker_name = f"{looker_name}__{suffix}"

        if looker_name not in sql_map:
            return None

        group_label = slug_to_title(category)
        group_item_label = slug_to_title(label)

        if not group_label:
            group_label = "Glean"

        friendly_name = f"{group_label} {group_item_label}"

        lookml = {
            "name": looker_name,
            "label": friendly_name,
            # metrics that are no longer in the source are hidden by default
            "hidden": "no" if metric.is_in_source() else "yes",
            "sql": sql_map[looker_name]["sql"],
            "type": sql_map[looker_name]["type"],
            "group_label": group_label,
            "group_item_label": group_item_label,
            "links": [
                {
                    "label": (f"Glean Dictionary reference for {friendly_name}"),
                    "url": (
                        f"https://dictionary.telemetry.mozilla.org"
                        f"/apps/{self.namespace}/metrics/{category}{sep}{name}"
                    ),
                    "icon_url": "https://dictionary.telemetry.mozilla.org/favicon.png",
                },
            ],
        }

        if lookml["type"] == "time":
            # Remove any _{type} suffix from the dimension group name because each timeframe
            # will add a _{type} suffix to its individual dimension name.
            lookml["name"] = re.sub("_(date|time(stamp)?)$", "", looker_name)
            lookml["timeframes"] = [
                "raw",
                "time",
                "date",
                "week",
                "month",
                "quarter",
                "year",
            ]
            # Dimension groups should not be nested (see issue #82).
            del lookml["group_label"]
            del lookml["group_item_label"]
            # Links are not supported for dimension groups.
            del lookml["links"]

        # remove some elements from the definition if we're handling a labeled
        # counter, as an initial join dimension
        if metric.type == "labeled_counter":
            # this field is not used since labeled counters are maps
            del lookml["type"]
            lookml["hidden"] = "yes"

        if metric.description:
            lookml["description"] = metric.description

        return lookml

    def _get_metric_dimensions(
        self, metric: GleanProbe, sql_map: Dict[str, Dict[str, str]]
    ) -> Iterable[Optional[Dict[str, Union[str, List[Dict[str, str]]]]]]:
        """Yield the dimension(s) for a metric, depending on its type.

        Rates yield a numerator and a denominator dimension, distribution
        types a "sum" dimension, timespans a "value" dimension and every other
        type in `ALLOWED_TYPES` a single unsuffixed dimension. Types outside
        `ALLOWED_TYPES` yield nothing. Yielded items may be None (see
        `_make_dimension`).
        """
        if metric.type == "rate":
            for suffix in ("numerator", "denominator"):
                yield self._make_dimension(metric, suffix, sql_map)
        elif metric.type in DISTRIBUTION_TYPES:
            yield self._make_dimension(metric, "sum", sql_map)
        elif metric.type == "timespan":
            yield self._make_dimension(metric, "value", sql_map)
        elif metric.type in ALLOWED_TYPES:
            yield self._make_dimension(metric, "", sql_map)

    def _get_glean_metric_dimensions(
        self, all_fields: List[dict], v1_name: Optional[str]
    ):
        """Return the dimensions of all Glean metrics that have a backing field.

        `all_fields` supplies the `sql` and `type` (defaulting to "string") of
        each field, keyed by name.
        """
        sql_map = {
            f["name"]: {"sql": f["sql"], "type": f.get("type", "string")}
            for f in all_fields
        }

        metrics = self._get_glean_metrics(v1_name)
        return [
            dimension
            for metric in metrics
            for dimension in self._get_metric_dimensions(metric, sql_map)
            if dimension is not None
        ]

    def _add_link(self, dimension):
        """Return a copy of the dimension, with links for unlabeled metrics.

        A "links" entry is added only when the dimension is a metric whose
        type does not start with "labeled"; the input dict is not modified.
        """
        annotations = {}
        if self._is_metric(dimension) and not self._get_metric_type(
            dimension
        ).startswith("labeled"):
            annotations["links"] = self._get_links(dimension)

        return dict(dimension, **annotations)

    def get_dimensions(
        self, table, v1_name: Optional[str], dryrun
    ) -> List[Dict[str, Any]]:
        """Get the set of dimensions for this view.

        Combines the Glean metric dimensions with every parent dimension whose
        name does not start with `metrics__`, de-duplicated by name.
        """
        all_fields = super().get_dimensions(table, v1_name, dryrun=dryrun)
        fields = self._get_glean_metric_dimensions(all_fields, v1_name) + [
            self._add_link(d)
            for d in all_fields
            if not d["name"].startswith("metrics__")
        ]
        # later entries will override earlier entries, if there are duplicates
        field_dict = {f["name"]: f for f in fields}
        return list(field_dict.values())

    def get_measures(
        self, dimensions: List[dict], table: str, v1_name: Optional[str]
    ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
        """Generate measures from a list of dimensions.

        When no dimension-specific measures are found, return a single "count" measure.

        In addition to the parent's measures, every `counter` metric dimension
        gets a "sum" measure and, when a client id field exists, a
        `<name>_client_count` measure.

        Raise ClickException if dimensions result in duplicate measures.
        """
        measures = super().get_measures(dimensions, table, v1_name)
        client_id_field = self.get_client_id(dimensions, table)

        for dimension in dimensions:
            is_counter = (
                self._is_metric(dimension)
                and self._get_metric_type(dimension) == "counter"
            )
            if not is_counter:
                continue

            # handle the counters in the metric ping
            name = self._get_name(dimension)
            dimension_name = dimension["name"]
            measures.append(
                {
                    "name": name,
                    "type": "sum",
                    "sql": f"${{{dimension_name}}}",
                    "links": self._get_links(dimension),
                }
            )

            if client_id_field is not None:
                measures.append(
                    {
                        "name": f"{name}_client_count",
                        "type": "count_distinct",
                        "filters": [{dimension_name: ">0"}],
                        "sql": f"${{{client_id_field}}}",
                        "links": self._get_links(dimension),
                    }
                )

        # check if there are any duplicate values
        names = [measure["name"] for measure in measures]
        duplicates = [k for k, v in Counter(names).items() if v > 1]
        if duplicates:
            raise click.ClickException(
                f"duplicate measures {duplicates!r} for table {table!r}"
            )

        return measures