sync/datahub/metrichub_glossary.py (140 lines of code) (raw):
"""Builds the metric-hub glossary YAML file for syncing to DataHub."""
import itertools
import operator
from collections import defaultdict
from os import linesep
from typing import Any, Dict, List, Optional

import yaml
from datahub.emitter.mce_builder import make_term_urn, make_dataset_urn
from metric_config_parser.metric import MetricLevel

from sync.metrichub import (
    METRIC_HUB_REPO_URL,
    get_metric_definitions,
    MetricHubDefinition,
)
# Path of the glossary YAML document written by main().
GLOSSARY_FILENAME = "metric_hub_glossary.yaml"
# Path of the YAML file written by main() that lists, per dataset URN, the
# glossary term URNs attached to it.
TABLE_TO_METRIC_FILENAME = "datasets.yaml"
# Base URL that every generated Looker explore link starts with.
LOOKER_EXPLORE_URL = "https://mozilla.cloud.looker.com/explore"
def _build_metric_dict(metric: MetricHubDefinition) -> Dict:
    """Turn a single metric definition into a glossary term mapping.

    The term's ``description`` is Markdown assembled from the optional parts
    of the metric, in this order: deprecation banner, friendly-name heading,
    metric level, description, SQL definition, Looker explore link and the
    per-statistic Looker links. Parts the metric does not provide are skipped.

    Args:
        metric: The metric definition to render.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``owners``
        and ``term_source``.
    """
    sections = []

    if metric.deprecated:
        sections.append("#### ⚠️ **This metric has been deprecated**\n\n")

    if metric.friendly_name:
        sections.append(f"## {metric.friendly_name} \n\n")

    if metric.level:
        level_link = _get_metric_level_link_text(metric.level)
        sections.append(f"**Metric Level:** {level_link}\n\n")

    if metric.description:
        # Flatten the description so it renders as one Markdown paragraph.
        flattened = metric.description.strip().replace(linesep, " ")
        sections.append(f"{flattened}\n\n")

    if metric.sql_definition:
        sql = metric.sql_definition.strip()
        sections.append(f"**SQL Definition:**\n```sql\n{sql}\n```\n\n")

    looker_link = _get_looker_explore_link(metric)
    if looker_link:
        sections.append("**Explore this metric in Looker:**\n")
        sections.append(looker_link)

    if metric.statistics:
        sections.append("\n")
        sections.append("\n".join(_get_looker_statistics_links(metric)))

    term = {
        "id": metric.urn,
        "name": metric.display_name,
        "description": "".join(sections),
        "owners": {"users": metric.owners},
        "term_source": "EXTERNAL",
    }
    return term
def _get_metric_level_link_text(level: MetricLevel) -> str:
    """Return a Markdown link to the glossary term describing ``level``.

    Args:
        level: The metric level to link to.

    Returns:
        ``[label](url)`` followed by a blank line for the gold, silver and
        bronze levels; an empty string for any other value.
    """
    base_url = "https://mozilla.acryl.io/glossaryTerm"
    # (level, glossary term id, link label), checked in this order.
    known_levels = (
        (MetricLevel.GOLD, "5fbb70ef-0a69-4db5-a301-907dd13148bc", "🥇Gold Metric"),
        (MetricLevel.SILVER, "548b65c5-581c-4572-b544-8bd1cbbdc7a5", "🥈Silver Metric"),
        (MetricLevel.BRONZE, "3be839fc-9782-433f-8e82-0632dc780c1c", "🥉Bronze Metric"),
    )

    for known_level, term_id, label in known_levels:
        if level == known_level:
            term_urn = make_term_urn(term_id)
            return f"[{label}]({base_url}/{term_urn})\n\n"

    return ""
def _get_looker_explore_link(metric: MetricHubDefinition) -> Optional[str]:
    """Build the Markdown link to the metric in its Looker explore.

    The explore is named ``metric_definitions_<data_source>`` and the link
    pre-selects the explore's ``submission_date`` field and the metric itself.

    Args:
        metric: The metric to link to; ``data_source``, ``name`` and
            ``product`` are read.

    Returns:
        A single Markdown link of the form ``[title](url)``, or ``None`` when
        the metric has no data source.
    """
    if metric.data_source is None:
        return None

    explore = f"metric_definitions_{metric.data_source}"
    fields = ",".join(
        (
            f"{explore}.submission_date",
            f"{explore}.{metric.name}",
        )
    )
    title = f"In explore {explore} as {metric.name}"
    url = f"{LOOKER_EXPLORE_URL}/{metric.product}/{explore}?fields={fields}"
    return f"[{title}]({url})"
def _get_looker_statistics_links(metric: MetricHubDefinition) -> List[str]:
    """Build one Markdown Looker link per statistic of the metric.

    Each link opens the ``metric_definitions_<data_source>`` explore with the
    ``submission_date`` field and the ``<metric>_<statistic>`` field selected
    and ``toggle=vis`` appended to the query string.

    Args:
        metric: The metric whose ``statistics`` are iterated.

    Returns:
        The links in the order of ``metric.statistics``; an empty list when
        the metric has no data source.
    """

    def _statistic_link(statistic) -> str:
        explore = f"metric_definitions_{metric.data_source}"
        date_field = f"{explore}.submission_date"
        statistic_field = f"{explore}.{metric.name}_{statistic.name}"
        label = f"{metric.title_cased_name} {statistic.title_cased_name}"
        explore_url = f"{LOOKER_EXPLORE_URL}/{metric.product}/{explore}"
        query = f"fields={date_field},{statistic_field}&toggle=vis"
        return f"[{label}]({explore_url}?{query})"

    return [
        _statistic_link(statistic)
        for statistic in metric.statistics
        if metric.data_source is not None
    ]
def _build_product_dict(product: str, metrics: List[MetricHubDefinition]) -> Dict:
    """Build the glossary node that holds all terms of one product.

    Args:
        product: Product name; used as the node name and in its description.
        metrics: The metrics that belong to the product.

    Returns:
        A dict with the keys ``name``, ``description`` and ``terms``, where
        ``terms`` holds one term mapping per metric.
    """
    node = {
        "name": product,
        "description": f"{product} metrics",
        "terms": list(map(_build_metric_dict, metrics)),
    }
    return node
def _generate_table_to_term_data(
    metrics: List[MetricHubDefinition],
) -> List[Dict[str, Any]]:
    """Invert the metric -> tables relation into dataset -> glossary terms.

    Every entry of a metric's ``bigquery_tables`` is converted to a BigQuery
    dataset URN, and the metric's URN is recorded under it. Metrics whose
    ``bigquery_tables`` is ``None`` contribute nothing.

    Args:
        metrics: The metric definitions to index.

    Returns:
        One ``{"urn": ..., "glossary_terms": [...]}`` dict per dataset URN,
        in the order the datasets were first encountered.
    """
    terms_by_dataset = defaultdict(list)

    for metric in metrics:
        tables = metric.bigquery_tables
        if tables is not None:
            for table in tables:
                dataset_urn = make_dataset_urn(platform="bigquery", name=table)
                terms_by_dataset[dataset_urn].append(metric.urn)

    return [
        {"urn": dataset_urn, "glossary_terms": term_urns}
        for dataset_urn, term_urns in terms_by_dataset.items()
    ]
def main() -> None:
    """Generate the glossary and dataset-mapping YAML files.

    Fetches the metric definitions, groups them into one glossary node per
    product under a top-level "Metric Hub" node, and writes:

    * ``GLOSSARY_FILENAME`` - the glossary document.
    * ``TABLE_TO_METRIC_FILENAME`` - the dataset URN -> glossary terms list.

    Both files are created in the current working directory and overwritten
    if they already exist.
    """
    # The definitions are traversed twice (grouping and dataset mapping), so
    # materialize them once in case get_metric_definitions() hands back a
    # one-shot iterator.
    metric_hub_definitions = list(get_metric_definitions())

    # itertools.groupby only merges *adjacent* items with equal keys, so the
    # input has to be sorted by the grouping key; otherwise a product whose
    # metrics are not contiguous would yield several glossary nodes. sorted()
    # is stable, so the order of metrics within a product is preserved.
    by_product = operator.attrgetter("product")
    product_nodes = [
        _build_product_dict(product, list(product_metrics))
        for product, product_metrics in itertools.groupby(
            sorted(metric_hub_definitions, key=by_product), by_product
        )
    ]

    glossary = {
        "version": 1,
        "source": "Metric-Hub",
        "url": METRIC_HUB_REPO_URL,
        "owners": [],
        "nodes": [
            {
                "name": "Metric Hub",
                "description": "Central hub for metric definitions that are considered the source of truth.",  # noqa: E501
                "nodes": product_nodes,
            }
        ],
    }

    # Uses the definitions in their original order so the dataset file keeps
    # the ordering it had before the grouping fix.
    datasets_yaml_data = _generate_table_to_term_data(metric_hub_definitions)

    with open(GLOSSARY_FILENAME, "w+") as f:
        yaml.dump(glossary, f)

    with open(TABLE_TO_METRIC_FILENAME, "w+") as f:
        # sort_keys=False keeps "urn" ahead of "glossary_terms" in each entry.
        yaml.dump(datasets_yaml_data, f, sort_keys=False)
# Generate the YAML files only when this module is executed as a script.
if __name__ == "__main__":
    main()