in probe_scraper/glean_checks.py [0:0]
def check_for_duplicate_metrics(repositories, metrics_by_repo, emails):
    """
    Check for duplicate metric names across all libraries used by a particular
    application.

    It only checks for metrics that exist in the latest (HEAD) commit in each
    repo, so that it's possible to remove (or disable) the metric in the latest
    commit and not have this check repeatedly fail.

    Args:
        repositories: iterable of repository objects exposing ``name``,
            ``library_names``, ``dependencies`` and ``notification_emails``.
        metrics_by_repo: mapping of repo name -> {metric_name: metric dict},
            where each metric dict has a ``history`` list whose entries carry
            ``dates.last`` and ``notification_emails``.
        emails: dict of pending e-mails, mutated in place when duplicates are
            found.

    Returns:
        True if duplicates were found (and e-mails queued), False otherwise.

    Raises:
        MissingDependencyError: if a repo lists a dependency no repository
            provides as a library.
    """
    found_duplicates = False

    # Map each library name to the repo providing it, and index repos by name
    # for the notification lookup below.
    repo_by_library_name = {}
    repo_by_name = {}
    for repo in repositories:
        for library_name in repo.library_names or []:
            repo_by_library_name[library_name] = repo.name
        repo_by_name[repo.name] = repo

    for repo in repositories:
        for library_name in repo.dependencies:
            if library_name not in repo_by_library_name:
                raise MissingDependencyError(
                    f"{repo.name} missing dependency {library_name}"
                )
        # The repo itself plus the repos providing each of its dependencies.
        dependencies = [repo.name] + [
            repo_by_library_name[library_name] for library_name in repo.dependencies
        ]

        # metric name -> list of dependency repo names defining it at HEAD
        metric_sources = {}
        for dependency in dependencies:
            metrics = metrics_by_repo[dependency]
            # skip if no metrics
            if not metrics:
                continue
            # Otherwise look for the latest timestamp for all metrics --
            # metrics which don't appear in the latest can be assumed to
            # no longer be present.
            last_timestamp = max(
                metric["history"][-1]["dates"]["last"] for metric in metrics.values()
            )
            for metric_name, metric in metrics.items():
                if metric["history"][-1]["dates"]["last"] == last_timestamp:
                    metric_sources.setdefault(metric_name, []).append(dependency)

        duplicate_sources = {}
        for metric_name, sources in metric_sources.items():
            # Exempt cases when one of the sources is Geckoview Streaming to
            # avoid false positive duplication across app channels.
            # Temporarily exempt cases when one of the sources is the server
            # compat library to avoid raising alarm for metrics defined in
            # fxa's custom ping.
            sources = [
                dep
                for dep in sources
                if "engine-gecko" not in dep
                and "glean-server-metrics-compat" not in dep
            ]
            # Allow-listed metric/dependency combinations are not duplicates.
            if metric_name in SKIP_METRICS and any(
                dep in sources for dep in SKIP_METRICS[metric_name]
            ):
                continue
            if len(sources) > 1:
                duplicate_sources[metric_name] = sources

        if not duplicate_sources:
            continue

        found_duplicates = True

        addresses = set()
        duplicate_lines = []
        for name, sources in duplicate_sources.items():
            duplicate_lines.append(
                "- {!r} defined more than once in {}".format(
                    name, ", ".join(sorted(sources))
                )
            )
            for source in sources:
                # Send to the repository contacts
                addresses.update(repo_by_name[source].notification_emails)
                # Also send to the metric's contacts
                for history_entry in metrics_by_repo[source][name]["history"]:
                    addresses.update(history_entry["notification_emails"])

        emails[f"duplicate_metrics_{repo.name}"] = {
            "emails": [
                {
                    "subject": "Glean: Duplicate metric identifiers detected",
                    "message": DUPLICATE_METRICS_EMAIL_TEMPLATE.format(
                        duplicates="\n".join(duplicate_lines), repo=repo
                    ),
                }
            ],
            "addresses": list(addresses),
        }

    return found_duplicates