def check_for_duplicate_metrics()

in probe_scraper/glean_checks.py


def check_for_duplicate_metrics(repositories, metrics_by_repo, emails):
    """
    Checks for duplicate metric names across all libraries used by a particular application.
    It only checks for metrics that exist in the latest (HEAD) commit in each repo, so that
    it's possible to remove (or disable) the metric in the latest commit and not have this
    check repeatedly fail.
    If duplicates are found, e-mails are queued and this returns True.
    """
    found_duplicates = False

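    # Map each exported library name to the name of the repo that provides it,
    # and each repo name to its Repository object, so dependencies can be
    # resolved below.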
    repo_by_library_name = {}
    repo_by_name = {}
    for repo in repositories:
        for library_name in repo.library_names or []:
            repo_by_library_name[library_name] = repo.name
        repo_by_name[repo.name] = repo

    for repo in repositories:
        for library_name in repo.dependencies:
            if library_name not in repo_by_library_name:
                raise MissingDependencyError(
                    f"{repo.name} missing dependency {library_name}"
                )
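        # Check the app's own metrics alongside those of every library it
        # depends on.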
        dependencies = [repo.name] + [
            repo_by_library_name[library_name] for library_name in repo.dependencies
        ]

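        # metric_name -> list of repos (the app itself plus its libraries)
        # that still define that metric as of their latest timestamp.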
        metric_sources = {}
        for dependency in dependencies:
            # skip if no metrics
            if not metrics_by_repo[dependency]:
                continue
            # otherwise, find the latest timestamp across all metrics --
            # metrics whose last-seen date is older than that can be assumed
            # to no longer be present
            last_timestamp = max(
                metric["history"][-1]["dates"]["last"]
                for metric in metrics_by_repo[dependency].values()
            )
            for metric_name, metric in metrics_by_repo[dependency].items():
                if metric["history"][-1]["dates"]["last"] == last_timestamp:
                    metric_sources.setdefault(metric_name, []).append(dependency)

        duplicate_sources = {}
        for k, v in metric_sources.items():
            # Exempt cases where one of the sources is GeckoView Streaming to
            # avoid false-positive duplication across app channels.
            # Temporarily exempt cases where one of the sources is the server
            # compat library, to avoid raising alarms for metrics defined in
            # fxa's custom ping.
            v = [
                dep
                for dep in v
                if (
                    "engine-gecko" not in dep
                    and "glean-server-metrics-compat" not in dep
                )
            ]

            if k in SKIP_METRICS:
                potential_deps = SKIP_METRICS[k]
                if any(dep in v for dep in potential_deps):
                    continue

            if len(v) > 1:
                duplicate_sources[k] = v

        if not duplicate_sources:
            continue

        found_duplicates = True

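        # Build the e-mail body (one bullet per duplicate) and collect the
        # union of repository and metric contact addresses to notify.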
        addresses = set()
        duplicates = []
        for name, sources in duplicate_sources.items():
            duplicates.append(
                "- {!r} defined more than once in {}".format(
                    name, ", ".join(sorted(sources))
                )
            )

            for source in sources:
                # Send to the repository contacts
                addresses.update(repo_by_name[source].notification_emails)

                # Also send to the metric's contacts
                for history_entry in metrics_by_repo[source][name]["history"]:
                    addresses.update(history_entry["notification_emails"])

        duplicates = "\n".join(duplicates)

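        # Queue one notification entry per application, keyed by repo name.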
        emails[f"duplicate_metrics_{repo.name}"] = {
            "emails": [
                {
                    "subject": "Glean: Duplicate metric identifiers detected",
                    "message": DUPLICATE_METRICS_EMAIL_TEMPLATE.format(
                        duplicates=duplicates, repo=repo
                    ),
                }
            ],
            "addresses": list(addresses),
        }

    return found_duplicates
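

A minimal usage sketch (not part of the source): the Repository objects are stood in
for by SimpleNamespace instances exposing only the attributes the check reads (name,
library_names, dependencies, notification_emails), and the repo names, metric name,
and e-mail addresses are made up. It assumes the fabricated metric name is not listed
in the module-level SKIP_METRICS exemptions.

from types import SimpleNamespace

from probe_scraper.glean_checks import check_for_duplicate_metrics

# Hypothetical application repo that depends on a library exporting "glean-core".
app = SimpleNamespace(
    name="my-app",
    library_names=[],
    dependencies=["glean-core"],
    notification_emails=["app-owners@example.com"],
)
# Hypothetical library repo providing "glean-core".
lib = SimpleNamespace(
    name="glean",
    library_names=["glean-core"],
    dependencies=[],
    notification_emails=["glean-team@example.com"],
)

# Both repos define the same metric name at their latest timestamp
# (assumes "app.startup_count" is not in SKIP_METRICS).
history_entry = {
    "dates": {"last": "2024-01-01 00:00:00"},
    "notification_emails": ["metric-owner@example.com"],
}
metrics_by_repo = {
    "my-app": {"app.startup_count": {"history": [history_entry]}},
    "glean": {"app.startup_count": {"history": [history_entry]}},
}

emails = {}
found = check_for_duplicate_metrics([app, lib], metrics_by_repo, emails)
# found is True, and emails["duplicate_metrics_my-app"] now holds the queued
# message addressed to the repository and metric contacts.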