probe_scraper/check_repositories.py (80 lines of code) (raw):

"""Validate the repository definitions in ``repositories.yaml``.

Running this module performs three checks and prints a summary:

* every metrics file of a non-deprecated repository can be fetched from
  ``raw.githubusercontent.com`` (pairs listed in ``EXPECTED_MISSING_FILES``
  are skipped);
* every dependency of a repository is a library name declared by some
  repository;
* among non-deprecated repositories, each ``(app_id, channel)`` pair is
  used at most once.

Errors collected for repositories flagged as ``prototype`` are not reported.
The process exits with status 1 if any reported error was found.
"""

import os
import re
import sys
from collections import defaultdict
from typing import Set, Tuple

import git
import requests as reqs

from .parsers.repositories import RepositoriesParser

GIT = git.Git()
# Applied with ``match`` to the output of ``git ls-remote --symref <url> HEAD``;
# the capture group is the name of the branch HEAD points to.
GIT_BRANCH_PATTERN = re.compile("ref: refs/heads/([^\t]+)\tHEAD")
GITHUB_RAW_URL = "https://raw.githubusercontent.com"
REPOSITORIES = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "repositories.yaml"
)
# Upper bound for each HTTP request so an unresponsive server cannot hang the check.
REQUEST_TIMEOUT_SECONDS = 30

# (repository name, metrics file path) pairs that are known to be missing
# and must not be reported as errors.
EXPECTED_MISSING_FILES: Set[Tuple[str, str]] = {
    ("support-migration", "components/support/migration/metrics.yaml"),
    ("viu-politica", "source/telemetry/metrics.yaml"),
}

validation_errors = []
repos = RepositoriesParser().parse(REPOSITORIES)

# app_id -> channel -> number of non-deprecated repositories using that pair
app_id_channels = defaultdict(lambda: defaultdict(int))

# library name -> name of the repository that declares it
repo_by_library_name = {}
for repo in repos:
    for library_name in repo.library_names or []:
        repo_by_library_name[library_name] = repo.name

for repo in repos:
    metrics_files = repo.get_metrics_file_paths()
    temp_errors = []

    if repo.app_id and repo.channel and not repo.deprecated:
        app_id_channels[repo.app_id][repo.channel] += 1

    # Missing files are ignored for deprecated apps and for the explicitly
    # expected (repo, file) pairs, so neither triggers any network access.
    if repo.deprecated:
        files_to_check = []
    else:
        files_to_check = [
            metric_file
            for metric_file in metrics_files
            if (repo.name, metric_file) not in EXPECTED_MISSING_FILES
        ]

    # Resolve the branch once per repository (not once per file), and only
    # when there is at least one file to fetch.
    branch = repo.branch
    if files_to_check and branch is None:
        try:
            match = GIT_BRANCH_PATTERN.match(
                GIT.ls_remote("--symref", repo.url, "HEAD")
            )
        except git.GitCommandError:
            # A failing `git ls-remote` is reported like an unparsable answer
            # instead of aborting the whole validation run.
            match = None
        if match is None:
            temp_errors.append("Failed to get default branch from git for " + repo.url)
            # Without a branch no file URL can be built for this repository.
            files_to_check = []
        else:
            branch = match.group(1)

    raw_base_url = repo.url.replace("https://github.com", GITHUB_RAW_URL)
    for metric_file in files_to_check:
        temp_url = raw_base_url + "/" + branch + "/" + metric_file
        try:
            response = reqs.get(temp_url, timeout=REQUEST_TIMEOUT_SECONDS)
        except reqs.RequestException as exc:
            # Timeouts and connection failures become a per-repository error
            # so the remaining repositories are still checked.
            temp_errors.append(f"Failed to fetch {temp_url}: {exc}")
            continue
        if response.status_code != 200:
            temp_errors.append("Metrics file was not found at " + temp_url)

    for library_name in repo.dependencies:
        if library_name not in repo_by_library_name:
            temp_errors.append(f"Dependency not found: {library_name}")

    # Problems in prototype repositories are intentionally not reported.
    if temp_errors and not repo.prototype:
        validation_errors.append({"repo": repo.name, "errors": temp_errors})

# Ensure non-deprecated channels are uniquely named
duplication_errors = []
for app_id, channels in app_id_channels.items():
    for channel_name, num in channels.items():
        if num > 1:
            duplication_errors.append(
                f"Non-deprecated channel names must be unique, found {channel_name} {num} "
                f"times for {app_id}"
            )

if validation_errors:
    print("\nSummary of validation errors:\n")
    print(f"{len(validation_errors)} repositories had problems\n")
    for error in validation_errors:
        print(f"\nErrors found in {error['repo']}:\n")
        for line_errors in error["errors"]:
            print(line_errors)

if duplication_errors:
    print("\nDuplicate channel names found:\n")
    for duplication_error in duplication_errors:
        print(duplication_error)

if validation_errors or duplication_errors:
    # sys.exit rather than the site-provided `exit`, which is intended for
    # interactive sessions and is not guaranteed to exist.
    sys.exit(1)