probe_scraper/scrapers/git_scraper.py (362 lines of code) (raw):

# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. import re import tempfile import traceback from collections import defaultdict from dataclasses import dataclass, field from datetime import date, datetime, time, timedelta from functools import cached_property from pathlib import Path from typing import Dict, Iterable, List, Optional, Tuple, Union import git from probe_scraper.exc import ProbeScraperInvalidRequest from probe_scraper.parsers.repositories import Repository GIT_HASH_PATTERN = re.compile("([A-Fa-f0-9]){40}") # WARNING! # Changing these dates can cause files that had metrics to # stop being scraped. When the probe-info-service # stops reporting those files, the schema-generator # will not add them to the schemas, resulting in a # schema-incompatible change that breaks the pipeline. FENIX_DATE = "2019-06-04 00:00:00" MIN_DATES = { # Previous versions of the file were not schema-compatible "glean": "2019-04-11 00:00:00", "fenix": FENIX_DATE, "fenix-nightly": FENIX_DATE, "firefox-android-nightly": FENIX_DATE, "firefox-android-beta": FENIX_DATE, "firefox-android-release": FENIX_DATE, "reference-browser": "2019-04-01 00:00:00", "firefox-desktop": "2020-07-29 00:00:00", "glean-js": "2020-09-21 13:35:00", "mozilla-vpn": "2021-05-25 00:00:00", "mozilla-vpn-android": "2021-05-25 00:00:00", "rally-markup-fb-pixel-hunt": "2021-12-04 00:00:00", "rally-citp-search-engine-usage": "2022-04-15 00:00:00", "relay-backend": "2024-05-09 00:00:00", } # Some commits in projects might contain invalid metric files. # When we know these problems are fixed in later commits we can skip them. SKIP_COMMITS = { "engine-gecko": [ "9bd9d7fa6c679f35d8cbeb157ff839c63b21a2e6" # Missing schema update from v1 to v2 ], "engine-gecko-beta": [ "9bd9d7fa6c679f35d8cbeb157ff839c63b21a2e6" # Missing schema update from v1 to v2 ], "gecko": [ "43d8cf138695faae2fca0adf44c94f47fdadfca8", # Missing gfx/metrics.yaml "340c8521a54ad4d4a32dd16333676a6ff85aaec2", # Missing toolkit/components/glean/pings.yaml "4520632fe0664572c5f70688595b7721d167e2d0", # Missing toolkit/components/glean/pings.yaml "c5d5f045aaba41933622b5a187c39da0d6ab5d80", # Missing toolkit/components/glean/tags.yaml "2c475db0ffe5df6010ded8ccb9784b0645d92ebb", # Missing netwerk/protocol/http/metrics.yaml "b6dbdfec41221b0a80cc211f85abbb01e33f8692", # Missing netwerk/protocol/http/metrics.yaml "da2b7986d0b26f26cd1ac2d8c5b78b70009a24b6", # Missing netwerk/protocol/http/metrics.yaml "74a43f86ea999ec985d77eb6c3c7f766b570dd9d", # Missing dom/media/webrtc/metrics.yaml # Missing toolkit/components/pdfjs/metrics.yaml "d1d0b69871e3d38ead989d73f30563a501a448b6", "66d41da90f85d19fef2d5249c8f3058433ec4bd5", # Missing dom/pings.yaml "1e64234ac7f7303d5942deb6d90dd85cd4eb6e12", # Missing xpcom/metrics.yaml "cf06f2778f48c7f92d908dae73d48268db454e72", # Missing ipc/ipdl/metrics.yaml "bb188d821a6b3d27951ed05526ec7010d3ec0c52", # Missing ipc/ipdl/metrics.yaml "0e55b6d34c8fac3144f10f9aa450e33e4b55d520", # Missing ipc/ipdl/metrics.yaml "0b1543e85d13c30a13c57e959ce9815a3f0fa1d3", # Missing ipc/ipdl/metrics.yaml "9bc20993bc6960762ed281201e9cff437a88ca6c", # Missing ipc/ipdl/metrics.yaml "3f6ba0d4adbdf9d3e81b7047ff4c21384abbd234", # Missing dom/base/use_counter_metrics.yaml "1a7724cfd6b3cce2c599e323afb14f31430e5acd", # Missing dom/base/use_counter_metrics.yaml "02731904bba2c2f4e1c043e45a492bb21b33a930", # Missing security/manager/ssl/metrics.yaml "b16c6e1f04e563c916fb43b62661fdc0d354a925", # Missing security/manager/ssl/metrics.yaml # Missing toolkit/components/reportbrokensite/metrics.yaml "42acdc9cd5ae89222bdceeeaed7bacac755be48f", # Missing toolkit/components/reportbrokensite/metrics.yaml "c76093316c58ae74a21e854b8035c91d0c75df6e", # Missing toolkit/components/translations/metrics.yaml "b80d1b362960cef8ee389ed54cdc41702ca832d9", # Broken yaml in toolkit/components/translations/metrics.yaml, fixed in subsequent commit "3ac10c73a280b1f9bba82bb08082db7bcfd5d2de", "01a75161fac9acfc5a603bc2256245e914591e5e", # Missing dom/security/metrics.yaml "cdb47e79cd499b67d5de2804cbfb70eb2ab29796", # Missing parser/html/metrics.yaml "ed40307b32b221322505a86ebd33a322c64820bb", # Missing security/ct/metrics.yaml # Missing toolkit/components/antitracking/bouncetrackingprotection/metrics.yaml "32aceda20e3960fae23b3959be179693ec825599", # Missing toolkit/components/antitracking/bouncetrackingprotection/metrics.yaml "189fed694934b8cde47c83fa9fb56ae76b93092c", # Missing toolkit/components/antitracking/imageinputmetadatastripper/metrics.yaml "3b9744aaa5694b1c633acb0d0ea1fe8ec31c9d28", # Missing toolkit/components/reader/metrics.yaml "5bd2d84327d9385a4f4a0fbc4f55e4e0a0302bb2", "abbfb0e92e37e68d008ba0af29dbe199651fd2f3", # Missing toolkit/profile/metrics.yaml # Missing toolkit/components/antitracking/bouncetrackingprotection/metrics.yaml "84748d4bd6523268d905b0bc78cc7773a37bbca9", # Missing toolkit/components/antitracking/bouncetrackingprotection/metrics.yaml "7b49203aee2818b96242b4746fed722844619760", # Missing toolkit/components/resistfingerprinting/pings.yaml "de714a36bce1431b1332b52c48106fedb2d4142a", # Missing toolkit/components/resistfingerprinting/pings.yaml "2df76493a78a6cc21c37b699fa4ae3eb91f87218", # Missing toolkit/components/captchadetection/metrics.yaml "1fc70947ff229c30ba1b5c54f884a503af13ccda", # pdfjs metrics missing `description`s "35ed43c92f51ee67e66f2a8d2814d57acbb4063e", ], "firefox-desktop": [ "c5d5f045aaba41933622b5a187c39da0d6ab5d80", # Missing toolkit/components/glean/tags.yaml "3e81d4efd88a83e89da56b690f39ca2a78623810", # No browser/components/newtab/metrics.yaml "d556b247aaec64b3ab6a033d40f2022f1213101e", # No toolkit/components/nimbus/metrics.yaml "d1d0b69871e3d38ead989d73f30563a501a448b6", # No toolkit/components/nimbus/metrics.yaml "642be079c4465445ab42b55d18e0a4d644c19c36", # No toolkit/components/telemetry/pings.yaml # Missing toolkit/components/telemetry/dap/metrics.yaml "c5c002f81f08a73e04868e0c2bf0eb113f200b03", # Missing browser/components/backup/metrics.yaml "4d4322e829aa7ba8a4abd00fca0dcd3b10e127a3", # Missing browser/components/privatebrowsing/metrics.yaml "47da40cec7bb1235bd9dc597a26f7b69b48fc2a7", # Missing dom/media/platforms/wmf/metrics.yaml "41edcdf7fe44678c5913a603a286b1fc3979d540", # Missing toolkit/components/contentrelevancy/metrics.yaml "856ef9e3e5132cf536dc5662e220c0e0e5127a7e", # Missing toolkit/components/contentrelevancy/metrics.yaml "c7f67706fcdac6a6198d8867cb102546213dbaf8", # Missing toolkit/components/places/metrics.yaml "bc739eb4ae15600f5eb668a060de8732e34e7e26", # Missing toolkit/components/shopping/metrics.yaml "f03abd1c7bf9f721afd0df7e36023f4ea925afd2", "c9bbde88a4e816950372d1647827491902f62af4", # Missing widget/windows/metrics.yaml "21001e9ab793daf750ad988ce86cc7eefd29b856", # Missing toolkit/components/nimbus/pings.yaml "514742c4bda3c0a5ea5c631029929efa8fd6f855", # Missing toolkit/components/nimbus/pings.yaml # Missing toolkit/components/reportbrokensite/metrics.yaml "42acdc9cd5ae89222bdceeeaed7bacac755be48f", # Missing toolkit/components/reportbrokensite/metrics.yaml "c76093316c58ae74a21e854b8035c91d0c75df6e", ], "firefox-desktop-background-update": [ "c5d5f045aaba41933622b5a187c39da0d6ab5d80", # Missing toolkit/components/glean/tags.yaml ], "firefox-desktop-background-tasks": [ # Missing toolkit/components/backgroundtasks/metrics.yaml "0caa2f1940d744d1154f47c242bc5c119cf453f8", ], "firefox-translations": [ # Invalid extension/model/telemetry/metrics.yaml "02dc27b663178746499d092a987ec08c026ee560", ], "pine": [ "c5d5f045aaba41933622b5a187c39da0d6ab5d80", # Missing toolkit/components/glean/tags.yaml "3e81d4efd88a83e89da56b690f39ca2a78623810", # No browser/components/newtab/metrics.yaml "642be079c4465445ab42b55d18e0a4d644c19c36", # No toolkit/components/telemetry/pings.yaml ], "rally-core": [ "4df4dc23317e155bf1b605d04b466c27d78537fa", # Missing web-platform/glean/metrics.yaml "69559324f775b79c9a39c6a95fdb3657c184ed0e", # Bug 1769579 omit deleted onboarding ping "f633df7676b6ef64e496fea1b3687eff22680d49", # Missing web-platform/glean/pings.yaml ], "rally-attention-stream": [ "9fd0b2aeb82ca37f817dcda51bd2f34b6925b487", # `bugs`/`data_reviews` is not of type `string` "a3dacb30e198c5c19159678c6617064cf4ae1d77", # Bug 1783960 omit deleted meta-pixel ping ], "support-migration": [ "2e05b2b7d775ea726e035a7a7f16d889d63fc09a", # No components/support/migration/metrics.yaml ], "viu-politica": [ "e41967f92f40dd36729939cf67bcf680352ec1a4", # Removed all data collection ], "moso-mastodon-backend": [ "cd5c69456d88b7023366fd50806855086a039dba", # No .glean/metrics.yaml ], "tiktokreporter-android": [ "96bf78fbde4dc1eddd8fc7de175d6c58fe82e23e", # Improperly named metric ], "accounts-backend": [ "095b4e47cebaa8a2ca54d1d496814f0620dcf8b1", # Wrong schema spec used ], "glean-server-metrics-compat": [ "6fe8a8f8a4026f389a8f697669d56673e0817a29", # Wrong schema spec used ], } def _file_in_commit(repo: git.Repo, filename: Path, ref: str) -> bool: # adapted from https://stackoverflow.com/a/25961128 subtree = repo.commit(ref).tree for path_element in filename.parts[:-1]: try: subtree = subtree[path_element] except KeyError: return False # subdirectory not in tree return str(filename) in subtree @dataclass(eq=True, frozen=True) class Commit: hash: str # only compare hash when checking if commits are equal timestamp: int = field(compare=False) # Since commits from the same PR may have the same timestamp, we also record # an index representing its position in the git log so the correct ordering # of commits can be preserved. reflog_index: int = field(compare=False) is_head: bool = field(compare=False) def sort_key(self) -> Tuple[int, int]: # git log returns newest commits first, so use negative reflog_index return self.timestamp, -self.reflog_index @cached_property def pretty_timestamp(self): return datetime.utcfromtimestamp(self.timestamp).isoformat(" ") def get_commits( repo: git.Repo, filename: Path, ref: str, only_ref: bool = False, deprecated: bool = False, branch_head_hash: Optional[str] = None, ) -> Iterable[Commit]: sep = ":" log_format = f"%H{sep}%ct" commits = set() if not only_ref: # include "--" to prevent error for filename not in current tree log = repo.git.log(ref, "--", filename, format=log_format) # filter out empty strings change_commits = filter(None, log.split("\n")) commits |= set(enumerate(change_commits)) if (only_ref and not deprecated) or _file_in_commit(repo, filename, ref): # include ref when it contains filename log = repo.git.log(ref, format=log_format, max_count=1) # filter out empty strings change_commits = filter(None, log.split("\n")) commits |= set(enumerate(change_commits)) # Store the index in the ref-log as well as the timestamp, so that the # ordering of commits will be deterministic and always in the correct # order. for reflog_index, entry in commits: hash_, timestamp = entry.split(sep) yield Commit( hash=hash_, timestamp=int(timestamp), reflog_index=reflog_index, is_head=hash_ == branch_head_hash, ) def get_file_at_hash(repo: git.Repo, _hash: str, filename: Path) -> str: return repo.git.show(f"{_hash}:{filename}") def utc_timestamp(d: datetime) -> float: # See https://docs.python.org/3/library/datetime.html#datetime.datetime.timestamp # for why we're calculating this UTC timestamp explicitly return (d - datetime(1970, 1, 1)) / timedelta(seconds=1) def retrieve_files( repo_info: Repository, cache_dir: Path, glean_commit: Optional[str] = None, glean_commit_branch: Optional[str] = None, limit_date: Optional[date] = None, ) -> Tuple[Dict[Commit, List[Path]], bool]: commits = defaultdict(list) base_path = cache_dir / repo_info.name org_name, repo_name = repo_info.url.rstrip("/").split("/")[-2:] repo_path = cache_dir / org_name / f"{repo_name}.git" min_date = None if repo_info.name in MIN_DATES: min_date = utc_timestamp(datetime.fromisoformat(MIN_DATES[repo_info.name])) skip_commits = SKIP_COMMITS.get(repo_info.name, []) if repo_path.exists(): print(f"Pulling commits into {repo_path}") repo = git.Repo(repo_path) actual_urls = set(repo.remote("origin").urls) if actual_urls != {repo_info.url}: raise Exception( f"invalid cache: git repo at {repo_path} should be for " f"{repo_info.url} but got {actual_urls}" ) else: print(f"Cloning {repo_info.url} into {repo_path}") repo = git.Repo.clone_from( repo_info.url, repo_path, bare=True, depth=1 if glean_commit or limit_date else None, ) repo_is_shallow = repo.git.rev_parse(is_shallow_repository=True) == "true" branch = repo_info.branch or repo.active_branch if glean_commit is None: if limit_date is not None: shallow_since = utc_timestamp(datetime.combine(limit_date, time.min)) try: repo.git.fetch( "origin", f"{branch}:{branch}", force=True, shallow_since=shallow_since, ) except git.GitCommandError as e: if any( log in e.stderr for log in ( # github error "\n stderr: 'fatal: error processing shallow info: 4'", # local git dir error "\n stderr: 'fatal: no commits selected for shallow requests\n", ) ): # no commits, don't upload return {}, False raise else: repo.git.fetch( "origin", f"{branch}:{branch}", force=True, unshallow=repo_is_shallow, ) # pass ref around to avoid updating repo.active_branch, so that it # can be preserved for other glean repos with the same git url ref = f"refs/heads/{branch}" branch_head_hash = repo.commit(ref).hexsha upload_repo = True elif GIT_HASH_PATTERN.fullmatch(glean_commit) is None: raise ProbeScraperInvalidRequest( f"commit must be full length git hash, but got {glean_commit!r}" ) else: repo.git.fetch( "origin", glean_commit, force=True, depth=1 if repo_is_shallow else None ) ref = glean_commit upload_repo = str(branch) == glean_commit_branch # When commit_branch is the branch for this repo, verify that commit_hash is on that branch if upload_repo: print(f"Verifying that {glean_commit} is in {branch}") # doesn't change depth repo.git.fetch("origin", f"{branch}:{branch}", force=True) branch_ref = f"refs/heads/{branch}" branch_head_hash = repo.commit(branch_ref).hexsha if glean_commit != branch_head_hash: if repo_is_shallow: repo.git.fetch( "origin", f"{branch}:{branch}", force=True, unshallow=True ) try: # when commit != branch, check if it's in the history for branch repo.git.merge_base(glean_commit, branch_ref, is_ancestor=True) except git.GitCommandError: raise ProbeScraperInvalidRequest( f"Commit {glean_commit} not found in branch {branch} of {repo_info.url}" ) else: branch_head_hash = None for rel_path in map(Path, repo_info.get_change_files()): for commit in get_commits( repo, rel_path, ref, only_ref=glean_commit is not None, deprecated=repo_info.deprecated, branch_head_hash=branch_head_hash, ): if min_date and commit.timestamp < min_date: continue if commit.hash in skip_commits: continue probe_file = base_path / commit.hash / rel_path if not probe_file.exists(): try: contents = get_file_at_hash(repo, commit.hash, rel_path) except git.GitCommandError as e: if "does not exist" in str(e): raise ProbeScraperInvalidRequest( f"{rel_path} not found in commit {commit.hash} for {repo_info.app_id}" ) raise probe_file.parent.mkdir(parents=True, exist_ok=True) probe_file.write_bytes(contents.encode("UTF-8")) commits[commit].append(probe_file) return commits, upload_repo def scrape( folder: Optional[Path] = None, repos: Optional[List[Repository]] = None, glean_commit: Optional[str] = None, glean_commit_branch: Optional[str] = None, limit_date: Optional[date] = None, ) -> Tuple[ Dict[str, Dict[Commit, List[Path]]], Dict[str, Dict[str, List[Union[Dict[str, str], str]]]], List[str], ]: """ Returns three data structures. The first is commits_by_repo: { <repo-name>: { <Commit>: [<path>, ...] } } The second is emails: { <repo-name>: { "addresses": [<email>, ...]. "emails": [ { "subject": <str>, "message": <str>, }, ] }, } The third is the names of repos that are authorized to be uploaded, based on whether commit_branch matches the configured branch for that repo. When commit is not None but commit_branch is None, this is empty. When commit and commit_branch are both None, this includes all repos: [<repo-name>, ...] Raises InvalidCommitError when commit is not None or a 40 character hex sha. Also raises InvalidCommitError when commit and commit_branch are both specified and commit_branch matches the configured branch for a repo and commit is not part of the history of commit_branch for that repo. This ensures that return values correctly indicate repos where commits are authorized to be uploaded. """ if folder is None: folder = Path(tempfile.mkdtemp()) commits_by_repo = {} emails = {} upload_repos = [] for repo_info in repos: print("Getting commits for repository " + repo_info.name) commits_by_repo[repo_info.name] = {} emails[repo_info.name] = { "addresses": repo_info.notification_emails, "emails": [], } if not ( repo_info.metrics_file_paths or repo_info.ping_file_paths or repo_info.tag_file_paths ): print( f"Skipping commits for repository {repo_info.name}" " because it has no metrics/ping/tag files." ) continue try: commits, upload_repo = retrieve_files( repo_info, folder, glean_commit, glean_commit_branch, limit_date, ) print(" Got {} commits".format(len(commits))) commits_by_repo[repo_info.name] = commits if upload_repo: upload_repos.append(repo_info.name) except Exception: raise emails[repo_info.name]["emails"].append( { "subject": "Probe Scraper: Failed Probe Import", "message": traceback.format_exc(), } ) return commits_by_repo, emails, upload_repos