# analytics/github_analyze.py (306 lines of code) (raw):

#!/usr/bin/env python3
"""Analytics over a PyTorch git checkout and the GitHub REST API.

Supports monthly commit/revert statistics, contributor counts, CI-status
analysis of reverts, and cross-checking which milestone issues are missing
from a release branch.  GitHub API calls optionally authenticate via the
``GITHUB_TOKEN`` environment variable.
"""
from datetime import datetime, timedelta
from typing import Any, Dict, List, Iterable, Optional, Union
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import json
import enum
import os


class IssueState(enum.Enum):
    """GitHub issue/milestone state filter; ``str()`` yields the API query value."""
    OPEN = "open"
    CLOSED = "closed"
    ALL = "all"

    def __str__(self):
        return self.value


class GitCommit:
    """A single parsed git commit.

    ``commit_date`` is only populated when parsed from ``--format=fuller``
    output; otherwise it is None and callers should fall back to
    ``author_date``.
    """
    commit_hash: str
    title: str
    body: str
    author: str
    author_date: datetime
    commit_date: Optional[datetime]

    def __init__(self,
                 commit_hash: str,
                 author: str,
                 author_date: datetime,
                 title: str,
                 body: str,
                 commit_date: Optional[datetime] = None) -> None:
        self.commit_hash = commit_hash
        self.author = author
        self.author_date = author_date
        self.commit_date = commit_date
        self.title = title
        self.body = body

    def __contains__(self, item: Any) -> bool:
        # Substring search over title and body; used to find issue URLs
        # mentioned in commit messages.
        return item in self.body or item in self.title


def get_revert_revision(commit: GitCommit) -> Optional[str]:
    """Return the diff id (``Dnnn``) named in a ``Revert Dnnn:`` title, or None."""
    import re
    rc = re.match("Revert (D\\d+):", commit.title)
    if rc is None:
        return None
    return rc.group(1)


def get_diff_revision(commit: GitCommit) -> Optional[str]:
    """Return the ``Differential Revision: Dnnn`` id from the commit body, or None."""
    import re
    rc = re.search("\\s*Differential Revision: (D\\d+)", commit.body)
    if rc is None:
        return None
    return rc.group(1)


def is_revert(commit: GitCommit) -> bool:
    """True when the commit's title marks it as a revert."""
    return get_revert_revision(commit) is not None


def parse_medium_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=medium --date=unix` format, i.e.:
        commit <sha1>
        Author: <author>
        Date:   <author date>

        <title line>

        <full commit message>

    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        del lines[1]
    assert len(lines) > 5
    assert lines[0].startswith("commit")
    assert lines[1].startswith("Author: ")
    assert lines[2].startswith("Date: ")
    assert len(lines[3]) == 0
    return GitCommit(commit_hash=lines[0].split()[1].strip(),
                     author=lines[1].split(":", 1)[1].strip(),
                     author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
                     title=lines[4].strip(),
                     body="\n".join(lines[5:]),
                     )


def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit:
    """
    Expect commit message generated using `--format=fuller --date=unix` format, i.e.:
        commit <sha1>
        Author:     <author>
        AuthorDate: <author date>
        Commit:     <committer>
        CommitDate: <committer date>

        <title line>

        <full commit message>

    """
    if isinstance(lines, str):
        lines = lines.split("\n")
    # TODO: Handle merge commits correctly
    if len(lines) > 1 and lines[1].startswith("Merge:"):
        del lines[1]
    assert len(lines) > 7
    assert lines[0].startswith("commit")
    assert lines[1].startswith("Author: ")
    assert lines[2].startswith("AuthorDate: ")
    assert lines[3].startswith("Commit: ")
    assert lines[4].startswith("CommitDate: ")
    assert len(lines[5]) == 0
    return GitCommit(commit_hash=lines[0].split()[1].strip(),
                     author=lines[1].split(":", 1)[1].strip(),
                     author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
                     commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())),
                     title=lines[6].strip(),
                     body="\n".join(lines[7:]),
                     )


def _check_output(items: List[str], encoding='utf-8') -> str:
    """Run a command and return its decoded stdout; raises on non-zero exit."""
    from subprocess import check_output
    return check_output(items).decode(encoding)


def get_git_remotes(path: str) -> Dict[str, str]:
    """Map remote name -> remote URL for the git checkout at *path*."""
    keys = _check_output(["git", "-C", path, "remote"]).strip().split("\n")
    return {key: _check_output(["git", "-C", path, "remote", "get-url", key]).strip() for key in keys}


class GitRepo:
    """Thin wrapper around ``git log`` for a local checkout."""

    def __init__(self, path, remote='upstream'):
        self.repo_dir = path
        self.remote = remote

    def _run_git_log(self, revision_range) -> List[GitCommit]:
        """Return parsed commits (newest first) for *revision_range*."""
        log = _check_output(['git', '-C', self.repo_dir, 'log',
                             '--format=fuller', '--date=unix', revision_range, '--', '.']).split("\n")
        rc: List[GitCommit] = []
        cur_msg: List[str] = []
        for line in log:
            if line.startswith("commit"):
                # A new "commit <sha>" header finishes the previous message.
                if len(cur_msg) > 0:
                    rc.append(parse_fuller_format(cur_msg))
                    cur_msg = []
            cur_msg.append(line)
        if len(cur_msg) > 0:
            rc.append(parse_fuller_format(cur_msg))
        return rc

    def get_commit_list(self, from_ref, to_ref) -> List[GitCommit]:
        return self._run_git_log(f"{self.remote}/{from_ref}..{self.remote}/{to_ref}")


def build_commit_dict(commits: List[GitCommit]) -> Dict[str, GitCommit]:
    """Index commits by hash, asserting hashes are unique."""
    rc = {}
    for commit in commits:
        assert commit.commit_hash not in rc
        rc[commit.commit_hash] = commit
    return rc


def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> Any:
    """GET *url* (with optional query params) and return the parsed JSON.

    Returns a list for collection endpoints and a dict for single-object
    endpoints, hence the ``Any`` return type.  Prints a diagnostic and
    re-raises on HTTP 403 rate-limit responses.
    """
    headers = {'Accept': 'application/vnd.github.v3+json'}
    token = os.environ.get("GITHUB_TOKEN")
    # Only attach the token to the GitHub API host, never to other URLs.
    if token is not None and url.startswith('https://api.github.com/'):
        headers['Authorization'] = f'token {token}'
    if params is not None and len(params) > 0:
        url += '?' + '&'.join(f"{name}={val}" for name, val in params.items())
    try:
        with urlopen(Request(url, headers=headers)) as data:
            return json.load(data)
    except HTTPError as err:
        if err.code == 403 and all(key in err.headers for key in ['X-RateLimit-Limit', 'X-RateLimit-Used']):
            print(f"Rate limit exceeded: {err.headers['X-RateLimit-Used']}/{err.headers['X-RateLimit-Limit']}")
        raise


def fetch_multipage_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Fetch all pages of a paginated GitHub list endpoint into one list."""
    if params is None:
        params = {}
    assert "page" not in params
    page_idx, rc, prev_len, params = 1, [], -1, params.copy()
    # Keep fetching while each page still adds items.
    while len(rc) > prev_len:
        prev_len = len(rc)
        params["page"] = page_idx
        page_idx += 1
        rc += fetch_json(url, params)
    return rc


def gh_get_milestones(org='pytorch', project='pytorch', state: IssueState = IssueState.OPEN) -> List[Dict[str, Any]]:
    """List milestones for the repo in the given state."""
    url = f'https://api.github.com/repos/{org}/{project}/milestones'
    return fetch_multipage_json(url, {"state": state})


def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN):
    """List issues attached to a milestone (by milestone number)."""
    url = f'https://api.github.com/repos/{org}/{project}/issues'
    return fetch_multipage_json(url, {"milestone": milestone_idx, "state": state})


def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]:
    """Return the combined CI status for *ref*, merging all status pages."""
    url = f'https://api.github.com/repos/{org}/{project}/commits/{ref}/status'
    params = {"page": 1, "per_page": 100}
    nrc = rc = fetch_json(url, params)
    # A full page of 100 statuses suggests more pages may follow.
    while "statuses" in nrc and len(nrc["statuses"]) == 100:
        params["page"] += 1
        nrc = fetch_json(url, params)
        if "statuses" in nrc:
            rc["statuses"] += nrc["statuses"]
    return rc


def extract_statuses_map(json: Dict[str, Any]):
    """Map status context name -> state from a combined-status payload."""
    return {s["context"]: s["state"] for s in json["statuses"]}


class PeriodStats:
    """Aggregate commit statistics for one calendar month."""
    commits: int
    reverts: int
    authors: int
    date: datetime

    def __init__(self, date: datetime, commits: int, reverts: int, authors: int) -> None:
        self.date = date
        self.commits = commits
        self.reverts = reverts
        self.authors = authors


def get_monthly_stats(commits: List[GitCommit]) -> Iterable[PeriodStats]:
    """Yield per-month stats; *commits* is expected newest-first (git log order).

    NOTE(review): the final accumulated month (the oldest one) is never
    yielded — possibly deliberate since it is usually a partial month.
    Preserved as-is; confirm before changing.
    """
    y, m, total, reverts, authors = None, None, 0, 0, set()
    for commit in commits:
        # Prefer committer date; medium-format commits only have author date.
        commit_date = commit.commit_date if commit.commit_date is not None else commit.author_date
        if y != commit_date.year or m != commit_date.month:
            if y is not None:
                yield PeriodStats(datetime(y, m, 1), total, reverts, len(authors))
            y, m, total, reverts, authors = commit_date.year, commit_date.month, 0, 0, set()
        if is_revert(commit):
            reverts += 1
        total += 1
        authors.add(commit.author)


def print_monthly_stats(commits: List[GitCommit]) -> None:
    """Print a per-month table of commits, growth, reverts and author counts."""
    stats = list(get_monthly_stats(commits))
    for idx, stat in enumerate(stats):
        y = stat.date.year
        m = stat.date.month
        total, reverts, authors = stat.commits, stat.reverts, stat.authors
        reverts_ratio = 100.0 * reverts / total
        # stats is newest-first, so stats[idx + 1] is the previous month.
        if idx + 1 < len(stats):
            commits_growth = 100.0 * (stat.commits / stats[idx + 1].commits - 1)
        else:
            commits_growth = float('nan')
        print(f"{y}-{m:02d}: commits {total} ({commits_growth:+.1f}%) reverts {reverts} ({reverts_ratio:.1f}%) authors {authors}")


def analyze_reverts(commits: List[GitCommit]):
    """For every revert in *commits*, find the reverted commit and print CI
    status contexts that changed between the original and the revert.
    """
    for idx, commit in enumerate(commits):
        revert_id = get_revert_revision(commit)
        if revert_id is None:
            continue
        # Search up to 99 older commits for the matching Differential Revision.
        # (Fixed: the original indexed commits[idx + i] unconditionally, which
        # could raise IndexError near the end of the list and left orig_commit
        # bound to an unrelated commit when no match was found.)
        orig_commit = None
        for candidate in commits[idx + 1:idx + 100]:
            if get_diff_revision(candidate) == revert_id:
                orig_commit = candidate
                break
        if orig_commit is None:
            print(f"Failed to find original commit for {commit.title}")
            continue
        print(f"{commit.commit_hash} is a revert of {orig_commit.commit_hash}: {orig_commit.title}")
        revert_statuses = gh_get_ref_statuses("pytorch", "pytorch", commit.commit_hash)
        orig_statuses = gh_get_ref_statuses("pytorch", "pytorch", orig_commit.commit_hash)
        orig_sm = extract_statuses_map(orig_statuses)
        revert_sm = extract_statuses_map(revert_statuses)
        for k in revert_sm.keys():
            if k not in orig_sm:
                continue
            if orig_sm[k] != revert_sm[k]:
                print(f"{k} {orig_sm[k]}->{revert_sm[k]}")


def print_contributor_stats(commits, delta: Optional[timedelta] = None) -> None:
    """Print per-author commit counts over the trailing *delta* window.

    *commits* must be newest-first; iteration stops at the first commit
    older than the window.
    """
    authors: Dict[str, int] = {}
    now = datetime.now()
    # Default delta is one non-leap year
    if delta is None:
        delta = timedelta(days=365)
    for commit in commits:
        # Fixed: commit_date is Optional (None for medium-format parses);
        # fall back to author_date as get_monthly_stats does, instead of
        # raising TypeError on `now - None`.
        date = commit.commit_date if commit.commit_date is not None else commit.author_date
        author = commit.author
        if now - date > delta:
            break
        if author not in authors:
            authors[author] = 0
        authors[author] += 1
    print(f"{len(authors)} contributors made {sum(authors.values())} commits in last {delta.days} days")
    for count, author in sorted(((commit, author) for author, commit in authors.items()), reverse=True):
        print(f"{author}: {count}")


def commits_missing_in_branch(repo: GitRepo, branch: str, orig_branch: str, milestone_idx: int) -> None:
    """Print milestone issues that are not yet cherry-picked into *branch*.

    Output is a semicolon-separated table of URL;Title;Status.
    """
    def get_commits_dict(x, y):
        return build_commit_dict(repo.get_commit_list(x, y))
    master_commits = get_commits_dict(orig_branch, 'master')
    release_commits = get_commits_dict(orig_branch, branch)
    print(f"len(master_commits)={len(master_commits)}")
    print(f"len(release_commits)={len(release_commits)}")
    print("URL;Title;Status")
    for issue in gh_get_milestone_issues('pytorch', 'pytorch', milestone_idx, IssueState.ALL):
        html_url, state = issue["html_url"], issue["state"]
        # Skip closed states if they were landed before merge date
        if state == "closed":
            mentioned_after_cut = any(html_url in commit_message for commit_message in master_commits.values())
            # If issue is not mentioned after cut, that it must be present in release branch
            if not mentioned_after_cut:
                continue
        mentioned_in_release = any(html_url in commit_message for commit_message in release_commits.values())
        # if Issue is mentioned is release branch, than it was picked already
        if mentioned_in_release:
            continue
        print(f'{html_url};{issue["title"]};{state}')


def parse_arguments():
    """Parse command-line flags for the analytics modes."""
    from argparse import ArgumentParser
    parser = ArgumentParser(description="Print GitHub repo stats")
    parser.add_argument("--repo-path", type=str,
                        help="Path to PyTorch git checkout",
                        default=os.path.expanduser("~/git/pytorch/pytorch"))
    parser.add_argument("--milestone-id", type=str)
    parser.add_argument("--branch", type=str)
    parser.add_argument("--remote", type=str, help="Remote to base off of", default="")
    parser.add_argument("--analyze-reverts", action="store_true")
    parser.add_argument("--contributor-stats", action="store_true")
    parser.add_argument("--missing-in-branch", action="store_true")
    return parser.parse_args()


def main():
    """Entry point: dispatch to the mode selected on the command line."""
    import time
    args = parse_arguments()
    remote = args.remote
    if not remote:
        remotes = get_git_remotes(args.repo_path)
        # Pick best remote
        remote = next(iter(remotes.keys()))
        for key in remotes:
            if remotes[key].endswith('github.com/pytorch/pytorch'):
                remote = key
    repo = GitRepo(args.repo_path, remote)

    if args.missing_in_branch:
        # Use milestone idx or search it along milestone titles
        # (Fixed: int(None) raises TypeError, not ValueError, when
        # --milestone-id is omitted — catch both.)
        try:
            milestone_idx = int(args.milestone_id)
        except (TypeError, ValueError):
            milestone_idx = -1
            milestones = gh_get_milestones()
            for milestone in milestones:
                if milestone.get('title', '') == args.milestone_id:
                    milestone_idx = int(milestone.get('number', '-2'))
            if milestone_idx < 0:
                print(f'Could not find milestone {args.milestone_id}')
                return
        commits_missing_in_branch(repo, args.branch, f'orig/{args.branch}', milestone_idx)
        return

    print(f"Parsing git history with remote {remote}...", end='', flush=True)
    start_time = time.time()
    x = repo._run_git_log(f"{remote}/master")
    print(f"done in {time.time()-start_time:.1f} sec")
    if args.analyze_reverts:
        analyze_reverts(x)
    elif args.contributor_stats:
        print_contributor_stats(x)
    else:
        print_monthly_stats(x)


if __name__ == "__main__":
    main()