gh-data.py:
#!/usr/bin/env python
# Based on https://github.com/WebKit/standards-positions/blob/main/summary.py
import json
import os
import re
import sys

import requests

# Retrieve the token from environment variables
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"} if token else {}
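
# Typical invocation (the token is optional, but unauthenticated requests are
# subject to a much lower GitHub API rate limit):
#   GITHUB_TOKEN=<token> python gh-data.py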

# Utilities
def write_json(filename, data):
    with open(filename, "w") as f:
        json.dump(data, f, indent=2, separators=(",", ": "))
        f.write("\n")
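
# Each summary item combines the issue number, the label-derived fields, and
# URLs parsed from the issue body, e.g. (illustrative values):
#   {"issue": 42, "position": "positive", "venues": ["W3C"], "concerns": [],
#    "topics": [], "title": "CSS Example Feature", "url": "https://example.com/spec",
#    "explainer": None, "mdn": None, "caniuse": None, "bug": None, "webkit": None}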

# General processing
def process(issues):
    summary = []
    for issue in issues:
        if is_ignorable_issue(issue):
            continue
        # The issue number is the last path segment of the issue's URL
        summary_item = {"issue": int(issue["html_url"][issue["html_url"].rfind("/") + 1 :])}
        summary_item.update(process_labels(issue["labels"]))
        summary_item.update(process_body(issue))
        # Strip "request for (mozilla) position (on)" boilerplate from the title
        summary_item["title"] = re.sub(
            r"(request for (mozilla )?position|rfp)( ?:| on) ",
            "",
            issue["title"],
            flags=re.IGNORECASE,
        )
        summary.append(summary_item)
    write_json("gh-data-summary.json", summary)
    print("Done: gh-data-summary.json.")
def is_ignorable_issue(issue):
    if "pull_request" in issue:
        return True
    for label in issue["labels"]:
        if label["name"] in ("duplicate", "invalid", "tooling", "proposal withdrawn"):
            return True
    return False
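
# For illustration, a label set like ["position: positive", "venue: W3C",
# "topic: privacy"] would yield:
#   {"position": "positive", "venues": ["W3C"], "concerns": [], "topics": ["privacy"]}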
def process_labels(labels):
    position = None
    venues = []
    concerns = []
    topics = []
    for label in labels:
        # Position
        if label["name"].startswith("position: "):
            position = label["name"].split(": ")[1]
        # Venue
        elif label["name"].startswith("venue: "):
            venues.append(label["name"].split(": ")[1])
        # Concerns
        elif label["name"].startswith("concerns: "):
            concerns.append(label["name"].split(": ")[1])
        # Topics
        elif label["name"].startswith("topic: "):
            topics.append(label["name"].split(": ")[1])
    return {
        "position": position,
        "venues": venues,
        "concerns": concerns,
        "topics": topics,
    }
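
# For example (hypothetical input), get_url("[spec](https://example.com/)")
# returns "https://example.com/"; a bare URL with a trailing comma is cleaned up too.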
def get_url(text):
    # Extract the first URL (possibly inside a Markdown link) and strip a trailing comma
    m = re.search(r"\b(https?://[^\)\s]+)", text)
    if m:
        url = m.group()
        if url.endswith(","):
            url = url[:-1]
        return url
    return ""

def process_body(issue):
    # The body can be null in the API response
    lines = (issue["body"] or "").splitlines()
    body = {
        "title": None,
        "url": None,
        "explainer": None,
        "mdn": None,
        "caniuse": None,
        "bug": None,
        "webkit": None,
    }
    legacy_mapping = {
        # "specification title": "title",  # Always use the issue title
        "specification or proposal url (if available)": "url",
        "specification or proposal url": "url",
        "explainer url (if available)": "explainer",
        "explainer url": "explainer",
        "mdn url (optional)": "mdn",
        "caniuse.com url (optional)": "caniuse",
        "caniuse.com url": "caniuse",
        "bugzilla url (optional)": "bug",
        "bugzilla url": "bug",
        "webkit standards-position": "webkit",
    }
    yaml_mapping = {
        # Specification title
        "Specification or proposal URL (if available)": "url",
        "Explainer URL (if available)": "explainer",
        "MDN URL": "mdn",
        "Caniuse.com URL": "caniuse",
        "Bugzilla URL": "bug",
        "WebKit standards-position": "webkit",
    }
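    # Legacy bodies use bullet lines such as "* Specification or proposal URL: <url>";
    # YAML-form bodies have a "### <field>" heading with the response on a later
    # line (or "_No response_"). Illustrative, based on the mappings above.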
    # Legacy issues (before #1175) using ISSUE_TEMPLATE.md
    if issue["number"] < 1175:
        for line in lines:
            if line == "### Other information":
                break
            for title, key in legacy_mapping.items():
                text_title = f"* {title}: "
                if line.lower().startswith(text_title):
                    value = line[len(text_title) :].strip()
                    if key in ("url", "explainer", "mdn", "caniuse", "bug", "webkit"):
                        value = get_url(value)
                    if value != "" and value.lower() != "n/a":
                        body[key] = value
                    break
    # Later issues using the YAML issue form
    else:
        expect_response = None
        skip = False
        for line in lines:
            if line == "### Other information":
                break
            for title, key in yaml_mapping.items():
                text_title = f"### {title}"
                if line == text_title:
                    expect_response = key
                    skip = True
                    break
            if skip:
                skip = False
                continue
            if expect_response:
                value = line.strip()
                if expect_response in ("url", "explainer", "mdn", "caniuse", "bug", "webkit"):
                    value = get_url(value)
                if value and value != "_No response_" and value.lower() != "n/a":
                    body[expect_response] = value
                    expect_response = None
    return body

# Setup
def main():
    # Update: fetch all issues, 100 per page
    data = []
    page = 1
    while True:
        try:
            print(f"Fetching page {page}...")
            response = requests.get(
                f"https://api.github.com/repos/mozilla/standards-positions/issues?direction=asc&state=all&per_page=100&page={page}",
                headers=headers,
                timeout=5,
            )
            response.raise_for_status()
        except requests.RequestException:
            print("Update failed: network failure or request timed out.")
            sys.exit(1)
        temp_data = response.json()
        if not temp_data:
            print("No more issues to fetch.")
            break
        data.extend(temp_data)
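        # GitHub advertises further pages in the "Link" response header, e.g.
        #   <https://api.github.com/...&page=2>; rel="next"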
        # Stop once no rel="next" page remains
        link_header = response.headers.get("link", "")
        if 'rel="next"' not in link_header:
            break
        page += 1
    write_json("gh-data.json", data)
    print("Done: gh-data.json.")

    # Process the fetched data into a summary
    if not os.path.exists("gh-data.json"):
        print("Sorry, you have to update first.")
        sys.exit(1)
    with open("gh-data.json", "rb") as f:
        data = json.load(f)
    process(data)


if __name__ == "__main__":
    main()