# scripts/internal/update_example_tables.py

"""Regenerate the example tables in the repository README files and docs.

Walks the ``examples/`` tree, reads the title/type front matter embedded in
each example's ``README.md`` or ``vertex-notebook.ipynb``, and rewrites the
tables in the top-level ``README.md``, ``docs/source/resources.mdx``, and the
per-service README files so they stay in sync with the tracked examples.
"""

import json
import os
import re
import subprocess
from collections import defaultdict


def get_tracked_files():
    """Return the set of file paths currently tracked by git.

    Raises:
        subprocess.CalledProcessError: if ``git ls-files`` fails.  ``check=True``
            is used deliberately — a silent failure would yield an empty set and
            drop every example from the generated tables.
    """
    result = subprocess.run(
        ["git", "ls-files"], capture_output=True, text=True, check=True
    )
    return set(result.stdout.splitlines())


def extract_info_from_md(file_path):
    """Extract ``(title, type)`` from a Markdown front-matter block.

    Returns ``(None, None)`` when the ``--- title: ... type: ... ---`` block
    is absent.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    match = re.search(r"---\s*title:\s*(.*?)\s*type:\s*(.*?)\s*---", content, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return None, None


def extract_info_from_ipynb(file_path):
    """Extract ``(title, type)`` from a notebook's first markdown cell.

    The metadata is expected inside an HTML comment
    (``<!-- --- title: ... type: ... --- -->``).  Returns ``(None, None)``
    when the notebook is empty or carries no metadata.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        notebook = json.load(f)
    cells = notebook.get("cells") or []
    if not cells:
        # An empty notebook has no first cell to inspect.
        return None, None
    first_cell = cells[0]
    if first_cell["cell_type"] == "markdown":
        content = "".join(first_cell["source"])
        match = re.search(
            r"<!--\s*---\s*title:\s*(.*?)\s*type:\s*(.*?)\s*---\s*-->",
            content,
            re.DOTALL,
        )
        if match:
            return match.group(1).strip(), match.group(2).strip()
    return None, None


def get_service(path):
    """Map an example path to its Google Cloud service name, or ``None``."""
    if "gke" in path:
        return "GKE"
    elif "vertex-ai" in path:
        return "Vertex AI"
    elif "cloud-run" in path:
        return "Cloud Run"
    return None


def generate_tables():
    """Collect tracked examples as ``{service: {type: [(dir_path, title)]}}``."""
    examples = defaultdict(lambda: defaultdict(list))
    root_dir = "examples"
    tracked_files = get_tracked_files()
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename in ["README.md", "vertex-notebook.ipynb"]:
                file_path = os.path.join(dirpath, filename)
                relative_path = os.path.relpath(file_path, start=".")
                # Skip untracked files (e.g. local work in progress).
                if relative_path not in tracked_files:
                    continue
                dir_path = os.path.dirname(relative_path)
                if filename.endswith(".md"):
                    title, example_type = extract_info_from_md(file_path)
                elif filename.endswith(".ipynb"):
                    title, example_type = extract_info_from_ipynb(file_path)
                else:
                    # Unreachable given the whitelist above, but keeps the
                    # names bound if that whitelist ever grows.
                    title, example_type = None, None
                if title and example_type:
                    service = get_service(relative_path)
                    if service:
                        examples[service][example_type].append((dir_path, title))
    return examples


def update_readme(examples):
    """Rewrite the per-type example tables in the top-level README.md."""
    with open("README.md", "r", encoding="utf-8") as f:
        content = f.read()
    ordered_services = ["Vertex AI", "GKE", "Cloud Run"]
    for example_type in ["training", "inference", "evaluation"]:
        table_rows = []
        for service in ordered_services:
            if examples[service].get(example_type):
                for path, title in sorted(
                    examples[service][example_type], key=lambda x: x[1]
                ):
                    # Format the path to include 'examples/<service>'
                    table_rows.append(
                        (
                            service,
                            f"[{path}](./{path})",
                            title,
                        )
                    )
        if table_rows:
            table = format_table(["Service", "Example", "Title"], table_rows)
            # Replace everything between this section heading and the next
            # "###" heading (or end of file).
            pattern = (
                rf"(### {example_type.capitalize()} Examples\n\n)[\s\S]*?(\n\n###|\Z)"
            )
            replacement = rf"\1{table}\2"
            content = re.sub(pattern, replacement, content, flags=re.DOTALL)
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(content.rstrip() + "\n")


def update_docs(examples):
    """Rewrite the Examples section of docs/source/resources.mdx."""
    with open("docs/source/resources.mdx", "r", encoding="utf-8") as f:
        content = f.read()
    new_content = []
    ordered_services = ["Vertex AI", "GKE", "Cloud Run"]
    ordered_types = ["inference", "training", "evaluation"]
    for service in ordered_services:
        # Cloud Run support is still in preview; label it as such in docs.
        service_name = f"(Preview) {service}" if service == "Cloud Run" else service
        new_content.append(f"\n### {service_name}\n")
        for example_type in ordered_types:
            if examples[service].get(example_type):
                new_content.append(f"\n- {example_type.capitalize()}\n\n")
                for path, title in sorted(
                    examples[service][example_type], key=lambda x: x[1]
                ):
                    github_url = f"https://github.com/huggingface/Google-Cloud-Containers/tree/main/{path}"
                    new_content.append(f"  - [{title}]({github_url})\n")
    new_examples_content = "".join(new_content)
    # Replace the Examples section in the original content
    pattern = r"(## Examples\n\n- \[All examples\].*?\n)[\s\S]*"
    updated_content = re.sub(
        pattern, rf"\1{new_examples_content}", content, flags=re.DOTALL
    )
    with open("docs/source/resources.mdx", "w", encoding="utf-8") as f:
        f.write(updated_content)


def update_cloud_run_examples(examples):
    """Rewrite the tables in examples/cloud-run/README.md."""
    file_path = "examples/cloud-run/README.md"
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Update Inference Examples
    inference_examples = examples.get("Cloud Run", {}).get("inference", [])
    inference_table = format_table(
        ["Example", "Title"],
        [
            (f"[{os.path.basename(path)}](./{os.path.basename(path)})", title)
            for path, title in sorted(inference_examples, key=lambda x: x[1])
        ],
    )
    inference_pattern = r"(## Inference Examples\n\n)[\s\S]*?(\n\n## Training Examples)"
    inference_replacement = rf"\1{inference_table}\2"
    content = re.sub(inference_pattern, inference_replacement, content, flags=re.DOTALL)
    # Update Training Examples
    training_pattern = r"(## Training Examples\n\n)[\s\S]*"
    training_replacement = r"\1Coming soon!"
    content = re.sub(training_pattern, training_replacement, content, flags=re.DOTALL)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)


def update_gke_examples(examples):
    """Rewrite the Training and Inference tables in examples/gke/README.md."""
    file_path = "examples/gke/README.md"
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    for example_type in ["Training", "Inference"]:
        examples_list = examples.get("GKE", {}).get(example_type.lower(), [])
        pattern = rf"(## {example_type} Examples\n\n)[\s\S]*?(\n\n##|\Z)"
        if examples_list:
            # Sort examples alphabetically by their basename
            sorted_examples = sorted(
                examples_list, key=lambda x: os.path.basename(x[0])
            )
            table = format_table(
                ["Example", "Title"],
                [
                    (f"[{os.path.basename(path)}](./{os.path.basename(path)})", title)
                    for path, title in sorted_examples
                ],
            )
            replacement = rf"\1{table}\2"
        else:
            replacement = rf"\1No {example_type.lower()} examples available yet.\2"
        content = re.sub(pattern, replacement, content, flags=re.DOTALL)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content.rstrip() + "\n")


def update_vertex_ai_examples(examples):
    """Rebuild examples/vertex-ai/README.md below the '## Notebooks' heading."""
    file_path = "examples/vertex-ai/README.md"
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    new_content = []
    # Keep everything up to and including the '## Notebooks' heading; the
    # rest of the file is regenerated from scratch.
    for line in content.split("\n"):
        if line.startswith("## Notebooks"):
            new_content.append(line)
            break
        new_content.append(line)
    for example_type in ["Training", "Inference", "Evaluation"]:
        examples_list = examples.get("Vertex AI", {}).get(example_type.lower(), [])
        new_content.append(f"\n### {example_type} Examples\n")
        if examples_list:
            table = format_table(
                ["Example", "Title"],
                [
                    (
                        f"[notebooks/{os.path.basename(path)}](./notebooks/{os.path.basename(path)})",
                        title,
                    )
                    for path, title in sorted(examples_list, key=lambda x: x[1])
                ],
            )
            new_content.append(table)
        else:
            new_content.append("Coming soon!")
    # Handle Pipelines section
    new_content.append("\n## Pipelines\n")
    pipeline_examples = examples.get("Vertex AI", {}).get("pipeline", [])
    if pipeline_examples:
        table = format_table(
            ["Example", "Title"],
            [
                (
                    f"[pipelines/{os.path.basename(path)}](./pipelines/{os.path.basename(path)})",
                    title,
                )
                for path, title in sorted(pipeline_examples, key=lambda x: x[1])
            ],
        )
        new_content.append(table)
    else:
        new_content.append("Coming soon!")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(new_content).strip())


def format_table(headers, rows):
    """Render ``headers`` and ``rows`` as a padded Markdown table string.

    Every column is left-padded to the width of its widest cell so the raw
    Markdown stays readable; the rendered output is unaffected by padding.
    """
    col_widths = [len(h) for h in headers]
    for row in rows:
        for i, cell in enumerate(row):
            col_widths[i] = max(col_widths[i], len(cell))
    header = "| " + " | ".join(f"{h:<{w}}" for h, w in zip(headers, col_widths)) + " |"
    separator = "| " + " | ".join("-" * w for w in col_widths) + " |"
    body = [
        "| " + " | ".join(f"{cell:<{w}}" for cell, w in zip(row, col_widths)) + " |"
        for row in rows
    ]
    return "\n".join([header, separator] + body)


if __name__ == "__main__":
    examples = generate_tables()
    update_readme(examples)
    update_docs(examples)
    update_cloud_run_examples(examples)
    update_gke_examples(examples)
    update_vertex_ai_examples(examples)
    print(
        "README.md, docs/source/resources.mdx, and example README files have been updated."
    )