utils/generate_notebooks.py (237 lines of code) (raw):

import argparse
import os
import re
import shutil
from pathlib import Path

import nbformat
import yaml

# Line-level markers of the framework-conditional blocks used in the MDX files.
re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
re_framework_else = re.compile(r"^{:else}\s*$")
re_framework_end = re.compile(r"^{/if}\s*$")

re_html_line = re.compile(r"^<[^>]*/>\s*$")
re_html_tag = re.compile(r"<([^/>]*)>\s*$")

# Fences delimiting Python code blocks, their expected output, and block ends.
re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$")
re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$")
re_end_code = re.compile(r"^```\s*$")

frameworks = {"pt": "PyTorch", "tf": "TensorFlow"}

PATH_TO_COURSE = Path("chapters/")
# Languages to exclude from the notebook generation because the notebooks were
# created manually
LANGS_TO_EXCLUDE = ["fr"]

# Sections (as "<chapter folder>/<file stem>[_<framework>]") whose notebook
# needs extra setup on top of the default install cell.
_SECTIONS_WITH_ACCELERATE = frozenset(
    {
        "chapter3/3",  # "Fine-tuning a model with the Trainer API or Keras",
        "chapter3/4",  # "A full training",
        "chapter7/2_pt",  # "Token classification (PyTorch)",
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/5_pt",  # "Summarization (PyTorch)",
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
    }
)
_SECTIONS_WITH_HF_HUB = frozenset(
    {
        "chapter4/3_pt",  # "Sharing pretrained models (PyTorch)"
        "chapter4/3_tf",  # "Sharing pretrained models (TensorFlow)"
        "chapter5/5",  # "Creating your own dataset"
        "chapter7/2_pt",  # "Token classification (PyTorch)"
        "chapter7/2_tf",  # "Token classification (TensorFlow)"
        "chapter6/2",  # "Training a new tokenizer from an old one"
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/3_tf",  # "Fine-tuning a masked language model (TensorFlow)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/4_tf",  # "Translation (TensorFlow)"
        "chapter7/5_pt",  # "Summarization (PyTorch)"
        "chapter7/5_tf",  # "Summarization (TensorFlow)"
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/6_tf",  # "Training a causal language model from scratch (TensorFlow)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
        "chapter7/7_tf",  # "Question answering (TensorFlow)"
        "chapter8/2",  # "What to do when you get an error"
    }
)
_SECTIONS_WITH_FAISS = frozenset(
    {
        "chapter5/6_pt",  # "Semantic search with FAISS (PyTorch)"
        "chapter5/6_tf",  # "Semantic search with FAISS (TensorFlow)"
    }
)
_SECTIONS_WITH_GRADIO = frozenset(
    {
        "chapter9/2",  # "Building your first demo"
        "chapter9/3",  # "Understanding the Interface class"
        "chapter9/4",  # "Sharing demos with others"
        "chapter9/5",  # "Integrations with the Hugging Face Hub"
        "chapter9/6",  # "Advanced Interface features"
        "chapter9/7",  # "Introduction to Blocks"
    }
)


def read_and_split_frameworks(fname):
    """
    Read the MDX in fname and creates two versions (if necessary) for each framework.

    Lines outside any `{#if fw === '...'}` block are shared by both versions;
    lines inside a block go only to the framework named by the test (or, after
    `{:else}`, to the other one). The marker lines themselves are dropped.

    Returns:
        A dict `{"pt": str, "tf": str}` when the file contains at least one
        framework test, otherwise the unmodified file content as a single str.
    """
    # The course is multilingual: decode explicitly as UTF-8 rather than
    # relying on the platform's default locale encoding.
    with open(fname, "r", encoding="utf-8") as f:
        lines = f.readlines()

    contents = {"pt": [], "tf": []}

    differences = False
    pending = []  # lines accumulated since the last marker
    for line in lines:
        test_match = re_framework_test.search(line)
        if test_match is not None:
            differences = True
            framework = test_match.groups()[0]
            # Everything seen before the test belongs to both versions.
            for key in contents:
                contents[key].extend(pending)
            pending = []
        elif re_framework_else.search(line) is not None:
            contents[framework].extend(pending)
            pending = []
            framework = "pt" if framework == "tf" else "tf"
        elif re_framework_end.search(line) is not None:
            contents[framework].extend(pending)
            pending = []
        else:
            pending.append(line)

    # Trailing lines after the last marker are shared by both versions.
    if pending:
        for key in contents:
            contents[key].extend(pending)

    if differences:
        return {key: "".join(framework_lines) for key, framework_lines in contents.items()}
    return "".join(lines)


def extract_cells(content):
    """
    Extract the code/output cells from content.

    Returns:
        A list whose items are either a str (code only) or a tuple
        `(code, output)` when a code block is followed by an output block.
    """
    cells = []
    current_cell = None  # lines of the block being read, None outside a block
    is_output = False
    for line in content.split("\n"):
        if re_python_code.search(line) is not None:
            is_output = False
            current_cell = []
        elif re_output_code.search(line) is not None:
            is_output = True
            current_cell = []
        elif re_end_code.search(line) is not None and current_cell is not None:
            cell = "\n".join(current_cell)
            if is_output:
                # Attach the output to the previous code cell. An output block
                # with no preceding code cell (or whose code cell already has
                # an output) is ignored instead of raising IndexError.
                if cells and not isinstance(cells[-1], tuple):
                    cells[-1] = (cells[-1], cell)
            else:
                cells.append(cell)
            current_cell = None
        elif current_cell is not None:
            current_cell.append(line)

    return cells


def convert_to_nb_cell(cell):
    """
    Convert some cell (either just code or tuple (code, output)) to a proper notebook cell.

    For a tuple, the output is stored as a single `execute_result` entry with
    a `text/plain` payload.
    """
    nb_cell = {"cell_type": "code", "execution_count": None, "metadata": {}}
    if isinstance(cell, tuple):
        source, output = cell[0], cell[1]
        nb_cell["source"] = source
        nb_cell["outputs"] = [
            nbformat.notebooknode.NotebookNode(
                {
                    "data": {"text/plain": output},
                    "execution_count": None,
                    "metadata": {},
                    "output_type": "execute_result",
                }
            )
        ]
    else:
        nb_cell["source"] = cell
        nb_cell["outputs"] = []
    return nbformat.notebooknode.NotebookNode(nb_cell)


def nb_cell(source, code=True):
    """
    Build a notebook cell holding source: a code cell without outputs by
    default, or a markdown cell when code is False.
    """
    if not code:
        return nbformat.notebooknode.NotebookNode({"cell_type": "markdown", "source": source, "metadata": {}})
    return nbformat.notebooknode.NotebookNode(
        {"cell_type": "code", "metadata": {}, "source": source, "execution_count": None, "outputs": []}
    )


def build_notebook(fname, title, output_dir="."):
    """
    Build the notebook for fname with a given title in output_dir.

    One notebook is written per framework version of the section (or a single
    one when the section has no framework-specific content). Versions that
    contain no code cell are skipped. output_dir is created if needed.
    """
    sections = read_and_split_frameworks(fname)

    stem = Path(fname).stem
    chapter = Path(fname).parent.stem
    if not isinstance(sections, dict):
        contents = [sections]
        titles = [title]
        fnames = [f"section{stem}.ipynb"]
        section_names = [f"{chapter}/{stem}"]
    else:
        contents = []
        titles = []
        fnames = []
        section_names = []
        for key, section in sections.items():
            contents.append(section)
            titles.append(f"{title} ({frameworks[key]})")
            fnames.append(f"section{stem}_{key}.ipynb")
            section_names.append(f"{chapter}/{stem}_{key}")

    # Distinct loop names so the `title`/`fname` parameters are not shadowed.
    for nb_title, content, nb_fname, section_name in zip(titles, contents, fnames, section_names):
        cells = extract_cells(content)
        if not cells:
            continue

        nb_cells = [
            nb_cell(f"# {nb_title}", code=False),
            nb_cell("Install the Transformers, Datasets, and Evaluate libraries to run this notebook.", code=False),
        ]

        needs_hf_hub = section_name in _SECTIONS_WITH_HF_HUB

        # Install cell
        installs = ["!pip install datasets evaluate transformers[sentencepiece]"]
        if section_name in _SECTIONS_WITH_ACCELERATE:
            installs.append("!pip install accelerate")
            installs.append("# To run the training on TPU, you will need to uncomment the following line:")
            installs.append(
                "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"
            )
        if needs_hf_hub:
            installs.append("!apt install git-lfs")
        if section_name in _SECTIONS_WITH_FAISS:
            installs.append("!pip install faiss-gpu")
        if section_name in _SECTIONS_WITH_GRADIO:
            installs.append("!pip install gradio")

        nb_cells.append(nb_cell("\n".join(installs)))

        if needs_hf_hub:
            nb_cells.extend(
                [
                    nb_cell(
                        "You will need to setup git, adapt your email and name in the following cell.", code=False
                    ),
                    nb_cell(
                        '!git config --global user.email "you@example.com"\n!git config --global user.name "Your Name"'
                    ),
                    nb_cell(
                        "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.",
                        code=False,
                    ),
                    nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"),
                ]
            )
        nb_cells += [convert_to_nb_cell(cell) for cell in cells]
        metadata = {"colab": {"name": nb_title, "provenance": []}}
        nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4}
        notebook = nbformat.notebooknode.NotebookNode(nb_dict)
        os.makedirs(output_dir, exist_ok=True)
        nbformat.write(notebook, os.path.join(output_dir, nb_fname), version=4)


def get_titles(language):
    """
    Parse the _toctree.yml file to get the correspondence filename to title

    Sections declaring `local_fw` contribute one entry per framework file, both
    mapped to the same title. Entries whose title contains "quiz" are dropped.
    """
    toctree_path = os.path.join(PATH_TO_COURSE, language, "_toctree.yml")
    # Context manager so the handle is closed deterministically.
    with open(toctree_path, "r", encoding="utf-8") as f:
        table = yaml.safe_load(f)

    result = {}
    for entry in table:
        for section in entry["sections"]:
            section_title = section["title"]
            if "local_fw" in section:
                section_names = section["local_fw"]
                result[section_names["pt"]] = section_title
                result[section_names["tf"]] = section_title
            else:
                result[section["local"]] = section_title
    return {name: section_title for name, section_title in result.items() if "quiz" not in section_title}


def create_notebooks(language, output_dir):
    """
    Regenerate every notebook of the given language under output_dir.

    Existing `chapter*` entries in output_dir are removed first so stale
    notebooks do not survive a regeneration.
    """
    os.makedirs(output_dir, exist_ok=True)
    for folder in os.listdir(output_dir):
        if folder.startswith("chapter"):
            shutil.rmtree(os.path.join(output_dir, folder))

    for fname, title in get_titles(language).items():
        build_notebook(
            os.path.join(PATH_TO_COURSE, language, f"{fname}.mdx"),
            title,
            os.path.join(output_dir, Path(fname).parent),
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required: without it the notebooks would be written under a directory
    # literally named "None".
    parser.add_argument("--output_dir", type=str, required=True, help="Where to output the notebooks")
    args = parser.parse_args()

    languages = [f.stem for f in PATH_TO_COURSE.iterdir() if f.is_dir()]

    for language in languages:
        if language in LANGS_TO_EXCLUDE:
            continue
        language_output_dir = f"{args.output_dir}/{language}"
        create_notebooks(language, language_output_dir)
        # Remove empty notebook folders
        if not any(Path(language_output_dir).iterdir()):
            shutil.rmtree(language_output_dir)