utils/generate_notebooks.py (237 lines of code) (raw):
import argparse
import os
import re
import nbformat
import shutil
import yaml
from pathlib import Path
re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
re_framework_else = re.compile(r"^{:else}\s*$")
re_framework_end = re.compile(r"^{/if}\s*$")
re_html_line = re.compile(r"^<[^>]*/>\s*$")
re_html_tag = re.compile(r"<([^/>]*)>\s*$")
re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$")
re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$")
re_end_code = re.compile(r"^```\s*$")
frameworks = {"pt": "PyTorch", "tf": "TensorFlow"}
PATH_TO_COURSE = Path("chapters/")
# Languages to exlude from the notebook generation because the notebooks were
# created manually
LANGS_TO_EXCLUDE = ["fr"]
def read_and_split_frameworks(fname):
"""
Read the MDX in fname and creates two versions (if necessary) for each framework.
"""
with open(fname, "r") as f:
content = f.readlines()
contents = {"pt": [], "tf": []}
differences = False
current_content = []
line_idx = 0
for line in content:
if re_framework_test.search(line) is not None:
differences = True
framework = re_framework_test.search(line).groups()[0]
for key in contents:
contents[key].extend(current_content)
current_content = []
elif re_framework_else.search(line) is not None:
contents[framework].extend(current_content)
current_content = []
framework = "pt" if framework == "tf" else "tf"
elif re_framework_end.search(line) is not None:
contents[framework].extend(current_content)
current_content = []
else:
current_content.append(line)
if len(current_content) > 0:
for key in contents:
contents[key].extend(current_content)
if differences:
return {k: "".join(content) for k, content in contents.items()}
else:
return "".join(content)
def extract_cells(content):
"""
Extract the code/output cells from content.
"""
cells = []
current_cell = None
is_output = False
for line in content.split("\n"):
if re_python_code.search(line) is not None:
is_output = False
current_cell = []
elif re_output_code.search(line) is not None:
is_output = True
current_cell = []
elif re_end_code.search(line) is not None and current_cell is not None:
cell = "\n".join(current_cell)
if is_output:
if not isinstance(cells[-1], tuple):
cells[-1] = (cells[-1], cell)
else:
cells.append(cell)
current_cell = None
current_md = []
elif current_cell is not None:
current_cell.append(line)
return cells
def convert_to_nb_cell(cell):
"""
Convert some cell (either just code or tuple (code, output)) to a proper notebook cell.
"""
nb_cell = {"cell_type": "code", "execution_count": None, "metadata": {}}
if isinstance(cell, tuple):
nb_cell["source"] = cell[0]
nb_cell["outputs"] = [
nbformat.notebooknode.NotebookNode(
{
"data": {"text/plain": cell[1]},
"execution_count": None,
"metadata": {},
"output_type": "execute_result",
}
)
]
else:
nb_cell["source"] = cell
nb_cell["outputs"] = []
return nbformat.notebooknode.NotebookNode(nb_cell)
def nb_cell(source, code=True):
if not code:
return nbformat.notebooknode.NotebookNode({"cell_type": "markdown", "source": source, "metadata": {}})
return nbformat.notebooknode.NotebookNode(
{"cell_type": "code", "metadata": {}, "source": source, "execution_count": None, "outputs": []}
)
def build_notebook(fname, title, output_dir="."):
"""
Build the notebook for fname with a given title in output_dir.
"""
sections = read_and_split_frameworks(fname)
sections_with_accelerate = [
"chapter3/3", # "Fine-tuning a model with the Trainer API or Keras",
"chapter3/4", # "A full training",
"chapter7/2_pt", # "Token classification (PyTorch)",
"chapter7/3_pt", # "Fine-tuning a masked language model (PyTorch)"
"chapter7/4_pt", # "Translation (PyTorch)"
"chapter7/5_pt", # "Summarization (PyTorch)",
"chapter7/6_pt", # "Training a causal language model from scratch (PyTorch)"
"chapter7/7_pt", # "Question answering (PyTorch)"
]
sections_with_hf_hub = [
"chapter4/3_pt", # "Sharing pretrained models (PyTorch)"
"chapter4/3_tf", # "Sharing pretrained models (TensorFlow)"
"chapter5/5", # "Creating your own dataset"
"chapter7/2_pt", # "Token classification (PyTorch)"
"chapter7/2_tf", # "Token classification (TensorFlow)"
"chapter6/2", # "Training a new tokenizer from an old one"
"chapter7/3_pt", # "Fine-tuning a masked language model (PyTorch)"
"chapter7/3_tf", # "Fine-tuning a masked language model (TensorFlow)"
"chapter7/4_pt", # "Translation (PyTorch)"
"chapter7/4_tf", # "Translation (TensorFlow)"
"chapter7/5_pt", # "Summarization (PyTorch)"
"chapter7/5_tf", # "Summarization (TensorFlow)"
"chapter7/6_pt", # "Training a causal language model from scratch (PyTorch)"
"chapter7/6_tf", # "Training a causal language model from scratch (TensorFlow)"
"chapter7/7_pt", # "Question answering (PyTorch)"
"chapter7/7_tf", # "Question answering (TensorFlow)"
"chapter8/2", # "What to do when you get an error"
]
sections_with_faiss = [
"chapter5/6_pt", # "Semantic search with FAISS (PyTorch)"
"chapter5/6_tf", # "Semantic search with FAISS (TensorFlow)"
]
sections_with_gradio = [
"chapter9/2", # "Building your first demo"
"chapter9/3", # "Understanding the Interface class"
"chapter9/4", # "Sharing demos with others"
"chapter9/5", # "Integrations with the Hugging Face Hub"
"chapter9/6", # "Advanced Interface features"
"chapter9/7", # "Introduction to Blocks"
]
stem = Path(fname).stem
if not isinstance(sections, dict):
contents = [sections]
titles = [title]
fnames = [f"section{stem}.ipynb"]
section_names = [f"{Path(fname).parent.stem}/{stem}"]
else:
contents = []
titles = []
fnames = []
section_names = []
for key, section in sections.items():
contents.append(section)
titles.append(f"{title} ({frameworks[key]})")
fnames.append(f"section{stem}_{key}.ipynb")
section_names.append(f"{Path(fname).parent.stem}/{stem}_{key}")
for title, content, fname, section_name in zip(titles, contents, fnames, section_names):
cells = extract_cells(content)
if len(cells) == 0:
continue
nb_cells = [
nb_cell(f"# {title}", code=False),
nb_cell("Install the Transformers, Datasets, and Evaluate libraries to run this notebook.", code=False),
]
# Install cell
installs = ["!pip install datasets evaluate transformers[sentencepiece]"]
if section_name in sections_with_accelerate:
installs.append("!pip install accelerate")
installs.append("# To run the training on TPU, you will need to uncomment the following line:")
installs.append(
"# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"
)
if section_name in sections_with_hf_hub:
installs.append("!apt install git-lfs")
if section_name in sections_with_faiss:
installs.append("!pip install faiss-gpu")
if section_name in sections_with_gradio:
installs.append("!pip install gradio")
nb_cells.append(nb_cell("\n".join(installs)))
if section_name in sections_with_hf_hub:
nb_cells.extend(
[
nb_cell(
"You will need to setup git, adapt your email and name in the following cell.", code=False
),
nb_cell(
'!git config --global user.email "you@example.com"\n!git config --global user.name "Your Name"'
),
nb_cell(
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.",
code=False,
),
nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"),
]
)
nb_cells += [convert_to_nb_cell(cell) for cell in cells]
metadata = {"colab": {"name": title, "provenance": []}}
nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4}
notebook = nbformat.notebooknode.NotebookNode(nb_dict)
os.makedirs(output_dir, exist_ok=True)
nbformat.write(notebook, os.path.join(output_dir, fname), version=4)
def get_titles(language):
"""
Parse the _toctree.yml file to get the correspondence filename to title
"""
table = yaml.safe_load(open(os.path.join(f"chapters/{language}", "_toctree.yml"), "r"))
result = {}
for entry in table:
for section in entry["sections"]:
section_title = section["title"]
if "local_fw" in section:
section_names = section["local_fw"]
result[section_names["pt"]] = section_title
result[section_names["tf"]] = section_title
else:
section_name = section["local"]
result[section_name] = section_title
return {k: v for k, v in result.items() if "quiz" not in v}
def create_notebooks(language, output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for folder in os.listdir(output_dir):
if folder.startswith("chapter"):
shutil.rmtree(os.path.join(output_dir, folder))
titles = get_titles(language)
for fname, title in titles.items():
build_notebook(
os.path.join(f"chapters/{language}", f"{fname}.mdx"),
title,
os.path.join(output_dir, Path(fname).parent),
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=str, help="Where to output the notebooks")
args = parser.parse_args()
languages = [f.stem for f in PATH_TO_COURSE.iterdir() if f.is_dir()]
for language in languages:
if language in LANGS_TO_EXCLUDE:
continue
language_output_dir = f"{args.output_dir}/{language}"
create_notebooks(language, language_output_dir)
# Remove empty notebook folders
if not any(Path(language_output_dir).iterdir()):
shutil.rmtree(language_output_dir)