utils/convert_doc_to_notebooks.py (302 lines of code) (raw):
import nbformat
import os
import re
import shutil
# Paths are set to work by invoking this script from the notebooks repo, presuming the transformers repo is in the
# same parent folder as the notebooks repo.
PATH_TO_DOCS = "../transformers/docs/source"
PATH_TO_DEST = "transformers_doc"
DOC_BASE_URL = "https://huggingface.co/transformers/"

# Doc files handled by the conversion script. Append any new tutorial here to have it converted as well.
TUTORIAL_FILES = [
    "benchmarks.rst",
    "custom_datasets.rst",
    "multilingual.rst",
    "perplexity.rst",
    "preprocessing.rst",
    "quicktour.rst",
    "task_summary.rst",
    "tokenizer_summary.rst",
    "training.rst",
]
###################################
# Parsing the rst file #
###################################
# Re pattern that catches markdown titles.
_re_title = re.compile(r"^#+\s+(\S+)")
# Re pattern that catches rst blocks of the form `.. block_name::`.
_re_block = re.compile(r"^\.\.\s+(\S+)::")
# Re pattern that catches what's after the :: in rst blocks of the form `.. block_name:: something`.
_re_block_lang = re.compile(r"^\.\.\s+\S+::\s*(\S+)(\s+|$)")
# Re pattern that catches section names like `.. _name:`.
_re_anchor_section = re.compile(r"^\.\.\s+_(\S+):")
# Re pattern that catches indentation at the start of a line.
_re_indent = re.compile(r"^(\s*)\S")


def split_blocks(lines):
    """
    Read the lines of a doc file and group them by blocks.

    Args:
        lines: The lines of the document (without line terminators). Titles are expected to already be in
            markdown form (`# Title`).

    Returns:
        A list of `(text, block_type)` tuples in document order. `block_type` is `"title"`, `"prose"`,
        `"anchor"`, or the name of an rst directive, followed by a space and its first argument when there is
        one (for instance `"code-block python"`). Everything before the first title is ignored, so a document
        without any title yields an empty list. Directives without indented content produce no block.
    """
    blocks = []
    current_block = []
    n_lines = len(lines)

    def _move_to_next_non_empty_line(i):
        while i < n_lines and len(lines[i]) == 0:
            i += 1
        return i

    def _build_block(blocks, current_block, block_type):
        # Trailing empty lines are not part of the block. Checking the length inside the loop condition keeps
        # this safe even when the block only contains empty lines.
        while len(current_block) > 0 and len(current_block[-1]) == 0:
            current_block = current_block[:-1]
        if len(current_block) > 0:
            blocks.append(('\n'.join(current_block), block_type))
        return blocks, []

    # Ignore everything before the main title (copyright header). The bound check avoids an IndexError on
    # documents that contain no title at all.
    i = 0
    while i < n_lines and _re_title.search(lines[i]) is None:
        i += 1

    while i < n_lines:
        line = lines[i]
        block_match = _re_block.search(line)
        if _re_title.search(line) is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "title"))
            i = _move_to_next_non_empty_line(i + 1)
        elif block_match is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            block_type = block_match.groups()[0]
            lang_match = _re_block_lang.search(line)
            if lang_match:
                block_type += " " + lang_match.groups()[0]
            i = _move_to_next_non_empty_line(i + 1)
            # The directive may be the last thing in the file, or be followed by a whitespace-only line: in both
            # cases there is no indentation to read and the directive has no content.
            indent_match = _re_indent.search(lines[i]) if i < n_lines else None
            indent = "" if indent_match is None else indent_match.groups()[0]
            if len(indent) > 0:
                # The content of the directive is every following line that is indented at least as much as
                # its first line, empty lines included.
                while i < n_lines and (lines[i].startswith(indent) or len(lines[i]) == 0):
                    current_block.append(lines[i])
                    i += 1
            blocks, current_block = _build_block(blocks, current_block, block_type)
        elif _re_anchor_section.search(line):
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "anchor"))
            i = _move_to_next_non_empty_line(i + 1)
        else:
            current_block.append(line)
            i += 1

    blocks, current_block = _build_block(blocks, current_block, "prose")
    return blocks
###################################
# Text formatting and cleaning #
###################################
def process_titles(lines):
    """
    Converts rst titles to markdown titles.

    A line made of a single repeated adornment character, at least as long as the non-blank line right before
    it, is treated as a title underline: it is dropped and the previous line is prefixed with `#` characters.
    The title level of an adornment character is the order in which that character first underlined a title.

    Args:
        lines: The lines of the rst document.

    Returns:
        A new list of lines with the titles converted. Adornment lines that follow a blank line (overlines or
        transitions) are never treated as the underline of an empty title; an overline identical to the
        underline of the title it precedes is removed.
    """
    title_chars = set("""= - ` : ' " ~ ^ _ * + # < >""".split(" "))
    title_levels = {}
    new_lines = []
    for line in lines:
        is_adornment = len(set(line)) == 1 and line[0] in title_chars and line != "::"
        previous = new_lines[-1] if new_lines else ""
        # An underline needs actual title text right above it.
        if is_adornment and previous.strip() and len(line) >= len(previous):
            char = line[0]
            # First time we see this adornment character: it opens a new (deeper) title level.
            level = title_levels.setdefault(char, len(title_levels) + 1)
            # Overlined title: the matching adornment line sits right above the title text, itself preceded by a
            # blank line or the start of the document. It carries no content, so drop it.
            has_overline = (
                len(new_lines) >= 2
                and new_lines[-2] == line
                and (len(new_lines) == 2 or not new_lines[-3].strip())
            )
            if has_overline:
                del new_lines[-2]
            new_lines[-1] = f"{'#' * level} {previous}"
        else:
            new_lines.append(line)
    return new_lines
# Re pattern to catch things inside ` ` in :obj:`thing`.
_re_obj = re.compile(r":obj:`([^`]+)`")
# Re pattern to catch things inside ` ` in :math:`thing`.
_re_math = re.compile(r":math:`([^`]+)`")
# Re pattern to catch things between single backquotes. Lookarounds (instead of consuming the neighbouring
# characters) let two items separated by a single character both be caught.
_re_single_backquotes = re.compile(r"(?<!`)`([^`]+)`(?!`)")
# Re pattern to catch things between single stars (content between double stars is left alone).
_re_stars = re.compile(r"(?<!\*)\*([^\*]+)\*(?!\*)")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"``([^`]+)``")
# Re pattern to catch things inside ` ` in :func/class/meth:`thing`.
_re_func_class = re.compile(r":(?:func|class|meth):`([^`]+)`")


def convert_rst_formatting(text):
    """
    Convert rst syntax for formatting to markdown in text.

    Args:
        text: The rst text to convert.

    Returns:
        The text where `:func:`/`:class:`/`:meth:`/`:obj:` roles and double backquotes become inline code,
        `:math:` becomes `$...$`, content between single stars becomes bold, content between single backquotes
        becomes italic and every `::` placed right before a newline is removed together with that newline.
    """

    # Remove :class:, :func: and :meth: markers. Simplify what's inside and put double backquotes
    # (to not be caught by the italic conversion).
    def _rep_func_class(match):
        name = match.groups()[0]
        splits = name.split('.')
        # Skip the leading module path: keep everything from the first component starting with an uppercase
        # letter (or only the last component when there is none). Slicing with [:1] keeps this safe on empty
        # components such as the one in `.foo`.
        i = 0
        while i < len(splits) - 1 and not splits[i][:1].isupper():
            i += 1
        return f"``{'.'.join(splits[i:])}``"

    text = _re_func_class.sub(_rep_func_class, text)
    # Remove :obj: markers. What's after is in a single backquotes so we put in double backquotes
    # (to not be caught by the italic conversion).
    text = _re_obj.sub(r"``\1``", text)
    # Remove :math: markers.
    text = _re_math.sub(r"$\1$", text)
    # Convert content in stars to bold. This must happen before the italic conversion, which introduces stars.
    text = _re_stars.sub(r'**\1**', text)
    # Convert content in single backquotes to italic.
    text = _re_single_backquotes.sub(r'*\1*', text)
    # Convert content in double backquotes to single backquotes.
    text = _re_double_backquotes.sub(r'`\1`', text)
    # Remove remaining ::
    text = re.sub(r"::\n", "", text)
    return text
# Re pattern to catch description and url in links of the form `description <url>`_.
_re_links = re.compile(r"`([^`]+\S)\s+</*([^/][^>`]*)>`_+")
# Re pattern to catch reference in links of the form :doc:`reference`.
_re_simple_doc = re.compile(r":doc:`([^`<]*)`")
# Re pattern to catch description and reference in links of the form :doc:`description <reference>`.
_re_doc_with_description = re.compile(r":doc:`([^`<]+\S)\s+</*([^/][^>`]*)>`")
# Re pattern to catch reference in links of the form :ref:`reference`.
_re_simple_ref = re.compile(r":ref:`([^`<]*)`")
# Re pattern to catch description and reference in links of the form :ref:`description <reference>`.
_re_ref_with_description = re.compile(r":ref:`([^`<]+\S)\s+<([^>]*)>`")


def convert_rst_links(text):
    """
    Convert the rst links in text to markdown.

    `:doc:` links point to the matching html page under `DOC_BASE_URL`, `:ref:` links become in-page anchors
    and the other links keep their url, prefixed with `DOC_BASE_URL` when it does not start with `http`.
    """

    def _render_link(match):
        label, target = match.groups()
        if not target.startswith('http'):
            target = DOC_BASE_URL + target
        return f"[{label}]({target})"

    # The substitutions are applied one after the other, in this order.
    substitutions = (
        # Links of the form :doc:`page`
        (_re_simple_doc, r'[\1](' + DOC_BASE_URL + r'\1.html)'),
        # Links of the form :doc:`text <page>`
        (_re_doc_with_description, r'[\1](' + DOC_BASE_URL + r'\2.html)'),
        # Refs of the form :ref:`page`
        (_re_simple_ref, r'[\1](#\1)'),
        # Refs of the form :ref:`text <page>`
        (_re_ref_with_description, r'[\1](#\2)'),
        # Other links
        (_re_links, _render_link),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
###################################
# Notes, math and reference #
###################################
def remove_indentation(text):
    """
    Remove the indentation found in the first non-blank line of text from every line.

    Args:
        text: The (possibly multi-line) text to dedent.

    Returns:
        The text where each line lost at most as many leading whitespace characters as the first non-blank line
        has. Lines that are less indented only lose their own leading whitespace, never actual content. Text
        without any non-blank line is returned with its whitespace-only lines emptied up to that rule (an
        empty text stays empty).
    """
    lines = text.split("\n")
    # Use the first line with actual content as reference: the text may start with a blank line.
    reference = next((line for line in lines if line.strip()), "")
    width = len(reference) - len(reference.lstrip())

    def _dedent(line):
        leading = len(line) - len(line.lstrip())
        return line[min(width, leading):]

    return "\n".join(_dedent(line) for line in lines)
# Notes are rendered as a block quote starting with **NOTE_TYPE:** for now; a nicer html rendering may be possible.
def convert_to_note(text, note_type):
    """
    Convert text to a note of note_type.

    The text is dedented, every line is quoted with `> ` and the first one is introduced by the upper-cased
    note type in bold.
    """
    first_line, *other_lines = remove_indentation(text).split("\n")
    quoted = [f"> **{note_type.upper()}:** {first_line}"]
    quoted.extend(f"> {line}" for line in other_lines)
    return "\n".join(quoted)
def convert_math(text):
    """ Convert text to display mode LaTeX: the dedented text wrapped between `$$` delimiters."""
    dedented = remove_indentation(text)
    return "$$" + dedented + "$$"
def convert_anchor(text):
    """ Convert an rst anchor line (`.. _name:`) to an html anchor with the same name, usable in the notebook."""
    match = _re_anchor_section.search(text)
    anchor_name = match.group(1)
    return "<a id='" + anchor_name + "'></a>"
###################################
# Images #
###################################
# Re pattern that catches rst options of the form `:name: value`.
_re_attr_rst = re.compile(r"^\s*:(\S+):\s*(\S.*)$")


def convert_image(image_name, text, pref=None, origin_folder=None, dest_folder=None):
    """
    Convert text to proper html code for image_name.

    Each `:name: value` line of text becomes an attribute of the `<img>` tag, whose `src` is image_name
    (joined to pref when pref is provided). When both origin_folder and dest_folder are provided, the image
    is copied from the former to the latter unless the destination file already exists.
    """
    should_copy = origin_folder is not None and dest_folder is not None
    if should_copy:
        source_path = os.path.join(origin_folder, image_name)
        target_path = os.path.join(dest_folder, image_name)
        if not os.path.isfile(target_path):
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            shutil.copy(source_path, target_path)

    if pref is None:
        src = image_name
    else:
        src = os.path.join(pref, image_name)
    attrs = {'src': src}

    for line in text.split("\n"):
        match = _re_attr_rst.search(line)
        if match is None:
            continue
        name, value = match.groups()
        attrs[name] = value

    rendered = " ".join(f'{key}="{value}"' for key, value in attrs.items())
    return f"<img {rendered}/>"
###################################
# Tables #
###################################
# Matches lines with a pattern of a table new line in rst.
_re_ignore_line_table = re.compile("^(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a table new line in rst, with a first column empty.
_re_ignore_line_table1 = re.compile("^\|\s+(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a first table line in rst.
_re_sep_line_table = re.compile("^(\+[=\s]+)+\+\s*$")
def convert_table(text):
""" Convert a table in text from rst to markdown."""
lines = text.split("\n")
new_lines = []
for line in lines:
if _re_ignore_line_table.search(line) is not None:
continue
if _re_ignore_line_table1.search(line) is not None:
continue
if _re_sep_line_table.search(line) is not None:
line = line.replace('=', '-').replace('+', '|')
new_lines.append(line)
return "\n".join(new_lines)
###################################
# Code cleaning #
###################################
# Matches the pytorch code tag.
_re_pytorch = re.compile(r"## PYTORCH CODE")
# Matches the tensorflow code tag.
_re_tensorflow = re.compile(r"## TENSORFLOW CODE")


def split_frameworks(code):
    """
    Split code between the two frameworks (if it has two versions) with PyTorch first.

    Returns a 1-tuple with the code untouched when it does not contain both framework tags, otherwise the
    2-tuple `(pytorch_code, tensorflow_code)`. Each part still starts with its tag line and the empty lines
    right before the second tag are dropped.
    """
    has_both_tags = _re_pytorch.search(code) is not None and _re_tensorflow.search(code) is not None
    if not has_both_tags:
        return (code,)

    lines = code.split("\n")
    is_pytorch_first = _re_pytorch.search(lines[0]) is not None
    second_tag = _re_tensorflow if is_pytorch_first else _re_pytorch

    # Index of the line holding the tag of the second framework.
    split_at = 1
    while second_tag.search(lines[split_at]) is None:
        split_at += 1
    # Index of the last non-empty line before that tag.
    last = split_at - 1
    while len(lines[last]) == 0:
        last -= 1

    first_part = "\n".join(lines[:last + 1])
    second_part = "\n".join(lines[split_at:])
    if is_pytorch_first:
        return (first_part, second_part)
    return (second_part, first_part)
# Matches any doctest pattern.
_re_doctest = re.compile(r"^(>>>|\.\.\.)")


def parse_code_and_output(code):
    """
    Parse code to remove indentation, doctest prompts and split between source and theoretical output.

    Args:
        code: The content of a code block. Leading blank lines are ignored, the indentation of the first line
            with content is the one removed from every non-empty line.

    Returns:
        A `(source, output)` tuple of strings. Lines starting with a doctest prompt (`>>>` or `...`) go to the
        source, without their prompt. Once a prompt has been seen, the other non-empty lines go to the output
        and the empty ones are dropped; before that, every line goes to the source.
    """
    lines = code.split("\n")
    # The code may start with blank lines (for instance once a framework tag line has been removed): they carry
    # no indentation to measure and would only add empty lines on top of the cell.
    start = 0
    while start < len(lines) and not lines[start].strip():
        start += 1
    lines = lines[start:]
    indent_width = len(lines[0]) - len(lines[0].lstrip()) if lines else 0

    has_doctest = False
    input_lines = []
    output_lines = []
    for line in lines:
        if len(line) > 0:
            line = line[indent_width:]
        if _re_doctest.search(line):
            has_doctest = True
            # Prompts are 3 characters followed by a space.
            line = line[4:]
            input_lines.append(line)
        elif has_doctest:
            if len(line) > 0:
                output_lines.append(line)
        else:
            input_lines.append(line)
    return "\n".join(input_lines), "\n".join(output_lines)
###################################
# All together! #
###################################
def markdown_cell(md):
    """ Build the notebook node of a markdown cell whose source is `md`."""
    cell = {'cell_type': 'markdown', 'source': md, 'metadata': {}}
    return nbformat.notebooknode.NotebookNode(cell)
def code_cell(code, output=None):
    """
    Build the notebook node of a code cell whose source is `code`.

    When `output` is provided and not empty, it is attached to the cell as the plain text of a single
    `execute_result` output; otherwise the cell has no output.
    """
    outputs = []
    if output is not None and len(output) != 0:
        result = {
            'data': {'text/plain': output},
            'execution_count': None,
            'metadata': {},
            'output_type': 'execute_result',
        }
        outputs.append(nbformat.notebooknode.NotebookNode(result))

    cell = {
        'cell_type': 'code',
        'execution_count': None,
        'source': code,
        'metadata': {},
        'outputs': outputs,
    }
    return nbformat.notebooknode.NotebookNode(cell)
def create_notebook(cells):
    """ Build the node of a notebook in format 4.4 containing `cells`, with empty metadata."""
    notebook = {
        'cells': cells,
        'metadata': {},
        'nbformat': 4,
        'nbformat_minor': 4,
    }
    return nbformat.notebooknode.NotebookNode(notebook)
def rm_first_line(text):
    """ Return `text` without its first line (an empty string when `text` has a single line)."""
    _first_line, _newline, remainder = text.partition('\n')
    return remainder
# Source of the first cell of the notebook: a code cell installing `transformers` and `datasets` with pip, with a
# commented-out alternative command that installs `transformers` from its GitHub repository instead.
INSTALL_CODE = """# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git
"""
def convert_rst_file_to_notebook(
    rst_file,
    notebook_fname,
    framework=None,
    img_prefix=None,
    origin_folder=None,
    dest_folder=None
):
    r"""
    Convert rst_file to a notebook named notebook_fname.

    The notebook starts with an installation code cell. Titles, prose, anchors, images and math blocks become
    markdown cells, code blocks become code cells (with their theoretical output when they use doctest
    prompts) and any other rst directive is rendered as a note in a markdown cell.

    Args:
        - rst_file (:obj:`str`):
            The doc file to convert (in rst format). It is read as UTF-8.
        - notebook_fname (:obj:`str`):
            The output notebook file name (will be replaced if it exists).
        - framework (:obj:`str`, `optional`):
            If provided, must be :obj:`"pt"` or :obj:`"tf"`. In this case, only the PyTorch (resp. TensorFlow) version
            of the code is kept.
        - img_prefix (:obj:`str`, `optional`):
            If provided, will be inserted at the beginning of each image filename (in the `pytorch` or `tensorflow`
            folder, we need to add ../ to each image file to find them).
        - origin_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`dest_folder`, images encountered will be copied from this folder to
            :obj:`dest_folder`.
        - dest_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`origin_folder`, images encountered will be copied from
            :obj:`origin_folder` to this folder.
    """
    # Do not rely on the platform default encoding: the result of the conversion must not depend on the locale.
    with open(rst_file, 'r', encoding="utf-8") as f:
        content = f.read()
    lines = content.split("\n")
    lines = process_titles(lines)
    blocks = split_blocks(lines)

    cells = [code_cell(INSTALL_CODE)]
    for block, block_type in blocks:
        if block_type == 'title' or block_type == 'prose':
            block = convert_table(convert_rst_formatting(convert_rst_links(block)))
            cells.append(markdown_cell(block))
        elif block_type == 'anchor':
            block = convert_anchor(block)
            cells.append(markdown_cell(block))
        elif block_type.startswith('code-block'):
            codes = split_frameworks(block)
            # When the block has one version per framework, keep the requested one, without its tag line.
            if framework == 'pt' and len(codes) > 1:
                codes = (rm_first_line(codes[0]),)
            elif framework == 'tf' and len(codes) > 1:
                codes = (rm_first_line(codes[1]),)
            for code in codes:
                source, output = parse_code_and_output(code)
                if block_type.endswith('bash'):
                    # Shell commands need a leading `!` to run in a notebook. Comments and blank lines are kept
                    # as they are.
                    shell_lines = [
                        line if line.startswith("#") or not line.strip() else f"! {line}"
                        for line in source.split("\n")
                    ]
                    source = "\n".join(shell_lines)
                cells.append(code_cell(source, output=output))
        elif block_type.startswith("image"):
            # The block type is `image ` followed by the name of the image file.
            image_name = block_type[len("image "):]
            block = convert_image(
                image_name,
                block,
                pref=img_prefix,
                origin_folder=origin_folder,
                dest_folder=dest_folder
            )
            cells.append(markdown_cell(block))
        elif block_type == "math":
            block = convert_math(block)
            cells.append(markdown_cell(block))
        else:
            # Any other directive is rendered as a note named after the directive.
            block = convert_rst_formatting(convert_rst_links(block))
            block = convert_to_note(block, block_type)
            cells.append(markdown_cell(block))

    notebook = create_notebook(cells)
    nbformat.write(notebook, notebook_fname, version=4)
def convert_all_tutorials(path_to_docs=None, path_to_dest=None):
    """
    Convert all tutorials into notebooks.

    Each file of `TUTORIAL_FILES` found in `path_to_docs` (default `PATH_TO_DOCS`) is converted three times: a
    mixed-framework notebook in `path_to_dest` (default `PATH_TO_DEST`), images being copied there, plus one
    PyTorch-only and one TensorFlow-only notebook in its `pytorch` and `tensorflow` subfolders.
    """
    if path_to_docs is None:
        path_to_docs = PATH_TO_DOCS
    if path_to_dest is None:
        path_to_dest = PATH_TO_DEST

    # (subfolder, framework) of each framework-specific version of the notebooks.
    variants = (("pytorch", "pt"), ("tensorflow", "tf"))
    for folder, _ in variants:
        os.makedirs(os.path.join(path_to_dest, folder), exist_ok=True)

    for file in TUTORIAL_FILES:
        stem, _ = os.path.splitext(file)
        notebook_name = stem + ".ipynb"
        doc_file = os.path.join(path_to_docs, file)
        convert_rst_file_to_notebook(
            doc_file,
            os.path.join(path_to_dest, notebook_name),
            origin_folder=path_to_docs,
            dest_folder=path_to_dest,
        )
        for folder, framework in variants:
            # Images live one level above the framework subfolders.
            convert_rst_file_to_notebook(
                doc_file,
                os.path.join(path_to_dest, folder, notebook_name),
                framework=framework,
                img_prefix="..",
            )


if __name__ == "__main__":
    convert_all_tutorials()