scripts/nb_to_md.py (146 lines of code) (raw):
"""Copyright 2018-2022 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This script creates and updates Markdown versions of Notebook files
for publication on Kubeflow on Google Cloud using Hugo/Docsy.
Hugo Markdown files have a metadata section at the top of the page for the
Front Matter. The Front Matter specifies the page Title, Description, and
Weight. These values are used to generate the left side navigation, index
pages, and some page content.
This script expects the Front Matter to be specified in the following format
in the first cell of a Jupyter notebook:
# {Title}
> {Description}
So, the Title is expected to be a Heading 1 and the Description is expected to
immediately follow it as a Blockquote. Currently, there is no Weight in the
notebook file.
The script reads the Front Matter from the existing Markdown file,
or initializes default values, and then overrides the Markdown file's
Front Matter with values from the notebook.
* The Weight is always used from the Markdown file. If no Markdown file
exists, this will default to `DEFAULT_WEIGHT`, which should push the doc to
the end of the list. Edit the Markdown file to specify the correct Weight.
* If no Title is specified in the notebook, the Markdown file's
front matter is used.
* If the Title is specified in the notebook and the Description is not,
the notebook's Title is used and the Description is taken from the
Markdown file's front matter.
To run this script, type the following on the command line:
python3 scripts/nb_to_md.py --notebook content/en/docs/path-to-notebook
Input:
The path to the notebook to convert to Markdown as `--notebook` command
line flag.
Output:
STDOUT returns the status of the conversion process.
Dependencies:
This script depends on `absl`, `nbconvert`, `nbformat`, and `toml`. You
may need to install these dependencies using a command like the following:
pip3 install absl-py nbconvert nbformat toml
"""
from pathlib import Path
import re
from typing import Tuple
from absl import app
from absl import flags
from nbconvert.exporters import MarkdownExporter
import nbformat
import toml
# Shared absl flag container; `main` reads the notebook path from it.
FLAGS = flags.FLAGS
# `--notebook` flag: string-valued, defaults to None (reported by `main` as a
# missing parameter).
flags.DEFINE_string(
    'notebook',
    None,
    'Path to the notebook to publish. Should start with "content/en/docs"')
# Front Matter weight used when the existing Markdown file does not supply one.
DEFAULT_WEIGHT = 900
class MarkdownFile:
    """Represents the Markdown version of a notebook.

    Attributes:
        file_path: Location of the Markdown file (a string or a `Path`).
    """

    # Front Matter block anchored at the start of the page: a line of `+`
    # characters, the TOML body (captured), and a closing line of `+`
    # characters. `re.S` lets the body span multiple lines.
    _FRONT_MATTER_RE = re.compile(r'\++\n(.*?)\++\n', re.S)

    def __init__(self, file_path):
        self.file_path = file_path

    def exists(self):
        """Indicates if the Markdown file exists."""
        return Path(self.file_path).exists()

    def parse_front_matter(self) -> Tuple[str, str, int]:
        """Parses Front Matter values from the Markdown file.

        Returns:
            A tuple containing the title, description, and weight. The title
            and description are None, and the weight is `DEFAULT_WEIGHT`,
            for every value that cannot be read: the file does not exist,
            it does not start with a Front Matter block, or the block does
            not define the key.
        """
        # default values
        title = None
        description = None
        weight = DEFAULT_WEIGHT

        if not self.exists():
            return title, description, weight

        content = Path(self.file_path).read_text(encoding='utf-8')

        # find the front matter section; a page without one keeps the
        # defaults instead of failing on a missing match
        match = self._FRONT_MATTER_RE.match(content)
        if match is None:
            return title, description, weight

        # load the TOML
        front_matter = toml.loads(match.group(1))
        title = front_matter.get('title', title)
        description = front_matter.get('description', description)
        weight = front_matter.get('weight', weight)

        return title, description, weight

    def write_file(self, content: str):
        """Writes `content` to the Markdown file as UTF-8, replacing it.

        Args:
            content: The full text of the Markdown page.
        """
        Path(self.file_path).write_text(content, encoding='utf-8')
class NotebookFile:
    """Represents a Jupyter notebook file.

    Attributes:
        file_path: Path to the notebook. It is used to read the notebook, to
            derive the Markdown file path, and is embedded in the generated
            page's Colab and GitHub links.
    """

    # Layout of the generated page. Positional fields:
    #   {0} title, {1} description, {2} weight, {3} page content,
    #   {4} notebook path.
    # Literal CSS braces are doubled so `str.format` leaves them alone.
    _PAGE_TEMPLATE = (
        '+++\n'
        'title = "{0}"\n'
        'description = "{1}"\n'
        'weight = {2}\n'
        '+++\n\n'
        '<!--\n'
        'AUTOGENERATED FROM {4}\n'
        'PLEASE UPDATE THE JUPYTER NOTEBOOK AND REGENERATE THIS FILE'
        ' USING scripts/nb_to_md.py.'
        '-->\n\n'
        '<style>\n'
        '.notebook-links {{display: flex; margin: 1em 0;}}\n'
        '.notebook-links a {{padding: .75em; margin-right: .75em;'
        ' font-weight: bold;}}\n'
        'a.colab-link {{\n'
        'padding-left: 3.25em;\n'
        'background-image: url(/docs/images/logos/colab.ico);\n'
        'background-repeat: no-repeat;\n'
        'background-size: contain;\n'
        '}}\n'
        'a.github-link {{\n'
        'padding-left: 2.75em;\n'
        'background-image: url(/docs/images/logos/github.png);\n'
        'background-repeat: no-repeat;\n'
        'background-size: auto 75%;\n'
        'background-position: left center;\n'
        '}}\n'
        '</style>\n'
        '<div class="notebook-links">\n'
        '<a class="colab-link" href="https://colab.research.google.com/'
        'github/kubeflow/website/blob/master/{4}">Run in Google Colab'
        '</a>\n'
        '<a class="github-link" href="https://github.com/kubeflow/websi'
        'te/blob/master/{4}">View source on GitHub</a>\n'
        '</div>\n\n'
        '{3}'
        '\n\n'
        '<div class="notebook-links">\n'
        '<a class="colab-link" href="https://colab.research.google.com/'
        'github/kubeflow/website/blob/master/{4}">Run in Google Colab'
        '</a>\n'
        '<a class="github-link" href="https://github.com/kubeflow/websi'
        'te/blob/master/{4}">View source on GitHub</a>\n'
        '</div>')

    def __init__(self, file_path):
        self.file_path = file_path

    def exists(self):
        """Indicates if the notebook file exists."""
        return Path(self.file_path).exists()

    def get_markdown_file(self):
        """Returns the `MarkdownFile` that sits next to the notebook.

        The Markdown path is the notebook path with its suffix replaced by
        `.md`.
        """
        return MarkdownFile(Path(self.file_path).with_suffix('.md'))

    def parse_front_matter(self, content, markdown) -> Tuple[str, str, int, str]:
        """Gets the Front Matter for the updated notebook.

        Uses the Markdown Front Matter as the default values and overrides
        with content from the notebook.

        Args:
            content: The notebook content converted to Markdown.
            markdown: An instance of MarkdownFile.

        Returns:
            A tuple containing the title, description, weight, and content
            without the Front Matter. The title line and the description
            line are removed from the content only when they are found.
        """
        title, description, weight = markdown.parse_front_matter()

        # The title is the first line when it starts with `#`. Splitting with
        # `partition` also handles a final line that has no trailing newline,
        # where `str.find` would report -1.
        first_line, _, remainder = content.partition('\n')
        if first_line.startswith('#'):
            title = first_line[1:].strip()
            content = remainder

            # The description is only looked for immediately after the
            # title, as a blockquote line starting with `>`.
            second_line, _, remainder = content.partition('\n')
            if second_line.startswith('>'):
                description = second_line[1:].strip()
                content = remainder

        return title, description, weight, content

    @staticmethod
    def _toml_escape(value) -> str:
        """Escapes a value for use inside a double-quoted TOML string.

        Backslashes, double quotes, and newlines would otherwise end the
        string early or make the Front Matter unparseable on the next run.
        """
        return (str(value)
                .replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n'))

    def publish_markdown(self):
        """Updates the Markdown version of a Jupyter notebook file.

        Converts the cleaned notebook to Markdown, merges its Front Matter
        with the existing Markdown file's values, and overwrites the Markdown
        file with the rendered page.
        """
        nb = self.get_clean_notebook()
        # The exporter returns a pair; only the Markdown body is used here.
        content, _ = MarkdownExporter().from_notebook_node(nb)
        markdown = self.get_markdown_file()

        # separate front matter from content
        title, description, weight, content = self.parse_front_matter(
            content, markdown)

        markdown.write_file(
            self._PAGE_TEMPLATE.format(
                self._toml_escape(title),
                self._toml_escape(description),
                weight,
                content,
                self.file_path))

    def format_as_terminal(self, commands: str) -> str:
        """Formats a command block to indicate that it contains terminal commands.

        Lines that start with `!` have that character replaced by `$ `; all
        other lines are kept unchanged.

        Args:
            commands: The command block to format.

        Returns:
            The reformatted command block.
        """
        return '\n'.join(
            '$ {}'.format(line[1:]) if line.startswith('!') else line
            for line in commands.split('\n'))

    def get_clean_notebook(self):
        """Cleans up formatting when converting notebook content to Markdown.

        Returns:
            The notebook read from `file_path`, with every code cell that
            contains a `!` passed through `format_as_terminal`.
        """
        nb = nbformat.read(self.file_path, as_version=4)
        for cell in nb.cells:
            if cell.cell_type == 'code' and '!' in cell.source:
                cell.source = self.format_as_terminal(cell.source)
        return nb
def main(argv):
    """Publishes the notebook named by `--notebook` as a Markdown page.

    Prints the outcome to STDOUT: success, notebook not found, or no
    notebook parameter given.

    Args:
        argv: Positional command-line arguments supplied by absl; unused.
    """
    del argv  # Unused.

    notebook_path = FLAGS.notebook
    if notebook_path is None:
        print('Could not update Markdown content.'
              ' No notebook parameter was specified.')
        return

    notebook = NotebookFile(notebook_path)
    if not notebook.exists():
        message = ('Could not update Markdown content.'
                   ' Notebook file was not found at "{}"')
        print(message.format(notebook_path))
        return

    notebook.publish_markdown()
    print('Markdown content has been updated!')
# Hand `main` to absl's runner only when this file is executed as a script.
if __name__ == '__main__':
    app.run(main)