notebooks/notebook_template

""" AutoReview: Script to automatically review Vertex AI notebooks for conformance to notebook template requirements: python3 notebook_template_review.py [options] # options for selecting notebooks --notebook: review the specified notebook --notebook-dir: recursively traverse the directory and review each notebook enocuntered --notebook-file: A CSV file with list of notebooks to review. --skip-file: a CSV file with list of notebooks to skip. # options for error handling --errors: Report detected errors. --errors-codes: A list of error codes to report errors. Otherwise, all errors are reported. --errors-csv: Report errors in CSV format # options for automatic fixing --fix: Automatic fix --fix-codes: A list of fix codes to fix. Otherwise, all fix codes are enabled. # index generatation --repo: Generate index in markdown format --web: Generate index in HTML format --title: Add title to index --desc: Add description to index --steps: Add steps to index --uses: Add "resources" used to index --linkback: Add linkback to index Format of CSV file for notebooks to review: tags,notebook-path,backlink tags: Double quoted ist of tags: e.g., "AutoML, Tabular Data" notebook-path: path of notebook, relative to https://github.com/GoogleCloudPlatform/vertex-ai-samples/notebooks backlink: webdoc page with more details, relative to https://cloud.google.com/ """ import argparse import json import os import sys import urllib.request import csv from enum import Enum from abc import ABC, abstractmethod from typing import List parser = argparse.ArgumentParser() parser.add_argument('--notebook-dir', dest='notebook_dir', default=None, type=str, help='Notebook directory') parser.add_argument('--notebook', dest='notebook', default=None, type=str, help='Notebook to review') parser.add_argument('--notebook-file', dest='notebook_file', default=None, type=str, help='File with list of notebooks to review') parser.add_argument('--skip-file', dest='skip_file', default=None, type=str, help='File with 
list of notebooks to skip') parser.add_argument('--errors', dest='errors', action='store_true', default=False, help='Report errors') parser.add_argument('--errors-csv', dest='errors_csv', action='store_true', default=False, help='Report errors as CSV') parser.add_argument('--errors-codes', dest='errors_codes', default=None, type=str, help='Report only specified errors') parser.add_argument('--title', dest='title', action='store_true', default=False, help='Output description') parser.add_argument('--desc', dest='desc', action='store_true', default=False, help='Output description') parser.add_argument('--uses', dest='uses', action='store_true', default=False, help='Output uses (resources)') parser.add_argument('--steps', dest='steps', action='store_true', default=False, help='Ouput steps') parser.add_argument('--linkback', dest='linkback', action='store_true', default=False, help='Ouput linkback') parser.add_argument('--web', dest='web', action='store_true', default=False, help='Output format in HTML') parser.add_argument('--repo', dest='repo', action='store_true', default=False, help='Output format in Markdown') parser.add_argument('--fix', dest='fix', action='store_true', default=False, help='Fix the notebook non-conformance errors') parser.add_argument('--fix-codes', dest='fix_codes', default=None, type=str, help='Fix only specified errors') args = parser.parse_args() if args.errors_codes: args.errors_codes = args.errors_codes.split(',') args.errors = True if args.errors_csv: args.errors = True if args.fix_codes: args.fix_codes = args.fix_codes.split(',') args.fix = True class ErrorCode(Enum): # Copyright cell # Google copyright cell required ERROR_COPYRIGHT = 0, # Links cell # H1 heading required # git, colab and workbench link required # links must be valid links ERROR_TITLE_HEADING = 1, ERROR_HEADING_CASE = 2, ERROR_HEADING_CAP = 3, ERROR_LINK_GIT_MISSING = 4, ERROR_LINK_COLAB_MISSING = 5, ERROR_LINK_WORKBENCH_MISSING = 6, ERROR_LINK_GIT_BAD = 7, 
ERROR_LINK_COLAB_BAD = 8, ERROR_LINK_WORKBENCH_BAD = 9, ERROR_LINK_COLAB_ENTERPRISE_BAD = 102, # Overview cells # Overview cell required # Objective cell required # Dataset cell required # Costs cell required # Check for required Vertex and optional BQ and Dataflow ERROR_OVERVIEW_NOTFOUND = 10, ERROR_LINKBACK_NOTFOUND = 11, ERROR_OBJECTIVE_NOTFOUND = 12, ERROR_OBJECTIVE_MISSING_DESC = 13, ERROR_OBJECTIVE_MISSING_USES = 14, ERROR_OBJECTIVE_MISSING_STEPS = 15, ERROR_DATASET_NOTFOUND = 16, ERROR_COSTS_NOTFOUND = 17, ERROR_COSTS_MISSING = 18, # Installation cell # Installation cell required # Wrong heading for installation cell # Installation code cell not found # pip3 required # option -q required # option {USER_FLAG} required # installation code cell not match template # all packages must be installed as a single pip3 ERROR_INSTALLATION_NOTFOUND = 19, ERROR_INSTALLATION_HEADING = 20, ERROR_INSTALLATION_CODE_NOTFOUND = 21, ERROR_INSTALLATION_PIP3 = 22, ERROR_INSTALLATION_QUIET = 23, ERROR_INSTALLATION_USER_FLAG = 24, ERROR_INSTALLATION_CODE_TEMPLATE = 25, ERROR_INSTALLATION_SINGLE_PIP3 = 26, # Restart kernel cell # Restart code cell required # Restart code cell not found ERROR_RESTART_NOTFOUND = 27, ERROR_RESTART_CODE_NOTFOUND = 28, # Before you begin cell # Before you begin cell required # Before you begin cell incomplete ERROR_BEFOREBEGIN_NOTFOUND = 29, ERROR_BEFOREBEGIN_INCOMPLETE = 30, # Set Project ID # Set project ID cell required # Set project ID code cell not found # Set project ID not match template ERROR_PROJECTID_NOTFOUND = 31, ERROR_PROJECTID_CODE_NOTFOUND = 32, ERROR_PROJECTID_TEMPLATE = 33, # Technical Writer Rules ERROR_TWRULE_TODO = 51, ERROR_TWRULE_FIRSTPERSON = 52, ERROR_TWRULE_FUTURETENSE = 53, ERROR_TWRULE_BRANDING = 54, ERROR_EMPTY_CALL = 101 class FixCode(Enum): FIX_BAD_LINK = 0, FIX_PLACEHOLDER = 1 # globals last_tag = '' skip_list = [] def parse_dir(directory: str) -> int: """ Recursively walk the specified directory, reviewing each notebook 
(.ipynb) encountered. directory: The directory path. Returns the number of errors """ exit_code = 0 sorted_entries = [] entries = os.scandir(directory) for entry in entries: inserted = False for ix in range(len(sorted_entries)): if entry.name < sorted_entries[ix].name: sorted_entries.insert(ix, entry) inserted = True break if not inserted: sorted_entries.append(entry) entries = sorted_entries for entry in entries: if entry.is_dir(): if entry.name[0] == '.': continue if entry.name == 'src' or entry.name == 'images' or entry.name == 'sample_data': continue exit_code += parse_dir(entry.path) elif entry.name.endswith('.ipynb'): if entry.name in skip_list: print(f"Warning: skipping notebook {entry.name}", file=sys.stderr) continue tag = directory.split('/')[-1] if tag == 'automl': tag = 'AutoML' elif tag == 'bigquery_ml': tag = 'BigQuery ML' elif tag == 'custom': tag = 'Vertex AI Training' elif tag == 'experiments': tag = 'Vertex AI Experiments' elif tag == 'explainable_ai': tag = 'Vertex Explainable AI' elif tag == 'feature_store': tag = 'Vertex AI Feature Store' elif tag == 'matching_engine': tag = 'Vertex AI Matching Engine' elif tag == 'migration': tag = 'CAIP to Vertex AI migration' elif tag == 'ml_metadata': tag = 'Vertex ML Metadata' elif tag == 'model_evaluation': tag = 'Vertex AI Model Evaluation' elif tag == 'model_monitoring': tag = 'Vertex AI Model Monitoring' elif tag == 'model_registry': tag = 'Vertex AI Model Registry' elif tag == 'pipelines': tag = 'Vertex AI Pipelines' elif tag == 'prediction': tag = 'Vertex AI Prediction' elif tag == 'pytorch': tag = 'Vertex AI Training' elif tag == 'reduction_server': tag = 'Vertex AI Reduction Server' elif tag == 'sdk': tag = 'Vertex AI SDK' elif tag == 'structured_data': tag = 'AutoML / BQML' elif tag == 'tabnet': tag = 'Vertex AI TabNet' elif tag == 'tabular_workflows': tag = 'AutoML Tabular Workflows' elif tag == 'tensorboard': tag = 'Vertex AI TensorBoard' elif tag == 'training': tag = 'Vertex AI Training' elif 
tag == 'vizier': tag = 'Vertex AI Vizier' # special case if 'workbench' in directory: tag = 'Vertex AI Workbench' exit_code += parse_notebook(entry.path, tags=[tag], linkback=None, rules=rules) return exit_code def parse_notebook(path: str, tags: List, linkback: str, rules: List) -> int: """ Review the specified notebook for conforming to the notebook template and notebook authoring requirements. path: The path to the notebook. tags: The associated tags linkback: A link back to the web docs rules: The cell rules to apply Returns the number of errors """ notebook = Notebook(path) for rule in rules: rule.validate(notebook) # Automatic Index Generation if objective.desc != '': if overview.linkbacks: linkbacks = overview.linkbacks else: if linkback: linkbacks = [linkback] else: linkbacks = [] if overview.tags: tags = overview.tags add_index(path, tags, linkbacks, title.title, objective.desc, objective.uses, objective.steps, links.git_link, links.colab_link, links.colab_enterprise_link, links.workbench_link, ) if args.fix: notebook.writeback() return notebook.num_errors class Notebook(object): ''' Class for navigating through a notebook ''' def __init__(self, path): """ Initializer path: The path to the notebook """ self._path = path with open(self._path, 'r') as f: try: self._content = json.load(f) except: print("Corrupted notebook:", path) return self._cells = self._content['cells'] self._cell_index = 0 self._num_errors = 0 # cross cell information self._costs = [] def get(self) -> list: ''' Get the next cell in the notebook Returns the current cell ''' cell = self._cells[self._cell_index] self._cell_index += 1 return cell def peek(self) -> list: ''' Peek at the next cell in the notebook Returns the current cell ''' cell = self._cells[self._cell_index] return cell def pop(self, n_cells=1): ''' Advance the specified number of cells n_cells: The number of cells to advance ''' self._cell_index += n_cells @property def path(self): ''' Getter: return the filename path for 
the notebook ''' return self._path @property def num_errors(self): ''' Getter: return the number of errors ''' return self._num_errors def report_error(self, code: ErrorCode, errmsg: str): """ Report an error. If args.errors_codes set, then only report these errors. Otherwise, all errors. code: The error code number. errmsg: The error message """ if args.errors: code = code.value[0] if args.errors_codes: if str(code) not in args.errors_codes: return if args.errors_csv: print(self._path, ',', code) else: print(f"{self._path}: ERROR ({code}): {errmsg}", file=sys.stderr) self._num_errors += 1 return False return True def report_fix(self, code: FixCode, fixmsg: str): """ Report an automatic fix code: The fox code number. fixmsg: The autofix message Returns: Whether code is to be fixed """ if args.fix: code = code.value[0] if args.fix_codes: if str(code) not in args.fix_codes: return False print(f"{self._path}: FIXED ({code}): {fixmsg}", file=sys.stderr) return True return False def writeback(self): """ Write back the updated (autofixed) notebook """ with open(self._path, 'w') as f: json.dump(self._content, f) class NotebookRule(ABC): """ Abstract class for defining notebook conformance rules """ @abstractmethod def validate(self, notebook: Notebook) -> bool: ''' Applies cell specific rules to validate whether the cell does or does not conform to the rules. 
Returns whether the cell passed the validation rules ''' pass class CopyrightRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the copyright cell """ cell = notebook.get() if not 'Copyright' in cell['source'][0]: return notebook.report_error(ErrorCode.ERROR_COPYRIGHT, "missing copyright cell") return True class NoticesRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) notices cell """ cell = notebook.peek() if cell['source'][0].startswith('This notebook'): notebook.pop() return True class TitleRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the title in the links cell """ ret = True self.title = '' cell = notebook.peek() if not cell['source'][0].startswith('# '): notebook.report_error(ErrorCode.ERROR_TITLE_HEADING, "title cell must start with H1 heading") if not cell['source'][0].startswith('## '): ret = False else: self.title = cell['source'][0][3:].strip() SentenceCaseTWRule().validate(notebook, [self.title]) # H1 title only if len(cell['source']) == 1: notebook.pop() else: self.title = cell['source'][0][2:].strip() SentenceCaseTWRule().validate(notebook, [self.title]) # H1 title only if len(cell['source']) == 1: notebook.pop() return ret class LinksRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the links in the links cell """ self.git_link = None self.colab_link = None self.colab_enterprise_link = None self.workbench_link = None source = '' ret = True cell = notebook.get() for ix in range(len(cell['source'])): line = cell['source'][ix] source += line if '<a href="https://github.com' in line: self.git_link = line.strip()[9:-2].replace('" target="_blank', '').replace('" target=\'_blank', '') derived_link = os.path.join('https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/', notebook.path) if self.git_link != derived_link: if notebook.report_fix(FixCode.FIX_BAD_LINK, f"fixed GitHub link: {derived_link}"): 
fix_link = f"<a href=\"{derived_link}\" target='_blank'>\n" cell['source'][ix] = fix_link else: ret = notebook.report_error(ErrorCode.ERROR_LINK_GIT_BAD, f"bad GitHub link: {self.git_link}") if '<a href="https://colab.research.google.com/' in line: self.colab_link = 'https://colab.research.google.com/github/' + line.strip()[50:-2].replace('" target="_blank', '').replace('" target=\'_blank', '') derived_link = os.path.join('https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks', notebook.path) if self.colab_link != derived_link: if notebook.report_fix(FixCode.FIX_BAD_LINK, f"fixed Colab link: {derived_link}"): fix_link = f"<a href=\"{derived_link}\" target='_blank'>\n" cell['source'][ix] = fix_link else: ret = notebook.report_error(ErrorCode.ERROR_LINK_COLAB_BAD, f"bad Colab link: {self.colab_link}") if '<a href="https://console.cloud.google.com/vertex-ai/colab/' in line: self.colab_enterprise_link = line.strip()[9:-2].replace('" target="_blank', '').replace('" target=\'_blank', '') modified_notebook_path = notebook.path.replace("/", "%2F") derived_link = os.path.join('https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2F', modified_notebook_path) if self.workbench_link != derived_link: if notebook.report_fix(FixCode.FIX_BAD_LINK, f"fixed Colab Enterprise link: {derived_link}"): fix_link = f"<a href=\"{derived_link}\" target='_blank'>\n" cell['source'][ix] = fix_link else: ret = notebook.report_error(ErrorCode.ERROR_LINK_COLAB_ENTERPRISE_BAD, f"bad Colab Enterprise link: {self.colab_enterprise_link}") if '<a href="https://console.cloud.google.com/vertex-ai/workbench/' in line: self.workbench_link = line.strip()[9:-2].replace('" target="_blank', '').replace('" target=\'_blank', '') derived_link = 
os.path.join('https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/', notebook.path) if self.workbench_link != derived_link: if notebook.report_fix(FixCode.FIX_BAD_LINK, f"fixed Workbench link: {derived_link}"): fix_link = f"<a href=\"{derived_link}\" target='_blank'>\n" cell['source'][ix] = fix_link else: ret = notebook.report_error(ErrorCode.ERROR_LINK_WORKBENCH_BAD, f"bad Workbench link: {self.workbench_link}") if 'View on GitHub' not in source or not self.git_link: ret = notebook.report_error(ErrorCode.ERROR_LINK_GIT_MISSING, 'Missing link for GitHub') if 'Run in Colab' not in source or not self.colab_link: ret = notebook.report_error(ErrorCode.ERROR_LINK_COLAB_MISSING, 'Missing link for Colab') if 'Open in Vertex AI Workbench' not in source or not self.workbench_link: ret = notebook.report_error(ErrorCode.ERROR_LINK_WORKBENCH_MISSING, 'Missing link for Workbench') return ret class TableRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) table of contents cell """ cell = notebook.peek() if cell['source'][0].startswith('## Table of contents'): notebook.pop() return True class TestEnvRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) test in which environment cell """ cell = notebook.peek() if cell['source'][0].startswith('**_NOTE_**: This notebook has been tested'): notebook.pop() return True class OverviewRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the overview cell """ self.linkbacks = [] self.tags = [] cell = notebook.get() if not cell['source'][0].startswith("## Overview"): return notebook.report_error(ErrorCode.ERROR_OVERVIEW_NOTFOUND, "Overview section not found") last_line = cell['source'][-1] if last_line.startswith('Learn more about ['): for more in last_line.split('[')[1:]: tag = more.split(']')[0] linkback = 
more.split('(')[1].split(')')[0] self.tags.append(tag) self.linkbacks.append(linkback) else: return notebook.report_error(ErrorCode.ERROR_LINKBACK_NOTFOUND, "Linkback missing in overview section") return True class ObjectiveRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the objective cell. Find the description, uses and steps. """ self.desc = '' self.uses = '' self.steps = '' self.costs = [] ret = True cell = notebook.get() if not cell['source'][0].startswith("### Objective"): ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_NOTFOUND, "Objective section not found") notebook.costs = [] return ret in_desc = True in_uses = False in_steps = False for line in cell['source'][1:]: # TOC anchor if line.startswith('<a name='): continue if line.startswith('This tutorial uses'): in_desc = False in_steps = False in_uses = True self.uses += line continue elif line.startswith('The steps performed'): in_desc = False in_uses = False in_steps = True self.steps += line continue if in_desc: if len(self.desc) > 0 and line.strip() == '': in_desc = False continue self.desc += line elif in_uses: sline = line.strip() if len(sline) == 0: self.uses += '\n' else: ch = sline[0] if ch in ['-', '*', '1', '2', '3', '4', '5', '6', '7', '8', '9']: self.uses += line elif in_steps: sline = line.strip() if len(sline) == 0: self.steps += '\n' else: ch = sline[0] if ch in ['-', '*', '1', '2', '3', '4', '5', '6', '7', '8', '9']: # check for italic font setting if ch == '*' and sline[1] != ' ': in_steps = False # special case elif sline.startswith('* Prediction Service'): in_steps = False else: self.steps += line elif ch == '#': in_steps = False if self.desc == '': ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_DESC, "Objective section missing desc") else: self.desc = self.desc.lstrip() bracket = False paren = False sentences = "" for _ in range(len(self.desc)): if self.desc[_] == '[': bracket = True continue elif self.desc[_] == ']': bracket = False 
continue elif self.desc[_] == '(': paren = True elif self.desc[_] == ')': paren = False continue if not paren: sentences += self.desc[_] sentences = sentences.split('.') if len(sentences) > 1: self.desc = sentences[0] + '.\n' if self.desc.startswith('In this tutorial, you learn') or self.desc.startswith('In this notebook, you learn'): self.desc = self.desc[22].upper() + self.desc[23:] if self.uses == '': ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_USES, "Objective section missing uses services list") else: if 'BigQuery' in self.uses: self.costs.append('BQ') if 'Vertex' in self.uses: self.costs.append('Vertex') if 'Dataflow' in self.uses: self.costs.append('Dataflow') if self.steps == '': ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_STEPS, "Objective section missing steps list") notebook.costs = self.costs return ret class RecommendationsRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) recommendations cell """ # (optional) Recommendation cell = notebook.peek() if cell['source'][0].startswith("### Recommendations"): notebook.pop() return True class DatasetRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the dataset cell """ cell = notebook.get() if not cell['source'][0].startswith("### Dataset") and not cell['source'][0].startswith("### Model") and not cell['source'][0].startswith("### Embedding"): return notebook.report_error(ErrorCode.ERROR_DATASET_NOTFOUND, "Dataset/Model section not found") return True class CostsRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the costs cell """ ret = True cell = notebook.get() if not cell['source'][0].startswith("### Costs"): ret = notebook.report_error(ErrorCode.ERROR_COSTS_NOTFOUND, "Costs section not found") else: text = '' for line in cell['source']: text += line if 'BQ' in notebook.costs and 'BigQuery' not in text: ret = notebook.report_error(ErrorCode.ERROR_COSTS_MISSING, 'Costs section 
missing reference to BiqQuery') if 'Vertex' in notebook.costs and 'Vertex' not in text: ret = notebook.report_error(ErrorCode.ERROR_COSTS_MISSING, 'Costs section missing reference to Vertex') if 'Dataflow' in notebook.costs and 'Dataflow' not in text: ret = notebook.report_error(ErrorCode.ERROR_COSTS_MISSING, 'Costs section missing reference to Dataflow') return ret class SetupLocalRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) setup local environment cell """ cell = notebook.peek() if cell['source'][0].startswith('## Before you begin'): notebook.pop() cell = notebook.peek() if not cell['source'][0].startswith('### Set up your local development environment'): return True notebook.pop() cell = notebook.peek() if cell['source'][0].startswith('**Otherwise**, make sure your environment meets'): notebook.pop() return True class HelpersRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) helpers text/code cell """ cell = notebook.peek() if 'helper' in cell['source'][0]: notebook.pop(2) # text and cell return True class InstallationRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the installation cells """ ret = True cell = notebook.get() if 'Install' not in cell['source'][0]: return notebook.report_error(ErrorCode.ERROR_INSTALLATION_NOTFOUND, "Installation section not found") if not cell['source'][0].startswith("## Install"): ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_HEADING, "Installation section needs to be H2 heading") cell = notebook.get() if cell['cell_type'] != 'code': ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_NOTFOUND, "Installation section not found") else: if cell['source'][0].startswith('! 
mkdir'): cell = notebook.get() if 'requirements.txt' in cell['source'][0]: cell = notebook.get() text = '' for line in cell['source']: text += line if 'pip ' in line: if 'pip3' not in line: notebook.report_error(ErrorCode.ERROR_INSTALLATION_PIP3, "Installation code section: use pip3") if line.endswith('\\\n'): continue if '-q' not in line and '--quiet' not in line : notebook.report_error(ErrorCode.ERROR_INSTALLATION_QUIET, "Installation code section: use -q with pip3") if 'USER_FLAG' not in line and 'sh(' not in line: notebook.report_error(ErrorCode.ERROR_INSTALLATION_USER_FLAG, "Installation code section: use {USER_FLAG} with pip3") if 'required_packages <' in text: pass # R kernel elif 'if IS_WORKBENCH_NOTEBOOK:' not in text: ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_CODE_TEMPLATE, "Installation code section out of date (see template)") return ret class RestartRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the restart cells """ ret = True while True: cont = False cell = notebook.peek() for line in cell['source']: if 'pip' in line: ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_SINGLE_PIP3, f"All pip installations must be in a single code cell: {line}") cont = True break if not cont: break notebook.pop() cell = notebook.peek() if not cell['source'][0].startswith("### Restart the kernel"): ret = notebook.report_error(ErrorCode.ERROR_RESTART_NOTFOUND, "Restart the kernel section not found") else: notebook.pop() cell = notebook.get() # code cell if cell['cell_type'] != 'code': ret = notebook.report_error(ErrorCode.ERROR_RESTART_CODE_NOTFOUND, "Restart the kernel code section not found") return ret class VersionsRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) package versions code/text cell """ cell = notebook.peek() if cell['source'][0].startswith('#### Check package versions'): notebook.pop(2) # text and code return True class BeforeBeginRule(NotebookRule): def 
validate(self, notebook: Notebook) -> bool: """ Parse the before you begin cell """ ret = True cell = notebook.get() if not cell['source'][0].startswith("## Before you begin"): ret = notebook.report_error(ErrorCode.ERROR_BEFOREBEGIN_NOTFOUND, "Before you begin section not found") else: # is two cells instead of one if len(cell['source']) < 2: cell = notebook.get() if not cell['source'][0].startswith("### Set up your Google Cloud project"): ret = notebook.report_error(ErrorCode.ERROR_BEFOREBEGIN_INCOMPLETE, "Before you begin section incomplete") return ret class EnableAPIsRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the (optional) enable apis code/text cell """ cell = notebook.peek() if cell['source'][0].startswith("### Enable APIs"): notebook.pop(2) # text and code return True class SetupProjectRule(NotebookRule): def validate(self, notebook: Notebook) -> bool: """ Parse the set project cells """ ret = True cell = notebook.get() if not cell['source'][0].startswith('#### Set your project ID'): ret = notebook.report_error(ErrorCode.ERROR_PROJECTID_NOTFOUND, "Set project ID section not found") else: cell = notebook.get() if cell['cell_type'] != 'code': ret = notebook.report_error(ErrorCode.ERROR_PROJECTID_CODE_NOTFOUND, "Set project ID code section not found") elif not cell['source'][0].startswith('PROJECT_ID = "[your-project-id]"'): ret = notebook.report_error(ErrorCode.ERROR_PROJECTID_TEMPLATE, "Set project ID not match template") cell = notebook.get() if cell['cell_type'] != 'code' or 'or PROJECT_ID == "[your-project-id]":' not in cell['source'][0]: ret = notebook.report_error(ErrorCode.ERROR_PROJECTID_TEMPLATE, "Set project ID not match template") cell = notebook.get() if cell['cell_type'] != 'code' or '! 
gcloud config set project' not in cell['source'][0]: ret = notebook.report_error(ErrorCode.ERROR_PROJECTID_TEMPLATE, "Set project ID not match template") return ret class TextRule(ABC): """ Abstract class for defining text writing conformance rules """ @abstractmethod def validate(self, notebook: Notebook, text: List[str]) -> bool: ''' Applies text writing specific rules to validate whether the text does or does not conform to the rules. Returns whether the test passed the validation rules ''' return False class BrandingRule(TextRule): def validate(self, notebook: Notebook, text: List[str]) -> bool: """ Check the text for branding issues 1. Product branding names 2. No future tense 3. No 1st person """ ret = True branding = { 'Vertex SDK': 'Vertex AI SDK', 'Vertex Training': 'Vertex AI Training', 'Vertex Prediction': 'Vertex AI Prediction', 'Vertex Batch Prediction': 'Vertex AI batch prediction', 'Vertex XAI': 'Vertex Explainable AI', 'Vertex Explainability': 'Vertex Explainable AI', 'Vertex AI Explainability': 'Vertex Explainable AI', 'Vertex Pipelines': 'Vertex AI Pipelines', 'Vertex Experiments': 'Vertex AI Experiments', 'Vertex TensorBoard': 'Vertex AI TensorBoard', 'Vertex Hyperparameter Tuning': 'Vertex AI hyperparameter tuning', 'Vertex Metadata': 'Vertex ML Metadata', 'Vertex AI Metadata': 'Vertex ML Metadata', 'Vertex AI ML Metadata': 'Vertex ML Metadata', 'Vertex Vizier': 'Vertex AI Vizier', 'Vertex Feature Store': 'Vertex AI Feature Store', 'Vertex Forecasting': 'Vertex AI forecasting', 'Vertex Vector Search': 'Vertex AI Vector Search', 'Vertex Dataset': 'Vertex AI dataset', 'Vertex Model': 'Vertex AI model', 'Vertex Endpoint': 'Vertex AI endpoint', 'Vertex Private Endpoint': 'Vertex AI private endpoint', 'Automl': 'AutoML', 'AutoML Image': 'AutoML Vision', 'AutoML Language': 'AutoML Natural Language', 'Tensorflow': 'TensorFlow', 'Tensorboard': 'Vertex AI TensorBoard', 'Google Cloud Notebooks': 'Vertex AI Workbench Notebooks', 'BQ ': 'BigQuery', 'BQ.': 
'BigQuery', 'Bigquery': 'BigQuery', 'Big Query': 'BigQuery', 'BQML': 'BigQuery ML', 'GCS ': 'Cloud Storage', 'GCS.': 'Cloud Storage', 'Google Cloud Storage': 'Cloud Storage', 'Pytorch': 'PyTorch', 'Sklearn': 'scikit-learn', 'sklearn': 'scikit-learn' } for line in text: for mistake, brand in branding.items(): if mistake in line: ret = notebook.report_error(ErrorCode.ERROR_TWRULE_BRANDING, f"Branding {mistake} -> {brand}: {line}") return ret class SentenceCaseTWRule(TextRule): def validate(self, notebook, text: List[str]) -> bool: """ Check that headings are in sentence case path: used only for reporting an error text: the heading to check """ ret = True ACRONYMS = ['E2E', 'Vertex', 'AutoML', 'ML', 'AI', 'GCP', 'API', 'R', 'CMEK', 'TF', 'TFX', 'TFDV', 'SDK', 'VM', 'CPR', 'NVIDIA', 'ID', 'DASK', 'ARIMA_PLUS', 'KFP', 'I/O', 'GPU', 'Google', 'TensorFlow', 'PyTorch' ] # Check the first line words = text[0].replace('#', '').split(' ') if not words[0][0].isupper(): ret = notebook.report_error(ErrorCode.ERROR_HEADING_CAP, f"heading must start with capitalized word: {words[0]}") for word in words[1:]: word = word.replace(':', '').replace('(', '').replace(')', '') if word in ACRONYMS: continue if word.isupper(): ret = notebook.report_error(ErrorCode.ERROR_HEADING_CASE, f"heading is not sentence case: {word}") return ret class TextTWRule(TextRule): def validate(self, notebook: Notebook, text: List[str]) -> bool: """ Check for conformance to the following techwriter rules 1. No future tense 2. 
No 1st person """ ret = True for line in text: # HTML code if '<a ' in line: continue if 'TODO' in line or 'WIP' in line: ret = notebook.report_error(ErrorCode.ERROR_TWRULE_TODO, f'TODO in cell: {line}') if 'we ' in line.lower() or "let's" in line.lower() in line.lower(): ret = notebook.report_error(ErrorCode.ERROR_TWRULE_FIRSTPERSON, f'Do not use first person (e.g., we), replace with 2nd person (you): {line}') if 'will' in line.lower() or 'would' in line.lower(): ret = notebook.report_error(ErrorCode.ERROR_TWRULE_FUTURETENSE, f'Do not use future tense (e.g., will), replace with present tense: {line}') return ret def add_index(path: str, tags: List, linkbacks: List, title : str, desc: str, uses: str, steps: str, git_link: str, colab_link: str, colab_enterprise_link: str, workbench_link: str ): """ Add a discoverability index for this notebook path: The path to the notebook tags: The tags (if any) for the notebook title: The H1 title for the notebook desc: The notebook description uses: The resources/services used by the notebook steps: The steps specified by the notebook git_link: The link to the notebook in the git repo colab_link: Link to launch notebook in Colab colab_enterpise_link: Link to launch notebook in Colab Enterprise workbench_link: Link to launch notebook in Workbench linkbacks: The linkbacks per tag """ global last_tag if not args.web and not args.repo: return title = title.split(':')[-1].strip() title = title[0].upper() + title[1:] if args.web: title = replace_cl(replace_backtick(title)) print(' <tr>') print(' <td>') for tag in tags: tag = replace_cl(tag) print(f' {tag.strip()}<br/>\n') print(' </td>') print(' <td>') print(f' <b>{title}</b>. 
') if args.desc: desc = replace_cl(replace_backtick(desc)) print('<br/>') print(f' {desc}\n') if args.linkback and linkbacks: num = len(tags) for _ in range(num): if linkbacks[_].startswith("vertex-ai"): print(f' Learn more about <a href="https://cloud.google.com/{linkbacks[_]}" target="_blank">{replace_cl(tags[_])}</a>.\n') else: print(f' Learn more about <a href="{linkbacks[_]}" target="_blank">{replace_cl(tags[_])}</a>.\n') if args.steps: print("<devsite-expandable>\n") print(' <p class="showalways">Tutorial steps</p>\n') print(' <ul>\n') if ":" in steps: steps = replace_backtick(steps) steps = steps.split(':')[1].replace('*', '').replace('-', '').strip().split('\n') else: steps = [] for step in steps: print(f' <li>{replace_cl(step)}</li>\n') print(' </ul>\n') print("</devsite-expandable>\n") print(' </td>') print(' <td>') if colab_link: print(f' <a href="{colab_link}" target="_blank" track-type="notebookTutorial" track-name="colabLink">Colab</a><br/>\n') if colab_enterprise_link: print(f' <a href="{colab_enterprise_link}" target="_blank" track-type="notebookTutorial" track-name="colabEnterpriseLink">Colab Enterprise</a><br/>\n') if git_link: print(f' <a href="{git_link}" target="_blank" track-type="notebookTutorial" track-name="gitHubLink">GitHub</a><br/>\n') if workbench_link: print(f' <a href="{workbench_link}" target="_blank" track-type="notebookTutorial" track-name="workbenchLink">Vertex AI Workbench</a><br/>\n') print(' </td>') print(' </tr>\n') elif args.repo: try: if tags != last_tag and tag != '': last_tag = tags flat_list = '' for item in tags: flat_list += item.replace("'", '') + ' ' print(f"\n### {flat_list}\n") except: pass print(f"\n[{title}]({git_link})\n") print("```") if args.desc: print(desc) if args.uses: print(uses) if args.steps: print(steps.rstrip() + '\n') print("```\n") if args.linkback and linkbacks: num = len(tags) for _ in range(num): if linkbacks[_].startswith("vertex-ai"): print(f'   Learn more about [{tags[_]}]({linkbacks[_]}).\n') 
# Product names mapped to the CL substitution variables that replace them.
# The order of entries is irrelevant: replace_cl() always prefers the longest
# product name that matches at a given position.
_CL_SUBSTITUTIONS = {
    # Entries that were already disabled in the table; kept for reference.
    # 'AutoML Tabular Workflow': '{{automl_name}} tabular workflow',
    # 'AutoML Tables': '{{automl_tables_name}}',
    # 'AutoML Tabular': '{{automl_tables_name}}',
    # 'AutoML Vision': '{{automl_vision_name}}',
    # 'AutoML Image': '{{automl_vision_name}}',
    'AutoML': '{{automl_name}}',
    'BigQuery ML': '{{bigqueryml_name}}',
    'BQML': '{{bigqueryml_name}}',
    'BigQuery': '{{bigquery_name}}',
    'BQ': '{{bigquery_name}}',
    'Vertex Dataset': '{{vertex_ai_name}} dataset',
    'Vertex Model': '{{vertex_ai_name}} model',
    'Vertex Endpoint': '{{vertex_ai_name}} endpoint',
    'Vertex Model Registry': '{{vertex_model_registry_name}}',
    'model registry': '{{vertex_model_registry_name_short}}',
    'Model Registry': '{{vertex_model_registry_name_short}}',
    'Vertex AI Model Registry': '{{vertex_model_registry_name}}',
    'Vertex Training': '{{vertex_training_name}}',
    'Vertex AI Training': '{{vertex_training_name}}',
    'Vertex Prediction': '{{vertex_prediction_name}}',
    'Vertex AI Prediction': '{{vertex_prediction_name}}',
    'Vertex TensorBoard': '{{vertex_tensorboard_name}}',
    'Vertex AI TensorBoard': '{{vertex_tensorboard_name}}',
    'TensorBoard': '{{vertex_tensorboard_name}}',
    'Tensorboard': '{{vertex_tensorboard_name}}',
    'Vertex ML Metadata': '{{vertex_metadata_name}}',
    'Vertex Pipelines': '{{vertex_pipelines_name}}',
    'Vertex AI Pipelines': '{{vertex_pipelines_name}}',
    'Vertex AI Data Labeling': '{{vertex_data_labeling_name}}',
    'Vertex AI Experiments': '{{vertex_experiments_name}}',
    'Vertex Experiments': '{{vertex_experiments_name}}',
    'Vertex AI Matching Engine': '{{vector_search_name}}',
    'Vertex Matching Engine': '{{vector_search_name}}',
    'Vertex Vector Search': '{{vector_search_name}}',
    'Vector Search': '{{vector_search_name}}',
    'Vertex AI Vector Search': '{{vector_search_name}}',
    'Vertex Model Monitoring': '{{vertex_model_monitoring_name}}',
    'Model Monitoring': '{{vertex_model_monitoring_name_short}}',
    'Vertex AI Model Monitoring': '{{vertex_model_monitoring_name}}',
    'Vertex Feature Store': '{{vertex_featurestore_name}}',
    'Vertex AI Feature Store': '{{vertex_featurestore_name}}',
    'Feature Store': '{{vertex_featurestore_name}}',
    'Vertex Vizier': '{{vertex_vizier_name}}',
    'Vertex AI Vizier': '{{vertex_vizier_name}}',
    'Vizier': '{{vertex_vizier_name}}',
    # NOTE(review): the "short" variable is attached to the Vertex-prefixed
    # name and the long one to the bare name -- looks swapped; confirm.
    'Vertex Explainable AI': '{{xai_name_short}}',
    'Explainable AI': '{{vertex_xai_name}}',
    'NAS': '{{vertex_nas_name_short}}',
    'Vertex AI Neural Architectural Search': '{{vertex_nas_name}}',
    'Neural Architectural Search': '{{vertex_nas_name_short}}',
    'Vertex Workbench': '{{vertex_workbench_name}}',
    'Vertex AI Workbench': '{{vertex_workbench_name}}',
    # 'Vertex SDK': '{{vertex_sdk_name}}',
    # 'Vertex AI SDK': '{{vertex_sdk_name}}',
    'Vertex AI SDK for Python': '{{vertex_sdk_python}}',
    'Vertex AI batch prediction': '{{vertex_ai_name}} {{batch_prediction_name}}',
    'Vertex AI': '{{vertex_ai_name}}',
    'Ray on Vertex AI': '{{ray_vertex_ai_name}}',
    'Google Cloud console': '{{console_name}}',
    'Cloud Storage': '{{storage_name}}',
    'GCS': '{{storage_name}}',
    'GCP': '{{gcp_name}}',
    'TensorFlow Enterprise': '{{tf4gcp_name}}',
    'TensorFlow': '{{tensorflow_name}}',
}


def replace_cl(text: str) -> str:
    """Replace product names in ``text`` with CL substitution variables.

    The text is scanned once, left to right.  At every position the longest
    product name from the substitution table wins, so a short name such as
    ``'Vertex Model'`` can no longer consume the beginning of a longer one
    such as ``'Vertex Model Registry'``.  Text produced by a substitution is
    never scanned again.

    Args:
        text: The text to rewrite.

    Returns:
        ``text`` with every known product name replaced by its variable.
        Matching is case-sensitive and is not restricted to word boundaries.
    """
    # Local import: `re` is not among the imports at the top of this file.
    import re

    # Longest names first: regex alternation takes the first alternative that
    # matches at a position, so this ordering yields longest-match semantics.
    names = sorted(_CL_SUBSTITUTIONS, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(name) for name in names))
    return pattern.sub(lambda match: _CL_SUBSTITUTIONS[match.group(0)], text)


def replace_backtick(text: str) -> str:
    """Convert Markdown backticks in ``text`` to HTML ``<code>`` tags.

    Backticks are replaced alternately: the 1st, 3rd, ... become ``<code>``
    and the 2nd, 4th, ... become ``</code>``.  An unpaired trailing backtick
    therefore yields an unclosed ``<code>`` tag.

    Args:
        text: The text to convert.

    Returns:
        The text with each backtick replaced by the matching tag.
    """
    pieces = []
    for index, segment in enumerate(text.split('`')):
        if index:
            # An odd-numbered backtick opens a code span, an even one closes it.
            pieces.append('<code>' if index % 2 else '</code>')
        pieces.append(segment)
    return ''.join(pieces)
# Instantiate the rules
# NOTE: `copyright` shadows the interpreter's builtin of the same name; the
# name is kept because code outside this section may refer to it.
copyright = CopyrightRule()
notices = NoticesRule()
title = TitleRule()
links = LinksRule()
testenv = TestEnvRule()
table = TableRule()
overview = OverviewRule()
objective = ObjectiveRule()
recommendations = RecommendationsRule()
dataset = DatasetRule()
costs = CostsRule()
setuplocal = SetupLocalRule()
helpers = HelpersRule()
installation = InstallationRule()
restart = RestartRule()
versions = VersionsRule()
beforebegin = BeforeBeginRule()
enableapis = EnableAPIsRule()
setupproject = SetupProjectRule()

# Cell Validation
rules = [
    copyright, notices, title, links, testenv, table, overview, objective,
    recommendations, dataset, costs, setuplocal, helpers, installation,
    restart, versions, beforebegin, enableapis, setupproject,
]

# Opening part of the HTML index table; the matching closing tags are printed
# after the notebooks have been processed.
if args.web:
    print('<style>')
    print('table, th, td {')
    print(' border: 1px solid black;')
    print(' padding-left:10px')
    print('}')
    print('</style>')
    print('<table>')
    print(' <thead>')
    print(' <tr>')
    print(' <th width="180px">Services</th>')
    print(' <th>Description</th>')
    print(' <th width="80px">Open in</th>')
    print(' </tr>')
    print(' </thead>')
    print(' <tbody class="list">')

# Load the notebooks to skip: the first column of every non-empty CSV row.
if args.skip_file:
    if not os.path.isfile(args.skip_file):
        print(f"Error: file does not exist: {args.skip_file}", file=sys.stderr)
        sys.exit(1)
    # newline='' lets the csv module handle line endings itself.
    with open(args.skip_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if row:
                notebook = row[0]
                skip_list.append(notebook)
                print(f"Skip entry {notebook}", file=sys.stderr)

# Exactly one notebook source is used, in this order of precedence:
# --notebook-dir, --notebook, --notebook-file.
if args.notebook_dir:
    if not os.path.isdir(args.notebook_dir):
        print(f"Error: not a directory: {args.notebook_dir}", file=sys.stderr)
        sys.exit(1)
    exit_code = parse_dir(args.notebook_dir)
elif args.notebook:
    if not os.path.isfile(args.notebook):
        print(f"Error: not a notebook: {args.notebook}", file=sys.stderr)
        sys.exit(1)
    exit_code = parse_notebook(args.notebook, tags=[], linkback=None, rules=rules)
elif args.notebook_file:
    if not os.path.isfile(args.notebook_file):
        print(f"Error: file does not exist {args.notebook_file}", file=sys.stderr)
        # Exit here like the other branches do; falling through would reach
        # the final exit with `exit_code` never assigned.
        sys.exit(1)
    exit_code = 0
    with open(args.notebook_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # the first row is the heading
        for row in reader:
            if not row:
                # Blank line in the CSV file: nothing to review.
                continue
            tags = row[0].split(',')
            notebook = row[1]
            # The third column (linkback) is optional.
            linkback = row[2] if len(row) > 2 else None
            exit_code += parse_notebook(notebook, tags=tags, linkback=linkback, rules=rules)
else:
    print("Error: must specify a directory or notebook", file=sys.stderr)
    sys.exit(1)

if args.web:
    print(' </tbody>\n')
    print('</table>\n')

sys.exit(exit_code)

notebooks/notebook_template_review.py (1,000 lines of code) (raw):