"""
    AutoReview: Script to automatically review Vertex AI notebooks for conformance to notebook template requirements:
    
    python3 notebook_template_review.py [options]
        # options for selecting notebooks
        --notebook: review the specified notebook
        --notebook-dir: recursively traverse the directory and review each notebook encountered
        --notebook-file: A CSV file with list of notebooks to review.
        --skip-file: a CSV file with list of notebooks to skip.
        
        # options for error handling
        --errors: Report detected errors.
        --errors-codes: A list of error codes to report errors. Otherwise, all errors are reported.
        --errors-csv: Report errors in CSV format
        
        # options for automatic fixing
        --fix: Automatic fix
        --fix-codes: A list of fix codes to fix. Otherwise, all fix codes are enabled.
        
        # index generation
        --repo: Generate index in markdown format
        --web: Generate index in HTML format
        --title: Add title to index
        --desc: Add description to index
        --steps: Add steps to index
        --uses: Add "resources" used to index
        --linkback: Add linkback to index
        
    Format of CSV file for notebooks to review:
    
        tags,notebook-path,backlink
        
        tags: Double-quoted list of tags, e.g., "AutoML, Tabular Data"
        notebook-path: path of notebook, relative to https://github.com/GoogleCloudPlatform/vertex-ai-samples/notebooks
        backlink: webdoc page with more details, relative to https://cloud.google.com/
"""

import argparse
import json
import os
import sys
import urllib.request
import csv
from enum import Enum
from abc import ABC, abstractmethod
from typing import List


# Command-line interface.
#
# The options fall into four groups: notebook selection, error reporting,
# index generation and automatic fixing. Parsing happens at import time and
# the result is stored in the module-level `args`, which the rest of the
# script reads directly.
parser = argparse.ArgumentParser()

# options for selecting notebooks
parser.add_argument('--notebook-dir', dest='notebook_dir',
                    default=None, type=str, help='Notebook directory')
parser.add_argument('--notebook', dest='notebook',
                    default=None, type=str, help='Notebook to review')
parser.add_argument('--notebook-file', dest='notebook_file',
                    default=None, type=str, help='File with list of notebooks to review')
parser.add_argument('--skip-file', dest='skip_file',
                    default=None, type=str, help='File with list of notebooks to skip')

# options for error handling
parser.add_argument('--errors', dest='errors', action='store_true',
                    default=False, help='Report errors')
parser.add_argument('--errors-csv', dest='errors_csv', action='store_true',
                    default=False, help='Report errors as CSV')
parser.add_argument('--errors-codes', dest='errors_codes',
                    default=None, type=str, help='Report only specified errors')

# options for index generation
parser.add_argument('--title', dest='title', action='store_true',
                    default=False, help='Output title')
parser.add_argument('--desc', dest='desc', action='store_true',
                    default=False, help='Output description')
parser.add_argument('--uses', dest='uses', action='store_true',
                    default=False, help='Output uses (resources)')
parser.add_argument('--steps', dest='steps', action='store_true',
                    default=False, help='Output steps')
parser.add_argument('--linkback', dest='linkback', action='store_true',
                    default=False, help='Output linkback')
parser.add_argument('--web', dest='web', action='store_true',
                    default=False, help='Output format in HTML')
parser.add_argument('--repo', dest='repo', action='store_true',
                    default=False, help='Output format in Markdown')

# options for automatic fixing
parser.add_argument('--fix', dest='fix', action='store_true',
                    default=False, help='Fix the notebook non-conformance errors')
parser.add_argument('--fix-codes', dest='fix_codes',
                    default=None, type=str, help='Fix only specified errors')
args = parser.parse_args()

# Naming specific error codes implies --errors. The comma separated value is
# turned into a list of strings; surrounding whitespace is dropped so that
# "1, 2" selects the same codes as "1,2".
if args.errors_codes:
    args.errors_codes = [code.strip() for code in args.errors_codes.split(',')]
    args.errors = True

# CSV output is a flavour of error reporting, so it implies --errors as well.
if args.errors_csv:
    args.errors = True

# Naming specific fix codes implies --fix.
if args.fix_codes:
    args.fix_codes = [code.strip() for code in args.fix_codes.split(',')]
    args.fix = True


class ErrorCode(Enum):
    """
    Error codes for notebook template conformance checks.

    Every member's value is a one-element tuple holding the numeric code;
    Notebook.report_error() reads the number with `code.value[0]`. All
    members must therefore be tuples: a bare integer value would make that
    lookup raise TypeError.
    """
    # Copyright cell
    #   Google copyright cell required
    ERROR_COPYRIGHT = (0,)

    # Links cell
    #   H1 heading required
    #   git, colab and workbench link required
    #   links must be valid links
    ERROR_TITLE_HEADING = (1,)
    ERROR_HEADING_CASE = (2,)
    ERROR_HEADING_CAP = (3,)
    ERROR_LINK_GIT_MISSING = (4,)
    ERROR_LINK_COLAB_MISSING = (5,)
    ERROR_LINK_WORKBENCH_MISSING = (6,)
    ERROR_LINK_GIT_BAD = (7,)
    ERROR_LINK_COLAB_BAD = (8,)
    ERROR_LINK_WORKBENCH_BAD = (9,)
    ERROR_LINK_COLAB_ENTERPRISE_BAD = (102,)

    # Overview cells
    #   Overview cell required
    #   Objective cell required
    #   Dataset cell required
    #   Costs cell required
    #     Check for required Vertex and optional BQ and Dataflow
    ERROR_OVERVIEW_NOTFOUND = (10,)
    ERROR_LINKBACK_NOTFOUND = (11,)
    ERROR_OBJECTIVE_NOTFOUND = (12,)
    ERROR_OBJECTIVE_MISSING_DESC = (13,)
    ERROR_OBJECTIVE_MISSING_USES = (14,)
    ERROR_OBJECTIVE_MISSING_STEPS = (15,)
    ERROR_DATASET_NOTFOUND = (16,)
    ERROR_COSTS_NOTFOUND = (17,)
    ERROR_COSTS_MISSING = (18,)

    # Installation cell
    #   Installation cell required
    #   Wrong heading for installation cell
    #   Installation code cell not found
    #   pip3 required
    #   option -q required
    #   option {USER_FLAG} required
    #   installation code cell not match template
    #   all packages must be installed as a single pip3
    ERROR_INSTALLATION_NOTFOUND = (19,)
    ERROR_INSTALLATION_HEADING = (20,)
    ERROR_INSTALLATION_CODE_NOTFOUND = (21,)
    ERROR_INSTALLATION_PIP3 = (22,)
    ERROR_INSTALLATION_QUIET = (23,)
    ERROR_INSTALLATION_USER_FLAG = (24,)
    ERROR_INSTALLATION_CODE_TEMPLATE = (25,)
    ERROR_INSTALLATION_SINGLE_PIP3 = (26,)

    # Restart kernel cell
    #    Restart code cell required
    #    Restart code cell not found
    ERROR_RESTART_NOTFOUND = (27,)
    ERROR_RESTART_CODE_NOTFOUND = (28,)

    # Before you begin cell
    #    Before you begin cell required
    #    Before you begin cell incomplete
    ERROR_BEFOREBEGIN_NOTFOUND = (29,)
    ERROR_BEFOREBEGIN_INCOMPLETE = (30,)

    # Set Project ID
    #    Set project ID cell required
    #    Set project ID code cell not found
    #    Set project ID not match template
    ERROR_PROJECTID_NOTFOUND = (31,)
    ERROR_PROJECTID_CODE_NOTFOUND = (32,)
    ERROR_PROJECTID_TEMPLATE = (33,)

    # Technical Writer Rules
    ERROR_TWRULE_TODO = (51,)
    ERROR_TWRULE_FIRSTPERSON = (52,)
    ERROR_TWRULE_FUTURETENSE = (53,)
    ERROR_TWRULE_BRANDING = (54,)

    # Previously a bare 101 (missing trailing comma), which broke the
    # `value[0]` lookup performed when this error is reported.
    ERROR_EMPTY_CALL = (101,)

class FixCode(Enum):
    """
    Codes for the automatic fixes the script can apply.

    Every member's value is a one-element tuple holding the numeric code;
    Notebook.report_fix() reads the number with `code.value[0]`, so a bare
    integer value would make that lookup raise TypeError.
    """
    FIX_BAD_LINK = (0,)
    # Previously a bare 1 (missing trailing comma).
    FIX_PLACEHOLDER = (1,)


# globals
# last_tag: not read or written by parse_dir, parse_notebook, Notebook or the
#   rule classes nearby; presumably tracks the most recently emitted index
#   tag -- TODO confirm against the index generation code.
# skip_list: notebook file names that parse_dir() skips (with a warning on
#   stderr) instead of reviewing.
last_tag = ''
skip_list = []


# Index tag used for notebooks found in a directory with the given name.
# Directory names that are not listed are used as the tag unchanged.
_DIRECTORY_TAGS = {
    'automl': 'AutoML',
    'bigquery_ml': 'BigQuery ML',
    'custom': 'Vertex AI Training',
    'experiments': 'Vertex AI Experiments',
    'explainable_ai': 'Vertex Explainable AI',
    'feature_store': 'Vertex AI Feature Store',
    'matching_engine': 'Vertex AI Matching Engine',
    'migration': 'CAIP to Vertex AI migration',
    'ml_metadata': 'Vertex ML Metadata',
    'model_evaluation': 'Vertex AI Model Evaluation',
    'model_monitoring': 'Vertex AI Model Monitoring',
    'model_registry': 'Vertex AI Model Registry',
    'pipelines': 'Vertex AI Pipelines',
    'prediction': 'Vertex AI Prediction',
    'pytorch': 'Vertex AI Training',
    'reduction_server': 'Vertex AI Reduction Server',
    'sdk': 'Vertex AI SDK',
    'structured_data': 'AutoML / BQML',
    'tabnet': 'Vertex AI TabNet',
    'tabular_workflows': 'AutoML Tabular Workflows',
    'tensorboard': 'Vertex AI TensorBoard',
    'training': 'Vertex AI Training',
    'vizier': 'Vertex AI Vizier',
}

# Sub-directories that are never descended into (hidden directories, whose
# name starts with '.', are skipped as well).
_SKIPPED_DIRECTORIES = ('src', 'images', 'sample_data')


def parse_dir(directory: str) -> int:
    """
        Recursively walk the specified directory, reviewing each notebook (.ipynb) encountered.

        Entries are visited in name order. Hidden directories and the
        directories named in _SKIPPED_DIRECTORIES are not descended into.
        Notebooks whose file name is in the module-level skip_list are
        skipped with a warning on stderr. Every other notebook is handed to
        parse_notebook() with a single tag derived from the directory name
        (see _DIRECTORY_TAGS), or 'Vertex AI Workbench' when the directory
        path contains 'workbench'.

            directory: The directory path.

        Returns the number of errors
    """
    exit_code = 0

    # The context manager releases the directory handle as soon as the
    # listing has been read; sorting by name gives a deterministic order.
    with os.scandir(directory) as scan:
        entries = sorted(scan, key=lambda entry: entry.name)

    # The tag depends only on the directory, so work it out once. normpath
    # drops a trailing separator, which would otherwise leave an empty name.
    if 'workbench' in directory:
        # special case
        tag = 'Vertex AI Workbench'
    else:
        dirname = os.path.basename(os.path.normpath(directory))
        tag = _DIRECTORY_TAGS.get(dirname, dirname)

    for entry in entries:
        if entry.is_dir():
            if entry.name.startswith('.') or entry.name in _SKIPPED_DIRECTORIES:
                continue
            exit_code += parse_dir(entry.path)
        elif entry.name.endswith('.ipynb'):
            if entry.name in skip_list:
                print(f"Warning: skipping notebook {entry.name}", file=sys.stderr)
                continue
            exit_code += parse_notebook(entry.path, tags=[tag], linkback=None, rules=rules)

    return exit_code


def parse_notebook(path: str,
                   tags: List,
                   linkback: str,
                   rules: List) -> int:
    """
        Load one notebook, run every rule against it and, when an objective
        description was found, add the notebook to the generated index.

        The index data is read from the module-level objects `objective`,
        `overview`, `title` and `links` (defined elsewhere in this file;
        presumably the rule instances that are also part of `rules`).
        Tags and linkbacks parsed from the notebook's overview take
        precedence over the `tags` / `linkback` arguments.

        When --fix is active the notebook is written back to disk.

            path: The path to the notebook.
            tags: The associated tags
            linkback: A link back to the web docs
            rules: The cell rules to apply

        Returns the number of errors
    """
    notebook = Notebook(path)

    for rule in rules:
        rule.validate(notebook)

    # Automatic Index Generation
    if objective.desc != '':
        if overview.linkbacks:
            index_linkbacks = overview.linkbacks
        else:
            index_linkbacks = [linkback] if linkback else []

        index_tags = overview.tags if overview.tags else tags

        add_index(
            path,
            index_tags,
            index_linkbacks,
            title.title,
            objective.desc,
            objective.uses,
            objective.steps,
            links.git_link,
            links.colab_link,
            links.colab_enterprise_link,
            links.workbench_link,
        )

    if args.fix:
        notebook.writeback()

    return notebook.num_errors

class Notebook(object):
    '''
    Class for navigating through a notebook

    The notebook's cells are consumed front to back through a cursor:
    get() returns the cell under the cursor and advances, peek() returns it
    without advancing and pop() advances without returning anything. The
    class also counts reported errors and can write the (possibly auto-fixed)
    notebook back to disk.
    '''
    def __init__(self, path):
        """
        Initializer
            path: The path to the notebook

        If the file cannot be decoded as a notebook (invalid JSON, or no
        'cells' list), "Corrupted notebook:" is printed and the object is
        left with an empty cell list.
        """
        self._path = path

        # Set every attribute up front so the object is fully formed even
        # when loading fails below.
        self._content = None
        self._cells = []
        self._cell_index = 0
        self._num_errors = 0

        # cross cell information
        self._costs = []

        # Notebook files are UTF-8 encoded JSON.
        with open(self._path, 'r', encoding='utf-8') as f:
            try:
                content = json.load(f)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
                print("Corrupted notebook:", path)
                return

        cells = content.get('cells') if isinstance(content, dict) else None
        if not isinstance(cells, list):
            print("Corrupted notebook:", path)
            return

        self._content = content
        self._cells = cells

    def get(self) -> dict:
        '''
        Get the next cell in the notebook

        Returns the current cell and advances the cursor past it.
        Raises IndexError when the cursor is past the last cell.
        '''
        cell = self._cells[self._cell_index]
        self._cell_index += 1
        return cell

    def peek(self) -> dict:
        '''
        Peek at the next cell in the notebook

        Returns the current cell without advancing the cursor.
        Raises IndexError when the cursor is past the last cell.
        '''
        return self._cells[self._cell_index]

    def pop(self, n_cells=1):
        '''
        Advance the specified number of cells

            n_cells: The number of cells to advance
        '''
        self._cell_index += n_cells

    @property
    def path(self):
        '''
        Getter: return the filename path for the notebook
        '''
        return self._path

    @property
    def num_errors(self):
        '''
        Getter: return the number of errors
        '''
        return self._num_errors

    @property
    def costs(self):
        '''
        Cross cell information: the cost categories one rule records for a
        later rule to check. Empty until a rule assigns it.
        '''
        return self._costs

    @costs.setter
    def costs(self, value):
        self._costs = value

    @staticmethod
    def _code_number(code):
        """
        Return the numeric code of an ErrorCode / FixCode member.

        Members are declared with a trailing comma, which makes their value
        a one-element tuple; a member declared without it has a bare integer
        value. Both forms are accepted.
        """
        value = code.value
        return value[0] if isinstance(value, tuple) else value

    def report_error(self,
                     code: "ErrorCode",
                     errmsg: str) -> bool:
        """
        Report an error.
            If args.errors_codes set, then only report these errors. Otherwise, all errors.

        code: The error code number.
        errmsg: The error message

        Returns True when error reporting is disabled (args.errors not set),
        False otherwise, including when the code is filtered out by
        args.errors_codes.

        In CSV mode the error is printed to stdout as "path , code" and is
        not added to num_errors; otherwise it is printed to stderr and
        counted.
        """
        if not args.errors:
            return True

        number = self._code_number(code)
        if args.errors_codes and str(number) not in args.errors_codes:
            return False

        if args.errors_csv:
            print(self._path, ',', number)
        else:
            print(f"{self._path}: ERROR ({number}): {errmsg}", file=sys.stderr)
            self._num_errors += 1

        return False

    def report_fix(self,
                   code: "FixCode",
                   fixmsg: str) -> bool:
        """
        Report an automatic fix

            code: The fix code number.
            fixmsg: The autofix message
        Returns:
            Whether code is to be fixed
        """
        if not args.fix:
            return False

        number = self._code_number(code)
        if args.fix_codes and str(number) not in args.fix_codes:
            return False

        print(f"{self._path}: FIXED ({number}): {fixmsg}", file=sys.stderr)
        return True

    def writeback(self):
        """
        Write back the updated (autofixed) notebook

        Does nothing when the notebook could not be loaded, so a corrupted
        file is left untouched.
        """
        if self._content is None:
            return
        with open(self._path, 'w', encoding='utf-8') as f:
            json.dump(self._content, f)


class NotebookRule(ABC):
    """
    Base class that every notebook conformance rule derives from.

    A concrete rule implements validate(), which inspects (and usually
    consumes) cells of the notebook it is given.
    """
    @abstractmethod
    def validate(self, notebook: Notebook) -> bool:
        """
        Check the notebook cell(s) this rule is responsible for.

            notebook: The notebook being reviewed.

        Returns whether the cell passed the validation rules
        """


class CopyrightRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Consume the next cell and require that its first source line
        mentions 'Copyright'.

        Returns True when it does, otherwise the result of reporting
        ERROR_COPYRIGHT.
        """
        first_line = notebook.get()['source'][0]
        if 'Copyright' in first_line:
            return True
        return notebook.report_error(ErrorCode.ERROR_COPYRIGHT, "missing copyright cell")


class NoticesRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) notices cell.

        The cell is recognised by its first source line starting with
        'This notebook'; when present it is consumed. Always returns True.
        """
        opening_line = notebook.peek()['source'][0]
        if opening_line.startswith('This notebook'):
            notebook.pop()
        return True


class TitleRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Extract the notebook title from the heading at the top of the links cell.

        An H1 heading ('# ') is the expected form. An H2 heading ('## ') is
        reported as ERROR_TITLE_HEADING but its text is still taken as the
        title and True is returned. Any other first line is reported as
        ERROR_TITLE_HEADING, leaves the title empty and returns False.

        The title is stored in self.title and passed to SentenceCaseTWRule.
        The cell is only consumed when the heading is its sole source line;
        otherwise it is left in place for the next rule.
        """
        self.title = ''

        cell = notebook.peek()
        heading = cell['source'][0]

        if heading.startswith('# '):
            marker_len = 2
        else:
            notebook.report_error(ErrorCode.ERROR_TITLE_HEADING, "title cell must start with H1 heading")
            if not heading.startswith('## '):
                return False
            marker_len = 3

        self.title = heading[marker_len:].strip()
        SentenceCaseTWRule().validate(notebook, [self.title])

        # H1 title only
        if len(cell['source']) == 1:
            notebook.pop()

        return True


class LinksRule(NotebookRule):
    # URL prefixes the notebook's own path is appended to when deriving the
    # expected links. These are URLs, not filesystem paths, so they are built
    # by plain string concatenation rather than os.path.join.
    _GIT_BASE = 'https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/'
    _COLAB_BASE = 'https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/'
    # NOTE(review): unlike the other prefixes this one has no 'notebooks'
    # path segment -- confirm against the notebook template.
    _COLAB_ENTERPRISE_BASE = 'https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2F'
    _WORKBENCH_BASE = 'https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/'

    @staticmethod
    def _href(line: str, start: int = 9) -> str:
        """
        Return the URL of an anchor line of the form `<a href="URL">`.

            line: The source line holding the anchor.
            start: Offset of the first URL character to keep; 9 skips
                   the leading `<a href="`.
        """
        return line.strip()[start:-2].replace('" target="_blank', '').replace('" target=\'_blank', '')

    @staticmethod
    def _check_link(notebook, cell, ix, found, derived, label, code, ret):
        """
        Compare a link found in the notebook with the link derived from the
        notebook's path. On mismatch either rewrite the source line (when
        the fix is enabled) or report `code`.

        Returns the updated validation result: `ret` unchanged unless an
        error was reported.
        """
        if found == derived:
            return ret
        if notebook.report_fix(FixCode.FIX_BAD_LINK, f"fixed {label} link: {derived}"):
            cell['source'][ix] = f"<a href=\"{derived}\" target='_blank'>\n"
            return ret
        return notebook.report_error(code, f"bad {label} link: {found}")

    def validate(self, notebook: Notebook) -> bool:
        """
        Parse the links in the links cell

        Consumes the next cell and records the GitHub, Colab, Colab
        Enterprise and Workbench links in self.git_link, self.colab_link,
        self.colab_enterprise_link and self.workbench_link (None when not
        present). Each link found is compared with the link derived from
        the notebook's path and fixed or reported when it differs. The
        GitHub, Colab and Workbench links are additionally required to be
        present, together with their button text.
        """
        self.git_link = None
        self.colab_link = None
        self.colab_enterprise_link = None
        self.workbench_link = None
        ret = True

        cell = notebook.get()
        # Snapshot of the cell text as authored, taken before any line is
        # rewritten by a fix.
        source = ''.join(cell['source'])

        for ix, line in enumerate(cell['source']):
            if '<a href="https://github.com' in line:
                self.git_link = self._href(line)
                derived_link = self._GIT_BASE + notebook.path
                ret = self._check_link(notebook, cell, ix, self.git_link, derived_link,
                                       'GitHub', ErrorCode.ERROR_LINK_GIT_BAD, ret)

            if '<a href="https://colab.research.google.com/' in line:
                # offset 50 also skips 'https://colab.research.google.com/github/',
                # which is then put back in canonical form
                self.colab_link = 'https://colab.research.google.com/github/' + self._href(line, 50)
                derived_link = self._COLAB_BASE + notebook.path
                ret = self._check_link(notebook, cell, ix, self.colab_link, derived_link,
                                       'Colab', ErrorCode.ERROR_LINK_COLAB_BAD, ret)

            if '<a href="https://console.cloud.google.com/vertex-ai/colab/' in line:
                self.colab_enterprise_link = self._href(line)
                # The notebook path is embedded percent-encoded, directly
                # after the '%2F' the prefix ends with.
                derived_link = self._COLAB_ENTERPRISE_BASE + notebook.path.replace("/", "%2F")
                # Compare the Colab Enterprise link itself (this used to
                # compare the Workbench link by mistake).
                ret = self._check_link(notebook, cell, ix, self.colab_enterprise_link, derived_link,
                                       'Colab Enterprise', ErrorCode.ERROR_LINK_COLAB_ENTERPRISE_BAD, ret)

            if '<a href="https://console.cloud.google.com/vertex-ai/workbench/' in line:
                self.workbench_link = self._href(line)
                derived_link = self._WORKBENCH_BASE + notebook.path
                ret = self._check_link(notebook, cell, ix, self.workbench_link, derived_link,
                                       'Workbench', ErrorCode.ERROR_LINK_WORKBENCH_BAD, ret)

        if 'View on GitHub' not in source or not self.git_link:
            ret = notebook.report_error(ErrorCode.ERROR_LINK_GIT_MISSING, 'Missing link for GitHub')
        if 'Run in Colab' not in source or not self.colab_link:
            ret = notebook.report_error(ErrorCode.ERROR_LINK_COLAB_MISSING, 'Missing link for Colab')
        if 'Open in Vertex AI Workbench' not in source or not self.workbench_link:
            ret = notebook.report_error(ErrorCode.ERROR_LINK_WORKBENCH_MISSING, 'Missing link for Workbench')

        return ret


class TableRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) table of contents cell.

        The cell is recognised by its first source line starting with
        '## Table of contents'; when present it is consumed. Always
        returns True.
        """
        heading = notebook.peek()['source'][0]
        if heading.startswith('## Table of contents'):
            notebook.pop()
        return True


class TestEnvRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) cell stating the environment the notebook
        was tested in.

        The cell is recognised by its first source line starting with
        '**_NOTE_**: This notebook has been tested'; when present it is
        consumed. Always returns True.
        """
        opening_line = notebook.peek()['source'][0]
        if opening_line.startswith('**_NOTE_**: This notebook has been tested'):
            notebook.pop()
        return True


class OverviewRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Parse the overview cell

        Consumes the next cell, which must start with '## Overview'. Its
        last source line is expected to read
        'Learn more about [name](url) ...'; every '[name](url)' pair on
        that line is recorded, the names in self.tags and the urls in
        self.linkbacks (parallel lists).

        A bracketed name that is not followed by a '(url)' is ignored
        rather than aborting the review. When the line is absent or yields
        no link at all, ERROR_LINKBACK_NOTFOUND is reported.
        """
        self.linkbacks = []
        self.tags = []

        cell = notebook.get()
        if not cell['source'][0].startswith("## Overview"):
            return notebook.report_error(ErrorCode.ERROR_OVERVIEW_NOTFOUND, "Overview section not found")

        last_line = cell['source'][-1]
        if not last_line.startswith('Learn more about ['):
            return notebook.report_error(ErrorCode.ERROR_LINKBACK_NOTFOUND, "Linkback missing in overview section")

        # Each piece after a '[' looks like 'name](url) trailing text'.
        for more in last_line.split('[')[1:]:
            pieces = more.split('(')
            if len(pieces) < 2:
                # '[name]' with no '(url)': nothing to link back to
                continue
            self.tags.append(more.split(']')[0])
            self.linkbacks.append(pieces[1].split(')')[0])

        if not self.linkbacks:
            return notebook.report_error(ErrorCode.ERROR_LINKBACK_NOTFOUND, "Linkback missing in overview section")

        return True


class ObjectiveRule(NotebookRule):
    # First characters that mark a line as a list item.
    _BULLET_STARTS = '-*123456789'

    def validate(self, notebook: Notebook) -> bool:
        """
        Parse the objective cell.
            Find the description, uses and steps.

        Consumes the next cell, which must start with '### Objective'. The
        remaining lines are read as three consecutive parts:

            description: everything up to the first blank line that follows
                         some description text
            uses:        the line starting with 'This tutorial uses' and the
                         list items after it
            steps:       the line starting with 'The steps performed' and
                         the list items after it

        The results are stored in self.desc, self.uses and self.steps. The
        description is reduced to its first sentence. self.costs, which is
        also copied to notebook.costs for later rules, lists which of
        'BQ', 'Vertex' and 'Dataflow' the uses text refers to.

        Returns True, or the result of the last reported error.
        """
        self.desc = ''
        self.uses = ''
        self.steps = ''
        self.costs = []
        ret = True

        cell = notebook.get()
        if not cell['source'][0].startswith("### Objective"):
            ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_NOTFOUND, "Objective section not found")
            notebook.costs = []
            return ret

        # Part of the cell currently being read: 'desc', 'uses', 'steps',
        # or None while between parts (such lines are ignored).
        part = 'desc'

        for line in cell['source'][1:]:
            # TOC anchor
            if line.startswith('<a name='):
                continue

            if line.startswith('This tutorial uses'):
                part = 'uses'
                self.uses += line
                continue
            if line.startswith('The steps performed'):
                part = 'steps'
                self.steps += line
                continue

            sline = line.strip()

            if part == 'desc':
                # a blank line after some text ends the description
                if len(self.desc) > 0 and sline == '':
                    part = None
                    continue
                self.desc += line
            elif part == 'uses':
                if sline == '':
                    self.uses += '\n'
                elif sline[0] in self._BULLET_STARTS:
                    self.uses += line
            elif part == 'steps':
                if sline == '':
                    self.steps += '\n'
                elif sline[0] in self._BULLET_STARTS:
                    # '*' not followed by a space is italic/bold markup,
                    # not a list item. The slice (rather than sline[1])
                    # keeps a line consisting of a lone '*' from raising
                    # IndexError.
                    if sline[0] == '*' and sline[1:2] != ' ':
                        part = None
                    # special case
                    elif sline.startswith('* Prediction Service'):
                        part = None
                    else:
                        self.steps += line
                elif sline[0] == '#':
                    # a heading ends the steps
                    part = None

        if self.desc == '':
            ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_DESC, "Objective section missing desc")
        else:
            self.desc = self.desc.lstrip()

            # Flatten markdown links: drop the square brackets and
            # everything in parentheses, so '[text](url)' becomes 'text'.
            kept = []
            in_paren = False
            for ch in self.desc:
                if ch in '[]':
                    continue
                if ch == '(':
                    in_paren = True
                elif ch == ')':
                    in_paren = False
                    continue
                if not in_paren:
                    kept.append(ch)

            # Keep the first sentence only (a description without any '.'
            # is left as it is).
            sentences = ''.join(kept).split('.')
            if len(sentences) > 1:
                self.desc = sentences[0] + '.\n'

            # Drop the 22 character lead-in 'In this tutorial, you ' /
            # 'In this notebook, you ' and capitalize what follows.
            if self.desc.startswith(('In this tutorial, you learn', 'In this notebook, you learn')):
                self.desc = self.desc[22].upper() + self.desc[23:]

        if self.uses == '':
            ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_USES, "Objective section missing uses services list")
        else:
            if 'BigQuery' in self.uses:
                self.costs.append('BQ')
            if 'Vertex' in self.uses:
                self.costs.append('Vertex')
            if 'Dataflow' in self.uses:
                self.costs.append('Dataflow')

        if self.steps == '':
            ret = notebook.report_error(ErrorCode.ERROR_OBJECTIVE_MISSING_STEPS, "Objective section missing steps list")

        notebook.costs = self.costs
        return ret


class RecommendationsRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) recommendations cell.

        The cell is recognised by its first source line starting with
        '### Recommendations'; when present it is consumed. Always
        returns True.
        """
        heading = notebook.peek()['source'][0]
        if heading.startswith("### Recommendations"):
            notebook.pop()
        return True


class DatasetRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Consume the next cell and require it to be the dataset section.

        A first source line starting with '### Dataset', '### Model' or
        '### Embedding' is accepted; anything else is reported as
        ERROR_DATASET_NOTFOUND.
        """
        heading = notebook.get()['source'][0]
        if heading.startswith(("### Dataset", "### Model", "### Embedding")):
            return True
        return notebook.report_error(ErrorCode.ERROR_DATASET_NOTFOUND, "Dataset/Model section not found")


class CostsRule(NotebookRule):
    # (entry in notebook.costs, product name the costs text must mention)
    _PRODUCTS = (
        ('BQ', 'BigQuery'),
        ('Vertex', 'Vertex'),
        ('Dataflow', 'Dataflow'),
    )

    def validate(self, notebook: Notebook) -> bool:
        """
        Parse the costs cell

        Consumes the next cell, which must start with '### Costs'. For
        every cost category recorded in notebook.costs, the cell text must
        mention the corresponding product; each missing one is reported as
        ERROR_COSTS_MISSING.

        Returns True, or the result of the last reported error.
        """
        ret = True

        cell = notebook.get()
        if not cell['source'][0].startswith("### Costs"):
            return notebook.report_error(ErrorCode.ERROR_COSTS_NOTFOUND, "Costs section not found")

        text = ''.join(cell['source'])
        # notebook.costs is assigned by an earlier rule; treat it as empty
        # if no rule has set it.
        expected = getattr(notebook, 'costs', [])

        for category, product in self._PRODUCTS:
            if category in expected and product not in text:
                ret = notebook.report_error(ErrorCode.ERROR_COSTS_MISSING, f'Costs section missing reference to {product}')
        return ret


class SetupLocalRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) cells about setting up a local
        development environment.

        Up to three consecutive cells are consumed, each only when present:
        a '## Before you begin' heading, the
        '### Set up your local development environment' cell and, only
        after that one, a cell starting with
        '**Otherwise**, make sure your environment meets'.
        Always returns True.
        """
        if notebook.peek()['source'][0].startswith('## Before you begin'):
            notebook.pop()

        if notebook.peek()['source'][0].startswith('### Set up your local development environment'):
            notebook.pop()

            if notebook.peek()['source'][0].startswith('**Otherwise**, make sure your environment meets'):
                notebook.pop()

        return True


class HelpersRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the (optional) helpers section.

        When the first source line of the next cell contains 'helper', two
        cells are consumed: the text cell and the code cell after it.
        Always returns True.
        """
        opening_line = notebook.peek()['source'][0]
        if 'helper' in opening_line:
            notebook.pop(2)  # text and cell
        return True


class InstallationRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Parse the installation cells

        Consumes the installation heading cell and the code cell after it
        (plus up to two more cells when the code starts with a '! mkdir'
        cell and/or a cell whose first line mentions 'requirements.txt').

        Checks performed:
            - the heading mentions 'Install' and is an H2 ('## Install')
            - the cell after the heading is a code cell
            - pip command lines use pip3, -q/--quiet and {USER_FLAG}
            - the code matches the current template
              ('if IS_WORKBENCH_NOTEBOOK:'), unless it is R code
              ('required_packages <')

        Returns True, or the result of the last reported section-level
        error; the per-line pip findings are reported but do not change
        the return value.
        """
        ret = True

        cell = notebook.get()

        if 'Install' not in cell['source'][0]:
            return notebook.report_error(ErrorCode.ERROR_INSTALLATION_NOTFOUND, "Installation section not found")

        if not cell['source'][0].startswith("## Install"):
            ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_HEADING, "Installation section needs to be H2 heading")

        cell = notebook.get()
        if cell['cell_type'] != 'code':
            # The heading was found, it is the code cell that is missing;
            # report it with the dedicated error code.
            return notebook.report_error(ErrorCode.ERROR_INSTALLATION_CODE_NOTFOUND, "Installation code section not found")

        # Preparatory cells that may precede the actual pip cell.
        if cell['source'][0].startswith('! mkdir'):
            cell = notebook.get()
        if 'requirements.txt' in cell['source'][0]:
            cell = notebook.get()

        for line in cell['source']:
            # NOTE(review): 'pip ' does not occur in 'pip3 install ...', so
            # the checks below appear to run only for lines that invoke
            # plain `pip` -- confirm this is intended.
            if 'pip ' not in line:
                continue
            if 'pip3' not in line:
                notebook.report_error(ErrorCode.ERROR_INSTALLATION_PIP3, "Installation code section: use pip3")
            # A command continued on the next line may carry its options
            # there, so they are not required on this line.
            if line.endswith('\\\n'):
                continue
            if '-q' not in line and '--quiet' not in line:
                notebook.report_error(ErrorCode.ERROR_INSTALLATION_QUIET, "Installation code section: use -q with pip3")
            if 'USER_FLAG' not in line and 'sh(' not in line:
                notebook.report_error(ErrorCode.ERROR_INSTALLATION_USER_FLAG, "Installation code section: use {USER_FLAG} with pip3")

        text = ''.join(cell['source'])
        if 'required_packages <' in text:
            pass  # R kernel
        elif 'if IS_WORKBENCH_NOTEBOOK:' not in text:
            ret = notebook.report_error(ErrorCode.ERROR_INSTALLATION_CODE_TEMPLATE, "Installation code section out of date (see template)")
        return ret


class RestartRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Validate the "Restart the kernel" heading cell and its code cell.

        Upcoming cells containing the text 'pip' are reported (one error per
        cell, quoting the first matching line) and popped before the heading
        is looked for. The heading cell is only consumed when it matches.
        """
        outcome = True

        # Report and discard every upcoming cell that still mentions pip
        while True:
            upcoming = notebook.peek()
            pip_line = next((line for line in upcoming['source'] if 'pip' in line), None)
            if pip_line is None:
                break
            outcome = notebook.report_error(ErrorCode.ERROR_INSTALLATION_SINGLE_PIP3, f"All pip installations must be in a single code cell: {pip_line}")
            notebook.pop()

        heading = notebook.peek()
        if not heading['source'][0].startswith("### Restart the kernel"):
            return notebook.report_error(ErrorCode.ERROR_RESTART_NOTFOUND, "Restart the kernel section not found")

        # Consume the heading, then fetch the cell expected to hold the code
        notebook.pop()
        code_cell = notebook.get()
        if code_cell['cell_type'] != 'code':
            outcome = notebook.report_error(ErrorCode.ERROR_RESTART_CODE_NOTFOUND, "Restart the kernel code section not found")

        return outcome


class VersionsRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the optional "Check package versions" section.

        When the next cell starts with that heading, two cells (the text cell
        and its code cell) are popped. Always returns True.
        """
        first_line = notebook.peek()['source'][0]
        if first_line.startswith('#### Check package versions'):
            # heading text cell plus the code cell that follows it
            notebook.pop(2)
        return True


class BeforeBeginRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Validate the "Before you begin" section heading.

        The heading cell is always consumed. When it holds a single line, the
        next cell is consumed as well and must start with the
        "Set up your Google Cloud project" heading.
        """
        heading = notebook.get()
        if not heading['source'][0].startswith("## Before you begin"):
            return notebook.report_error(ErrorCode.ERROR_BEFOREBEGIN_NOTFOUND, "Before you begin section not found")

        # Heading and project set-up text share one cell: nothing more to read
        if len(heading['source']) >= 2:
            return True

        # Section is split over two cells instead of one
        setup_cell = notebook.get()
        if setup_cell['source'][0].startswith("### Set up your Google Cloud project"):
            return True
        return notebook.report_error(ErrorCode.ERROR_BEFOREBEGIN_INCOMPLETE, "Before you begin section incomplete")


class EnableAPIsRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Skip over the optional "Enable APIs" section.

        When the next cell starts with that heading, two cells (the text cell
        and its code cell) are popped. Always returns True.
        """
        first_line = notebook.peek()['source'][0]
        if first_line.startswith("### Enable APIs"):
            # heading text cell plus the code cell that follows it
            notebook.pop(2)
        return True


class SetupProjectRule(NotebookRule):
    def validate(self, notebook: Notebook) -> bool:
        """
        Validate the "Set your project ID" heading and the three cells after it.

        Expected sequence once the heading has matched:
            1. code cell starting with the PROJECT_ID placeholder assignment
            2. code cell whose first line holds the placeholder comparison
            3. code cell whose first line holds `! gcloud config set project`

        When the heading does not match, only the heading cell is consumed.
        """
        heading = notebook.get()
        if not heading['source'][0].startswith('#### Set your project ID'):
            return notebook.report_error(ErrorCode.ERROR_PROJECTID_NOTFOUND, "Set project ID section not found")

        result = True

        assignment = notebook.get()
        if assignment['cell_type'] != 'code':
            result = notebook.report_error(ErrorCode.ERROR_PROJECTID_CODE_NOTFOUND, "Set project ID code section not found")
        elif not assignment['source'][0].startswith('PROJECT_ID = "[your-project-id]"'):
            result = notebook.report_error(ErrorCode.ERROR_PROJECTID_TEMPLATE, "Set project ID not match template")

        # The two remaining cells are checked the same way: a code cell whose
        # first line contains the expected template fragment
        for fragment in ('or PROJECT_ID == "[your-project-id]":', '! gcloud config set project'):
            follow_up = notebook.get()
            if not (follow_up['cell_type'] == 'code' and fragment in follow_up['source'][0]):
                result = notebook.report_error(ErrorCode.ERROR_PROJECTID_TEMPLATE, "Set project ID not match template")

        return result


class TextRule(ABC):
    """
    Base class that every text writing conformance rule derives from
    """
    @abstractmethod
    def validate(self, notebook: Notebook, text: List[str]) -> bool:
        """
        Check the given text against this rule's writing conventions.

        notebook: handed through to implementations; not used by this base method
        text: the lines of text to check

        Returns whether the text passed the rule. This base implementation
        always returns False.
        """
        return False


class BrandingRule(TextRule):
    # Discouraged spelling -> preferred product branding. Matching is a plain
    # case-sensitive substring test, applied in the order listed here.
    _BRANDING = {
        'Vertex SDK': 'Vertex AI SDK',
        'Vertex Training': 'Vertex AI Training',
        'Vertex Prediction': 'Vertex AI Prediction',
        'Vertex Batch Prediction': 'Vertex AI batch prediction',
        'Vertex XAI': 'Vertex Explainable AI',
        'Vertex Explainability': 'Vertex Explainable AI',
        'Vertex AI Explainability': 'Vertex Explainable AI',
        'Vertex Pipelines': 'Vertex AI Pipelines',
        'Vertex Experiments': 'Vertex AI Experiments',
        'Vertex TensorBoard': 'Vertex AI TensorBoard',
        'Vertex Hyperparameter Tuning': 'Vertex AI hyperparameter tuning',
        'Vertex Metadata': 'Vertex ML Metadata',
        'Vertex AI Metadata': 'Vertex ML Metadata',
        'Vertex AI ML Metadata': 'Vertex ML Metadata',
        'Vertex Vizier': 'Vertex AI Vizier',
        'Vertex Feature Store': 'Vertex AI Feature Store',
        'Vertex Forecasting': 'Vertex AI forecasting',
        'Vertex Vector Search': 'Vertex AI Vector Search',
        'Vertex Dataset': 'Vertex AI dataset',
        'Vertex Model': 'Vertex AI model',
        'Vertex Endpoint': 'Vertex AI endpoint',
        'Vertex Private Endpoint': 'Vertex AI private endpoint',
        'Automl': 'AutoML',
        'AutoML Image': 'AutoML Vision',
        'AutoML Language': 'AutoML Natural Language',
        'Tensorflow': 'TensorFlow',
        'Tensorboard': 'Vertex AI TensorBoard',
        'Google Cloud Notebooks': 'Vertex AI Workbench Notebooks',
        'BQ ': 'BigQuery',
        'BQ.': 'BigQuery',
        'Bigquery': 'BigQuery',
        'Big Query': 'BigQuery',
        'BQML': 'BigQuery ML',
        'GCS ': 'Cloud Storage',
        'GCS.': 'Cloud Storage',
        'Google Cloud Storage': 'Cloud Storage',
        'Pytorch': 'PyTorch',
        'Sklearn': 'scikit-learn',
        'sklearn': 'scikit-learn',
    }

    def validate(self, notebook: Notebook, text: List[str]) -> bool:
        """
        Check the text for product branding issues.

        Every line is scanned for each discouraged spelling in the branding
        table; one error is reported per (line, spelling) hit.

        Returns True when nothing was reported, otherwise the result of the
        last `report_error` call.
        """
        ret = True

        for line in text:
            hits = [(wrong, right) for wrong, right in self._BRANDING.items() if wrong in line]
            for wrong, right in hits:
                ret = notebook.report_error(ErrorCode.ERROR_TWRULE_BRANDING, f"Branding {wrong} -> {right}: {line}")

        return ret


class SentenceCaseTWRule(TextRule):
    # Words that may legitimately appear in upper/mixed case inside a heading
    _ACRONYMS = frozenset([
        'E2E', 'Vertex', 'AutoML', 'ML', 'AI', 'GCP', 'API', 'R', 'CMEK',
        'TF', 'TFX', 'TFDV', 'SDK', 'VM', 'CPR', 'NVIDIA', 'ID', 'DASK',
        'ARIMA_PLUS', 'KFP', 'I/O', 'GPU', 'Google', 'TensorFlow', 'PyTorch',
    ])

    def validate(self,
                 notebook,
                 text: List[str]) -> bool:
        """
        Check that a heading is in sentence case

        notebook: used only for reporting an error
        text: the heading to check; only the first entry is examined

        The first word must start with an upper-case character. Any later
        word that is entirely upper case and is not a known acronym is
        reported. A missing or word-less heading has nothing to check and
        passes.

        Returns True when nothing was reported, otherwise the result of the
        last `report_error` call.
        """
        ret = True

        if not text:
            return ret

        # split(' ') yields empty strings for the blank left behind by the
        # removed '#' markers (e.g. "## Overview" -> ['', 'Overview']) and for
        # repeated spaces; drop them so words[0] is the first real word
        # instead of '' (indexing '' raised IndexError).
        words = [word for word in text[0].replace('#', '').split(' ') if word]
        if not words:
            return ret

        # Check the first word
        if not words[0][0].isupper():
            ret = notebook.report_error(ErrorCode.ERROR_HEADING_CAP, f"heading must start with capitalized word: {words[0]}")

        for word in words[1:]:
            # Ignore punctuation that commonly hugs a word in a heading
            word = word.replace(':', '').replace('(', '').replace(')', '')
            if word in self._ACRONYMS:
                continue
            if word.isupper():
                ret = notebook.report_error(ErrorCode.ERROR_HEADING_CASE, f"heading is not sentence case: {word}")

        return ret


class TextTWRule(TextRule):
    def validate(self, notebook: Notebook, text: List[str]) -> bool:
        """
        Check for conformance to the following techwriter rules
                1. No TODO / WIP markers
                2. No 1st person
                3. No future tense

        Lines containing an HTML anchor ('<a ') are skipped entirely. All
        checks are plain substring tests; the person and tense checks are
        case-insensitive.

        Returns True when nothing was reported, otherwise the result of the
        last `report_error` call.
        """
        ret = True

        for line in text:
            # HTML code
            if '<a ' in line:
                continue

            # Lower-case once; used by the case-insensitive checks below
            lowered = line.lower()

            if 'TODO' in line or 'WIP' in line:
                ret = notebook.report_error(ErrorCode.ERROR_TWRULE_TODO, f'TODO in cell: {line}')
            # Previously written as `"let's" in x in x`, an accidental chained
            # comparison whose second half was always true.
            if 'we ' in lowered or "let's" in lowered:
                ret = notebook.report_error(ErrorCode.ERROR_TWRULE_FIRSTPERSON, f'Do not use first person (e.g., we), replace with 2nd person (you): {line}')
            if 'will' in lowered or 'would' in lowered:
                ret = notebook.report_error(ErrorCode.ERROR_TWRULE_FUTURETENSE, f'Do not use future tense (e.g., will), replace with present tense: {line}')

        return ret


def add_index(path: str,
              tags: List,
              linkbacks: List,
              title: str,
              desc: str,
              uses: str,
              steps: str,
              git_link: str,
              colab_link: str,
              colab_enterprise_link: str,
              workbench_link: str
             ):
    """
    Add a discoverability index for this notebook

    Prints one HTML table row (--web) or one markdown entry (--repo) to
    stdout; does nothing when neither option is set.

        path: The path to the notebook (not used when building the entry)
        tags: The tags (if any) for the notebook
        linkbacks: The linkbacks per tag, paired with `tags` by position
        title: The H1 title for the notebook
        desc: The notebook description
        uses: The resources/services used by the notebook
        steps: The steps specified by the notebook
        git_link: The link to the notebook in the git repo
        colab_link: Link to launch notebook in Colab
        colab_enterprise_link: Link to launch notebook in Colab Enterprise
        workbench_link: Link to launch notebook in Workbench
    """
    global last_tag

    if not args.web and not args.repo:
        return

    # Keep only the text after the last ':' and capitalise its first letter.
    # An empty title is left alone (indexing it used to raise IndexError).
    title = title.split(':')[-1].strip()
    if title:
        title = title[0].upper() + title[1:]

    if args.web:
        title = replace_cl(replace_backtick(title))

        print('    <tr>')
        print('        <td>')
        for tag in tags:
            tag = replace_cl(tag)
            print(f'            {tag.strip()}<br/>\n')
        print('        </td>')
        print('        <td>')
        print(f'            <b>{title}</b>. ')
        if args.desc:
            desc = replace_cl(replace_backtick(desc))
            print('<br/>')
            print(f'            {desc}\n')

        if args.linkback and linkbacks:
            # zip() stops at the shorter list, so a notebook with fewer
            # linkbacks than tags no longer raises IndexError
            for tag, linkback in zip(tags, linkbacks):
                if not linkback:
                    continue
                # Linkbacks starting with "vertex-ai" are relative to cloud.google.com
                if linkback.startswith("vertex-ai"):
                    href = f"https://cloud.google.com/{linkback}"
                else:
                    href = linkback
                print(f' Learn more about <a href="{href}" target="_blank">{replace_cl(tag)}</a>.\n')

        if args.steps:
            print("<devsite-expandable>\n")
            print('  <p class="showalways">Tutorial steps</p>\n')
            print('  <ul>\n')

            if ":" in steps:
                # Everything after the FIRST ':' is the step list; splitting
                # on every ':' dropped all text following a colon inside a step
                step_text = replace_backtick(steps).split(':', 1)[1]
                step_items = step_text.replace('*', '').replace('-', '').strip().split('\n')
            else:
                step_items = []

            for step in step_items:
                print(f'    <li>{replace_cl(step)}</li>\n')
            print('  </ul>\n')
            print("</devsite-expandable>\n")

        print('        </td>')
        print('        <td>')
        if colab_link:
            print(f'            <a href="{colab_link}" target="_blank" track-type="notebookTutorial" track-name="colabLink">Colab</a><br/>\n')
        if colab_enterprise_link:
            print(f'            <a href="{colab_enterprise_link}" target="_blank" track-type="notebookTutorial" track-name="colabEnterpriseLink">Colab Enterprise</a><br/>\n')
        if git_link:
            print(f'            <a href="{git_link}" target="_blank" track-type="notebookTutorial" track-name="gitHubLink">GitHub</a><br/>\n')
        if workbench_link:
            print(f'            <a href="{workbench_link}" target="_blank" track-type="notebookTutorial" track-name="workbenchLink">Vertex AI Workbench</a><br/>\n')
        print('        </td>')
        print('    </tr>\n')
    elif args.repo:
        # Emit a "### <tags>" heading whenever the tag set changes. The old
        # test read `tag`, which is unbound in this branch; the resulting
        # error was swallowed by a bare except, so no heading was ever printed.
        # globals().get() tolerates `last_tag` not having been initialised yet.
        has_tags = bool(tags) and any(item.strip() for item in tags)
        if has_tags and tags != globals().get('last_tag'):
            last_tag = tags
            flat_list = ''.join(item.replace("'", '') + ' ' for item in tags)
            print(f"\n### {flat_list}\n")

        print(f"\n[{title}]({git_link})\n")

        print("```")
        if args.desc:
            print(desc)

        if args.uses:
            print(uses)

        if args.steps:
            print(steps.rstrip() + '\n')

        print("```\n")

        if args.linkback and linkbacks:
            for tag, linkback in zip(tags, linkbacks):
                if not linkback:
                    continue
                # Same rule as the HTML index: "vertex-ai..." linkbacks are
                # relative to cloud.google.com (both arms of the old if/else
                # printed the bare relative path)
                if linkback.startswith("vertex-ai"):
                    url = f"https://cloud.google.com/{linkback}"
                else:
                    url = linkback
                print(f'&nbsp;&nbsp;&nbsp;Learn more about [{tag}]({url}).\n')

# Product name -> CL substitution variable, used by replace_cl().
_CL_SUBSTITUTIONS = {
    # Disabled entries, kept for reference:
    #'AutoML Tabular Workflow': '{{automl_name}} tabular workflow',
    #'AutoML Tables': '{{automl_tables_name}}',
    #'AutoML Tabular': '{{automl_tables_name}}',
    #'AutoML Vision': '{{automl_vision_name}}',
    #'AutoML Image': '{{automl_vision_name}}',
    'AutoML': '{{automl_name}}',

    'BigQuery ML': '{{bigqueryml_name}}',
    'BQML': '{{bigqueryml_name}}',
    'BigQuery': '{{bigquery_name}}',
    'BQ': '{{bigquery_name}}',

    'Vertex Dataset': '{{vertex_ai_name}} dataset',
    'Vertex Model': '{{vertex_ai_name}} model',
    'Vertex Endpoint': '{{vertex_ai_name}} endpoint',
    'Vertex Model Registry': '{{vertex_model_registry_name}}',
    'model registry': '{{vertex_model_registry_name_short}}',
    'Model Registry': '{{vertex_model_registry_name_short}}',
    'Vertex AI Model Registry': '{{vertex_model_registry_name}}',
    'Vertex Training': '{{vertex_training_name}}',
    'Vertex AI Training': '{{vertex_training_name}}',
    'Vertex Prediction': '{{vertex_prediction_name}}',
    'Vertex AI Prediction': '{{vertex_prediction_name}}',
    'Vertex TensorBoard': '{{vertex_tensorboard_name}}',
    'Vertex AI TensorBoard': '{{vertex_tensorboard_name}}',
    'TensorBoard': '{{vertex_tensorboard_name}}',
    'Tensorboard': '{{vertex_tensorboard_name}}',
    'Vertex ML Metadata': '{{vertex_metadata_name}}',
    'Vertex Pipelines': '{{vertex_pipelines_name}}',
    'Vertex AI Pipelines': '{{vertex_pipelines_name}}',
    'Vertex AI Data Labeling': '{{vertex_data_labeling_name}}',
    'Vertex AI Experiments': '{{vertex_experiments_name}}',
    'Vertex Experiments': '{{vertex_experiments_name}}',
    'Vertex AI Matching Engine': '{{vector_search_name}}',
    'Vertex Matching Engine': '{{vector_search_name}}',
    'Vertex Vector Search': '{{vector_search_name}}',
    'Vector Search': '{{vector_search_name}}',
    'Vertex AI Vector Search': '{{vector_search_name}}',
    'Vertex Model Monitoring': '{{vertex_model_monitoring_name}}',
    'Model Monitoring': '{{vertex_model_monitoring_name_short}}',
    'Vertex AI Model Monitoring': '{{vertex_model_monitoring_name}}',
    'Vertex Feature Store': '{{vertex_featurestore_name}}',
    'Vertex AI Feature Store': '{{vertex_featurestore_name}}',
    'Feature Store': '{{vertex_featurestore_name}}',
    'Vertex Vizier': '{{vertex_vizier_name}}',
    'Vertex AI Vizier': '{{vertex_vizier_name}}',
    'Vizier': '{{vertex_vizier_name}}',
    # NOTE(review): the "_short" variable is attached to the longer name here
    # and vice versa -- confirm these two values are not swapped.
    'Vertex Explainable AI': '{{xai_name_short}}',
    'Explainable AI': '{{vertex_xai_name}}',
    'NAS': '{{vertex_nas_name_short}}',
    'Vertex AI Neural Architectural Search': '{{vertex_nas_name}}',
    'Neural Architectural Search': '{{vertex_nas_name_short}}',
    'Vertex Workbench': '{{vertex_workbench_name}}',
    'Vertex AI Workbench': '{{vertex_workbench_name}}',
    #'Vertex SDK': '{{vertex_sdk_name}}',
    #'Vertex AI SDK': '{{vertex_sdk_name}}',
    'Vertex AI SDK for Python': '{{vertex_sdk_python}}',
    'Vertex AI batch prediction': '{{vertex_ai_name}} {{batch_prediction_name}}',
    'Vertex AI': '{{vertex_ai_name}}',
    'Ray on Vertex AI': '{{ray_vertex_ai_name}}',
    'Google Cloud console': '{{console_name}}',

    'Cloud Storage': '{{storage_name}}',
    'GCS': '{{storage_name}}',
    'GCP': '{{gcp_name}}',
    'TensorFlow Enterprise': '{{tf4gcp_name}}',
    'TensorFlow': '{{tensorflow_name}}',
}

# Longest names first, so a specific product name is substituted before any
# shorter name it contains (e.g. 'Vertex Model Registry' before 'Vertex Model',
# 'Ray on Vertex AI' before 'Vertex AI'). sorted() is stable, so names of equal
# length keep their order from the table above.
_CL_SUBSTITUTION_ORDER = sorted(
    _CL_SUBSTITUTIONS.items(),
    key=lambda item: len(item[0]),
    reverse=True,
)


def replace_cl(text: str) -> str:
    '''
    Replace product names with CL substitution variables

    text: the text to rewrite

    Matching is a case-sensitive substring replacement. Names are applied
    longest first; in table order a short name could rewrite part of a longer
    one before the longer name was ever tried, which made entries such as
    'Vertex AI Model Registry' unreachable.

    Returns the text with every known product name replaced.
    '''
    for name, variable in _CL_SUBSTITUTION_ORDER:
        if name in text:
            text = text.replace(name, variable)

    return text

def replace_backtick(text: str) -> str:
    '''
    Replace markdown backticks with HTML code tags

    Backticks are converted alternately, in order of appearance: the 1st,
    3rd, ... become "<code>" and the 2nd, 4th, ... become "</code>". An
    unpaired trailing backtick therefore leaves a "<code>" without a closing
    tag.
    '''
    segments = text.split('`')

    # Each backtick sat between two consecutive segments; re-insert a tag in
    # its place, opening on even positions and closing on odd ones
    rebuilt = [segments[0]]
    for position, segment in enumerate(segments[1:]):
        rebuilt.append("<code>" if position % 2 == 0 else "</code>")
        rebuilt.append(segment)

    return ''.join(rebuilt)


# Instantiate the rules
copyright = CopyrightRule()
notices = NoticesRule()
title = TitleRule()
links = LinksRule()
testenv = TestEnvRule()
table = TableRule()
overview = OverviewRule()
objective = ObjectiveRule()
recommendations = RecommendationsRule()
dataset = DatasetRule()
costs = CostsRule()
setuplocal = SetupLocalRule()
helpers = HelpersRule()
installation = InstallationRule()
restart = RestartRule()
versions = VersionsRule()
beforebegin = BeforeBeginRule()
enableapis = EnableAPIsRule()
setupproject = SetupProjectRule()

# Cell Validation: the rules are listed in the order the corresponding
# sections are expected to appear in a notebook
rules = [ copyright, notices, title, links, testenv, table, overview, objective,
          recommendations, dataset, costs, setuplocal, helpers,
          installation, restart, versions, beforebegin, enableapis,
          setupproject
]

# Opening part of the HTML index table; the closing tags are printed at the end
if args.web:
    print('<style>')
    print('table, th, td {')
    print('  border: 1px solid black;')
    print('  padding-left:10px')
    print('}')
    print('</style>')
    print('<table>')
    print('    <thead>')
    print('        <tr>')
    print('            <th width="180px">Services</th>')
    print('            <th>Description</th>')
    print('            <th width="80px">Open in</th>')
    print('        </tr>')
    print('    </thead>')
    print('    <tbody class="list">')


# Load the notebooks to skip: first column of every non-empty CSV row
if args.skip_file:
    if not os.path.isfile(args.skip_file):
        print(f"Error: file does not exist: {args.skip_file}", file=sys.stderr)
        sys.exit(1)
    with open(args.skip_file, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if row:
                notebook = row[0]
                skip_list.append(notebook)
                print(f"Skip entry {notebook}", file=sys.stderr)

# Select the notebooks to review: a directory, a single notebook, or a CSV list
if args.notebook_dir:
    if not os.path.isdir(args.notebook_dir):
        print(f"Error: not a directory: {args.notebook_dir}", file=sys.stderr)
        sys.exit(1)
    exit_code = parse_dir(args.notebook_dir)
elif args.notebook:
    if not os.path.isfile(args.notebook):
        print(f"Error: not a notebook: {args.notebook}", file=sys.stderr)
        sys.exit(1)
    exit_code = parse_notebook(args.notebook, tags=[], linkback=None, rules=rules)
elif args.notebook_file:
    if not os.path.isfile(args.notebook_file):
        print(f"Error: file does not exist {args.notebook_file}", file=sys.stderr)
        # Exit here like the sibling branches; falling through left exit_code
        # undefined and the final exit call raised NameError
        sys.exit(1)
    exit_code = 0
    with open(args.notebook_file, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # first row is the column heading
        for row in reader:
            # Blank lines and rows without a notebook path cannot be reviewed
            if len(row) < 2:
                if row:
                    print(f"Warning: skipping malformed row: {row}", file=sys.stderr)
                continue
            tags = row[0].split(',')
            notebook = row[1]
            # The backlink column is optional
            linkback = row[2] if len(row) > 2 else None
            exit_code += parse_notebook(notebook, tags=tags, linkback=linkback, rules=rules)
else:
    print("Error: must specify a directory or notebook", file=sys.stderr)
    sys.exit(1)

if args.web:
    print('    </tbody>\n')
    print('</table>\n')

sys.exit(exit_code)