In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Pipelines: Evaluating batch prediction results from custom tabular regression model

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/custom_tabular_regression_model_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fmodel_evaluation%2Fcustom_tabular_regression_model_evaluation.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/model_evaluation/custom_tabular_regression_model_evaluation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_evaluation/custom_tabular_regression_model_evaluation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook demonstrates how to use the Vertex AI regression model evaluation component to evaluate a custom regression model. Model evaluation helps you determine your model performance based on the evaluation metrics and improve the model if necessary. 

Learn more about [Vertex AI Model Evaluation](https://cloud.google.com/vertex-ai/docs/evaluation/introduction) and [Custom training](https://cloud.google.com/vertex-ai/docs/training/custom-training).

### Objective

In this tutorial, you learn how to evaluate a Vertex AI model resource through a Vertex AI pipeline job using Google Cloud Pipeline Components.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Training (Custom Training)
- Vertex AI Batch Predictions
- Vertex AI Pipelines
- Vertex AI Model Registry


The steps performed include:

- Create a Vertex AI Custom Training Job to train a TensorFlow model.
- Run the custom training job. 
- Retrieve and load the model artifacts.
- View the model evaluation.
- Upload the model as a Vertex AI model resource.
- Import a pre-trained Vertex AI model resource into the pipeline.
- Run a batch prediction job in the pipeline.
- Evaluate the model using the regression evaluation component.
- Import the Regression Metrics to the Vertex AI model resource.

### Dataset

The dataset used for this tutorial is the [Boston Housing Prices dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html). The version of the dataset you use in this tutorial is the one that's available from TensorFlow SDK. The trained model predicts the median price of a house in units of 1K USD.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
# Install the packages this notebook imports. tensorflow and
# google-cloud-pipeline-components are pinned to exact versions; the remaining
# packages are upgraded to whatever release is current.
! pip3 install --upgrade --quiet    google-cloud-aiplatform \
                                    tensorflow==2.15.1 \
                                    google-cloud-pipeline-components==1.0.26 \
                                    matplotlib \
                                    google-cloud-storage 

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs this: the freshly installed packages are picked up after
# the kernel has been shut down and restarted.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Run the Colab sign-in flow only when the google.colab module is loaded;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project. Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Project and region consumed by the gsutil/gcloud commands and by
# aiplatform.init() further down. Replace the placeholder project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage bucket that holds the training package, model artifacts,
# schema files and the evaluation input file written by later cells.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the chosen region and project (skip if it already exists).
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

#### Service Account

You use a service account to create Vertex AI Pipeline jobs. If you don't want to use your project's Compute Engine service account, set `SERVICE_ACCOUNT` to another service account ID.

In [None]:
# Leave the placeholder (or an empty value) to have the next cell look the
# account up through gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules

if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step. You only need to run this step once per service account.

In [None]:
# Grant the service account permission to create objects in the bucket ...
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# ... and to read them back.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Import libraries

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from google.cloud import aiplatform, aiplatform_v1
from tensorflow.keras.datasets import boston_housing

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Configure the SDK with the project, region and staging bucket chosen above.
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI, location=LOCATION)

#### Set hardware accelerators

You can set hardware accelerators for training and prediction.

Set the variables `TRAIN_GPU/TRAIN_NGPU` and `DEPLOY_GPU/DEPLOY_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 NVIDIA Tesla T4 GPUs allocated to each VM, you'd specify:

    (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_T4, 4)


Otherwise specify `(None, None)` to use a container image to run on a CPU.

Learn more about [hardware accelerator support for your region](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators) 

*Note*: TensorFlow GPU releases before 2.3 fail to load the custom model in this tutorial. This is a known issue, caused by static graph ops that are generated in the serving function, and it's fixed in TF 2.3. If you encounter this issue with your own custom models, use a container image for TF 2.3 with GPU support.

In [None]:
def _accelerator_from_env(env_var):
    """Return ``(accelerator_type, count)`` described by a test env variable.

    When the variable is unset or empty the result is ``(None, None)``, which
    the later cells treat as "run on CPU". Otherwise the variable's value is
    parsed as the GPU count and the type is always NVIDIA_TESLA_T4.
    """
    gpu_count = os.getenv(env_var)
    if not gpu_count:
        return (None, None)
    return (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_T4, int(gpu_count))


TRAIN_GPU, TRAIN_NGPU = _accelerator_from_env("IS_TESTING_TRAIN_GPU")
DEPLOY_GPU, DEPLOY_NGPU = _accelerator_from_env("IS_TESTING_DEPLOY_GPU")

#### Set pre-built containers

Set the pre-built Docker container image for training and prediction.


For the latest list, see [Pre-built containers for training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers).


For the latest list, see [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers).

In [None]:
# TensorFlow version tag used in the image names; overridable for testing.
TF = os.getenv("IS_TESTING_TF") or "2-9"

# Training images are named "tf-<device>.<version>" for every TF release.
# Prediction images use the "tf2-" prefix for TF 2.x and "tf-" otherwise.
_train_device = "gpu" if TRAIN_GPU else "cpu"
_deploy_device = "gpu" if DEPLOY_GPU else "cpu"
_deploy_prefix = "tf2" if TF[0] == "2" else "tf"

TRAIN_VERSION = "tf-{}.{}".format(_train_device, TF)
DEPLOY_VERSION = "{}-{}.{}".format(_deploy_prefix, _deploy_device, TF)

TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/{}:latest".format(TRAIN_VERSION)
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/{}:latest".format(DEPLOY_VERSION)

print("Training:", TRAIN_IMAGE, TRAIN_GPU, TRAIN_NGPU)
print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

#### Set machine type

Next, set the machine type to use for training and prediction.

- Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to configure  the compute resources for the VMs you use for training and prediction.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

**Note**: The following isn't supported for training:

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

**Note**: You may also use n2 and e2 machine types for training and deployment, but they don't support GPUs.

In [None]:
# Number of vCPUs appended to the machine family for both VMs.
VCPU = "4"

# Training VM: machine family from the test override, else n1-standard.
MACHINE_TYPE = os.getenv("IS_TESTING_TRAIN_MACHINE") or "n1-standard"
TRAIN_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Train machine type", TRAIN_COMPUTE)

# Prediction VM: same rule with its own override variable.
MACHINE_TYPE = os.getenv("IS_TESTING_DEPLOY_MACHINE") or "n1-standard"
DEPLOY_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Deploy machine type", DEPLOY_COMPUTE)

## Training a custom model

Now you're ready to start creating your own custom model and training for Boston Housing. 

Learn more about [custom model training on Vertex AI](https://cloud.google.com/vertex-ai/docs/training/custom-training)

### Examine the training package

#### Package layout

Before you start the training, look at how a Python package is assembled for a custom training job. When unarchived, the package contains the following directory/file layout.

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py

The files `setup.cfg` and `setup.py` are the instructions for installing the package into the operating environment of the Docker image.

The file `trainer/task.py` is the Python script for executing the custom training job. 

**Note:** when `trainer/task.py` is referred to in the worker pool specification, the directory slash is replaced with a dot and the file suffix (.py) is dropped (trainer.task).

#### Package Assembly

In the following cells, you assemble the training package.

In [None]:
# Start from an empty `custom` folder that will hold the training package
! rm -rf custom
! mkdir custom

# Add package information (an empty README is enough)
! touch custom/README.md

# Each metadata file is built as a Python string; IPython substitutes the
# variable into the shell command, and echo writes it to the target file.
setup_cfg = "[egg_info]\n\ntag_build =\n\ntag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

# setup.py declares the package's install requirements and discovers packages
setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'tensorflow_datasets==1.3.0',\n\n    ],\n\n    packages=setuptools.find_packages())"
! echo "$setup_py" > custom/setup.py

# PKG-INFO carries descriptive metadata only
pkg_info = "Metadata-Version: 1.0\n\nName: Boston Housing tabular regression\n\nVersion: 0.0.0\n\nSummary: Demostration training script\n\nHome-page: www.google.com\n\nAuthor: Google\n\nAuthor-email: aferlitsch@google.com\n\nLicense: Public\n\nDescription: Demo\n\nPlatform: Vertex"
! echo "$pkg_info" > custom/PKG-INFO

# Make the training subfolder; the empty __init__.py marks it as a package
! mkdir custom/trainer
! touch custom/trainer/__init__.py

#### Create task.py

In the next cell, write the contents of the training script *task.py*.

To summarize, the script performs the following steps:

- Gets the directory in which to save the model artifacts from the command line (`--model-dir`), and if not specified, then from the environment variable `AIP_MODEL_DIR`.
- Loads Boston Housing dataset from TF.Keras built-in datasets.
- Builds a simple deep neural network model using TF.Keras model API.
- Compiles the model (`compile()`).
- Sets a training distribution strategy according to the argument `args.distribute`.
- Trains the model (`fit()`) with epochs specified by `args.epochs`.
- Saves the trained model (`save(args.model_dir)`) to the specified model directory.
- Saves the maximum value for each feature `f.write(str(params))` to the specified parameters file.

In [None]:
%%writefile custom/trainer/task.py
# Single, Mirror and Multi-Machine Distributed Training for Boston Housing

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
import numpy as np
import argparse
import os
import sys
tfds.disable_progress_bar()

parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', dest='model_dir',
                    default=os.getenv('AIP_MODEL_DIR'), type=str, help='Model dir.')
parser.add_argument('--lr', dest='lr',
                    default=0.001, type=float,
                    help='Learning rate.')
parser.add_argument('--epochs', dest='epochs',
                    default=20, type=int,
                    help='Number of epochs.')
parser.add_argument('--steps', dest='steps',
                    default=100, type=int,
                    help='Number of steps per epoch.')
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    help='distributed training strategy')
parser.add_argument('--param-file', dest='param_file',
                    default='/tmp/param.txt', type=str,
                    help='Output file for parameters')
args = parser.parse_args()

print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))

# Single Machine, single compute device
if args.distribute == 'single':
    if tf.test.is_gpu_available():
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

# Multi-worker configuration
print('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))


def make_dataset():

  
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.boston_housing.load_data(
    path="boston_housing.npz", test_split=0.2, seed=113
  )
  
    
  
  #Get maximum value of each column
  max_value_in_each_column_array_x_train=np.max(x_train,axis=0)

  #dividing each value by the maximum value of that column  
    
  x_train=x_train/max_value_in_each_column_array_x_train

  max_value_in_each_column_array_x_test=np.max(x_test,axis=0)

  #dividing each value by the maximum value of that column  
    
  x_test=x_test/max_value_in_each_column_array_x_test

  params=max_value_in_each_column_array_x_train

  

  # store the normalization (max) value for each feature
  with tf.io.gfile.GFile(args.param_file, 'w') as f:
    f.write(str(params))
  return (x_train, y_train), (x_test, y_test)


# Build the Keras model
def build_and_compile_dnn_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Dense(128, activation='relu', input_shape=(13,)),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(1, activation='linear')
  ])
  model.compile(
      loss='mse',
      optimizer=tf.keras.optimizers.RMSprop(learning_rate=args.lr))
  return model

NUM_WORKERS = strategy.num_replicas_in_sync
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size.
BATCH_SIZE = 16
GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS

with strategy.scope():
  # Creation of dataset, and model building/compiling need to be within
  # `strategy.scope()`.
  model = build_and_compile_dnn_model()

# Train the model
(x_train, y_train), (x_test, y_test) = make_dataset()
model.fit(x_train, y_train, epochs=args.epochs, batch_size=GLOBAL_BATCH_SIZE)
model.save(args.model_dir)

### Store the training script in your Cloud Storage bucket.

Next, package the training folder into a compressed tar ball, and then store the folder in your Cloud Storage bucket.

In [None]:
# Rebuild the archive from scratch: remove old archives, tar the `custom`
# folder, gzip it, and copy the result to the bucket.
# NOTE(review): the CustomTrainingJob below is created from `script_path`, so
# it is unclear whether this uploaded archive is consumed anywhere -- confirm.
! rm -f custom.tar custom.tar.gz
! tar cvf custom.tar custom
! gzip custom.tar
! gsutil cp custom.tar.gz $BUCKET_URI/trainer_boston.tar.gz

### Create and run custom training job


To train a custom model, you perform two steps:

1) Create a custom training job

2) Specify your training parameters and run the job.

#### Create a custom training job

A custom training job is created using the `CustomTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the custom training job
- `container_uri`: The training container image
- `requirements`: Package requirements for the training container image (e.g., pandas)
- `script_path`: The relative path to the training script

In [None]:
# Define the custom training job: run the local task.py inside the pre-built
# training image, with the listed extra packages requested for the container.
# NOTE(review): custom/setup.py pins tensorflow_datasets==1.3.0 while this job
# requests tensorflow-datasets==4.4 -- confirm which pin is intended.
train_job = aiplatform.CustomTrainingJob(
    display_name="boston",
    script_path="custom/trainer/task.py",
    container_uri=TRAIN_IMAGE,
    requirements=["gcsfs==0.7.1", "tensorflow-datasets==4.4"],
)

print(train_job)

#### Prepare your training parameters

Now define the command-line arguments for your custom training container:

- `args`: The command-line arguments to pass to the executable that's set as the entry point into the container.
    - `--model-dir`: Command-line argument to specify where to store the model artifacts. You can use either of the following methods to specify the storage location for artifacts.
        - **method-1**(set `DIRECT` to `True`): You pass the Cloud Storage location as a command line argument to your training script.
        - **method-2**(set `DIRECT` to `False`): The service passes the Cloud Storage location as the environment variable AIP_MODEL_DIR to your training script. In this case, you tell the service the model artifact location in the job specification.
    - `--epochs`: The number of epochs for training.
    - `--steps`: The number of steps per epoch.

In [None]:
# Cloud Storage folder for the trained model artifacts.
MODEL_DIR = f"{BUCKET_URI}/model"

EPOCHS = 20
STEPS = 100

# True: hand the model location to the script on the command line.
# False: omit it, so the script falls back to the AIP_MODEL_DIR variable.
DIRECT = True

CMDARGS = [f"--epochs={EPOCHS}", f"--steps={STEPS}"]
if DIRECT:
    CMDARGS.insert(0, f"--model-dir={MODEL_DIR}")

#### Run the custom training job

Next, you run the custom job to start the training job by invoking the `run()` method, with the following parameters:

- `args`: The command-line arguments to pass to the training script.
- `replica_count`: The number of compute instances for training (replica_count = 1 is single node training).
- `machine_type`: The machine type for the compute instances.
- `accelerator_type`: The hardware accelerator type.
- `accelerator_count`: The number of accelerators to attach to a worker replica.
- `base_output_dir`: The Cloud Storage location to write the model artifacts.
- `sync`: Set **True** to wait until the completion of the job.

In [None]:
# Arguments common to the CPU and GPU variants of the run.
run_kwargs = dict(
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    base_output_dir=MODEL_DIR,
    sync=True,  # block until the training job has finished
)

# Accelerator settings are passed only when a GPU was requested.
if TRAIN_GPU:
    run_kwargs.update(
        accelerator_type=TRAIN_GPU.name,
        accelerator_count=TRAIN_NGPU,
    )

train_job.run(**run_kwargs)

# Where the script saved the model:
# - DIRECT: exactly MODEL_DIR, passed via --model-dir.
# - otherwise: AIP_MODEL_DIR, which Vertex AI derives from base_output_dir as
#   "<base_output_dir>/model".
model_path_to_deploy = MODEL_DIR if DIRECT else f"{MODEL_DIR}/model"

#### Load the saved model

Your model is stored in a TensorFlow SavedModel format in a Cloud Storage bucket. Once you load the model from the Cloud Storage bucket you can run model evaluation and prepare it for prediction requests.

To load the model, pass the Cloud Storage path "MODEL_DIR" to the `tf.saved_model.load()` method.

In [None]:
# Load the SavedModel written by the training job from Cloud Storage.
loaded = tf.saved_model.load(model_path_to_deploy)

#### Get the serving function signature

You can get the signatures of your model's input and output layers by reloading the model into memory, and querying it for the signatures corresponding to each layer.

When making a prediction request, you need to route the request to the serving function instead of the model, so you need to know the input layer name of the serving function which you use later when you make a prediction request.

You also need to know the name of the serving function's input and output layer for constructing the explanation metadata **during a later step**.

In [None]:
# The default serving function of the loaded SavedModel.
serving_fn = loaded.signatures["serving_default"]

# Element 1 of the structured input signature is the mapping of named inputs;
# the first key is the name prediction requests must use.
named_inputs = serving_fn.structured_input_signature[1]
serving_input = list(named_inputs.keys())[0]
print("Serving function input:", serving_input)

# The first key of the structured outputs names the prediction tensor.
named_outputs = serving_fn.structured_outputs
serving_output = list(named_outputs.keys())[0]
print("Serving function output:", serving_output)

## Configure feature-based explanations (Optional) 

**For configuring explanations to the model, follow this step. This step is optional.**

To use Vertex Explainable AI with a custom-trained model, you must configure certain options when you create the Model resource that you plan to request explanations from, or when you deploy the model, or when you submit a batch explanation job.

If you want to use Vertex Explainable AI with an AutoML tabular model, then you don't need to perform any configuration. Vertex AI automatically configures the model for Vertex Explainable AI.

### Explanation Specification

To get explanations for the predictions, you must enable the explanation feature and set corresponding settings when you upload your custom model to Vertex AI Model Registry. These settings are referred to as the explanation metadata, which consists of:

- `parameters`: Specification for the explainability algorithm to use for explanations on your model. You can choose between:
  - Shapley (**Note**: not recommended for image data since it can involve a long-running operation)
  - XRAI
  - Integrated Gradients
- `metadata`: Specification for how the algorithm is applied on your custom model

Learn more about [explanation specification](https://cloud.google.com/vertex-ai/docs/explainable-ai/configuring-explanations-feature-based#when-creating-or-importing-model).



In the next code cell, set the variable `XAI` to the explainability algorithm that you use on your custom model.

In [None]:
XAI = "ig"  # [ shapley, ig, xrai ]

# Attribution settings for each supported explanation algorithm.
_XAI_PARAMETERS = {
    "shapley": {"sampled_shapley_attribution": {"path_count": 10}},
    "ig": {"integrated_gradients_attribution": {"step_count": 50}},
    "xrai": {"xrai_attribution": {"step_count": 50}},
}

# Reject unsupported values here; previously they left PARAMETERS undefined
# and surfaced as a NameError on the next statement.
if XAI not in _XAI_PARAMETERS:
    raise ValueError(
        f"Unsupported XAI value {XAI!r}; expected one of {sorted(_XAI_PARAMETERS)}"
    )

PARAMETERS = _XAI_PARAMETERS[XAI]

parameters = aiplatform.explain.ExplanationParameters(PARAMETERS)

In the next code cell, define the metadata.

In [None]:
# Describes the model input for explanations: the serving input tensor carries
# one numeric value per feature, listed here by name.
# NOTE(review): the order presumably has to match the dataset's column order
# (13 entries, same as the model's 13 inputs) -- confirm.
INPUT_METADATA = {
    "input_tensor_name": serving_input,
    "encoding": "BAG_OF_FEATURES",
    "modality": "numeric",
    "index_feature_mapping": [
        "crim",
        "zn",
        "indus",
        "chas",
        "nox",
        "rm",
        "age",
        "dis",
        "rad",
        "tax",
        "ptratio",
        "b",
        "lstat",
    ],
}

# The explained output is the serving function's output tensor.
OUTPUT_METADATA = {"output_tensor_name": serving_output}

input_metadata = aiplatform.explain.ExplanationMetadata.InputMetadata(INPUT_METADATA)
output_metadata = aiplatform.explain.ExplanationMetadata.OutputMetadata(OUTPUT_METADATA)

# "features" and "medv" are the names given to the explained input and output.
metadata = aiplatform.explain.ExplanationMetadata(
    inputs={"features": input_metadata}, outputs={"medv": output_metadata}
)

### Create instance schema and prediction schema yaml files

In the next cells, write the contents of the *instance_schema.yaml* and *prediction_schema.yaml* files. The content structure is the same for both files.


#### Create instance schema yaml file

The *instance_schema.yaml* file defines the structure of the prediction instances you provide to your batch predictions. 

- Specify the title and description.
- Specify the type of the input. In this case, the input to batch prediction is
**{"dense_input": [0.02715405449271202, 0.0, 0.027177177369594574, 0.0, 0.0010195195209234953, 0.009660660289227962, 0.1501501500606537, 0.0027548049110919237, 0.036036036908626556, 1.0, 0.03033033013343811, 0.04091591760516167, 0.043618619441986084]}**,
which is an object. The object has properties, such as `dense_input`.
- For each property, provide description and mention its type. 
- If type of the property is an array, mention the information about array items in `items` key.


In [None]:
%%writefile instance_schema.yaml
# Schema of one batch-prediction instance: an object with a single array property.
title: TabularRegression
description: 'Regression Instances.'

type: object
properties:
  # Presumably this key has to match the model's serving input name -- confirm.
  dense_input:
    type: array
    items:
      # NOTE(review): OpenAPI schemas usually spell this `type: number` with
      # `format: float` -- confirm that `float` is accepted as a type here.
      type: float
      minimum: 0.0
      maximum: 1.0
    description: 'Input values to model'


#### Create prediction schema yaml file

The *prediction_schema.yaml* file defines the structure of the prediction output you get from your batch prediction job. 

Output of batch prediction job is "prediction": [value], which is of type array.

In [None]:
%%writefile prediction_schema.yaml
# Schema of one prediction: an array (no item type is declared).
title: TabularRegression
description: 'Regression results.'

type: array

Upload both the files to your Cloud Storage bucket.

In [None]:
# Copy both schema files to the bucket root, where Model.upload() reads them.
!gsutil cp instance_schema.yaml {BUCKET_URI}/instance_schema.yaml
!gsutil cp prediction_schema.yaml {BUCKET_URI}/prediction_schema.yaml

## Upload the model

Next, upload your model to Vertex AI Model Registry using `Model.upload()` method, with the following parameters:

- `display_name`: The human readable name for the model resource.
- `artifact_uri`: The Cloud Storage location of the trained model artifacts.
- `serving_container_image_uri`: The serving container image.
- `instance_schema_uri`: Points to a YAML file stored on Google Cloud Storage describing the format of a single instance.
- `prediction_schema_uri`: Points to a YAML file stored on Google Cloud Storage describing the format of a single prediction produced by this model.
- `sync`: Whether to execute the upload asynchronously or synchronously.
- `explanation_parameters`: Parameters to configure explaining for model's predictions.
- `explanation_metadata`: Metadata describing the model's input and output for explanation.

If the `upload()` method is run asynchronously, you can subsequently block until completion with the `wait()` method.

**Note:** If you want to configure explanations for the model, set the `explanation_parameters` and `explanation_metadata` parameters. Otherwise, don't set them.

In [None]:
# Register the trained model, its schemas and its explanation settings in
# Vertex AI Model Registry.
model = aiplatform.Model.upload(
    display_name="boston_new_model",
    # Use the same location the SavedModel was loaded from earlier, so the
    # registered artifacts are the ones whose serving signature was inspected.
    artifact_uri=model_path_to_deploy,
    serving_container_image_uri=DEPLOY_IMAGE,
    instance_schema_uri=f"{BUCKET_URI}/instance_schema.yaml",
    prediction_schema_uri=f"{BUCKET_URI}/prediction_schema.yaml",
    explanation_parameters=parameters,
    explanation_metadata=metadata,
    sync=False,
)

# The upload was started asynchronously; block here until it has completed.
model.wait()

### Load data for the pipeline

Load the Boston Housing test (holdout) data from `tf.keras.datasets`, using the method `load_data()`. This returns the dataset as a tuple of two elements. The first element is the training data and the second is the test data. Each element is also a tuple of two elements: the feature data, and the corresponding labels (median value of owner-occupied home).

You don't need the training data, and therefore you load it as `(_, _)`.

Before you can run the data through the pipeline, you need to preprocess it. Normalize (rescale) the data in each column by dividing each value by the maximum value of that column. This replaces each single value with a 32-bit floating point number between 0 and 1.

In [None]:
# Only the held-out split is kept; the training split is discarded.
(_, _), (x_test, y_test) = boston_housing.load_data(
    path="boston_housing.npz", test_split=0.2, seed=113
)

# Per-column maxima of the test features, used as the scaling divisor.
# NOTE(review): the training script scales its training data by the training
# split's maxima, while this cell uses the test split's own maxima -- confirm
# that the two sets of statistics are meant to differ.
max_value_in_each_column_array = x_test.max(axis=0)

# Scale each column by its maximum and store the result as 32-bit floats.
x_test = (x_test / max_value_in_each_column_array).astype(np.float32)

print(x_test.shape, x_test.dtype, y_test.shape)
print("scaled", x_test[0])

### Prepare the input file for the pipeline

Prepare an input file and store it in your Cloud Storage bucket. Each instance in the file is a dictionary entry of the form:

                        {serving_input: content, ground_truth_column: value}

- `serving_input`: The name of the input layer of the underlying model.
- `content`: The feature values of the test item as a list.
- `ground_truth_column`: Give any name to this key. Use the same name in target_field_name in the below pipeline parameters.
- `value`: Ground truth value of this instance.

 

In [None]:
gcs_input_uri = f"{BUCKET_URI}/test_file_with_ground_truth.jsonl"

# First ten test rows: feature list plus the matching ground-truth value.
first_ten_rows = [(x_test[i].tolist(), y_test[i]) for i in range(10)]

# One JSON object per line: features under the serving input name and the
# label under "MEDV".
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    for features, label in first_ten_rows:
        f.write(json.dumps({serving_input: features, "MEDV": label}) + "\n")

## Model Evaluation

Now, run a Vertex AI Batch Prediction job and generate evaluations and feature-attributions on its results by creating a Vertex AI pipeline using `evaluate` function. Learn more about [evaluate function](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/models.py#L5127).

### Define parameters to run the evaluate function

Specify the required parameters to run `evaluate` function. 

The `evaluate` function takes the following parameters:

- `prediction_type`: The problem type being addressed by this evaluation run. 'classification' and 'regression' are the currently supported problem types.
- `target_field_name`: Name of the column to be used as the target for regression.
- `gcs_source_uris`: List of the Cloud Storage bucket uris of input instances for batch prediction.
- `generate_feature_attributions`: (**Optional**) Whether the model evaluation job should generate feature attributions. Defaults to False if not specified.

**The pipeline takes about 1 hour to complete.**

In [None]:
# Arguments for the evaluation run: a regression problem whose ground truth is
# stored under the "MEDV" key of each instance in the input file
evaluation_kwargs = dict(
    prediction_type="regression",
    target_field_name="MEDV",
    gcs_source_uris=["/".join([BUCKET_URI, "test_file_with_ground_truth.jsonl"])],
    generate_feature_attributions=True,
)

# Start the evaluation, then block until the job has finished
job = model.evaluate(**evaluation_kwargs)

print("Waiting model evaluation is in process")
job.wait()

In the results from the last step, click on the generated link to see your run in the Cloud Console.


##### Runtime Graph of Model Evaluation pipeline

In the UI, you can click on the DAG nodes to expand or collapse them. Here's a partially-expanded view of the DAG (click image to see larger version).

<img src="images/custom_tabular_regression_evaluation_pipeline.PNG" style="height:622px;width:726px"></img>

## Get the model evaluation results

After the evaluation pipeline is finished, run the below cell to print the evaluation metrics.

In [None]:
# Retrieve the model evaluation from the evaluation job. The following cells
# read the pipeline task details through this object.
model_evaluation = job.get_model_evaluation()

In [None]:
# Locate the evaluation metrics artifact among the pipeline tasks.
#
# Reset the results first, so that a re-run of this cell can never silently
# reuse values left in the kernel by a previous execution.
evaluation_metrics = None
evaluation_metrics_gcs_uri = None

# Task states whose outputs are inspected
accepted_task_states = (
    aiplatform_v1.types.PipelineTaskDetail.State.SUCCEEDED,
    aiplatform_v1.types.PipelineTaskDetail.State.SKIPPED,
)

# NOTE: the task details are reached through private attributes of the SDK
# object, so this access path may change between SDK versions.
pipeline_tasks = (
    model_evaluation._backing_pipeline_job._gca_resource.job_detail.task_details
)

for task in pipeline_tasks:
    # Keep only the evaluation task itself, not the task that imports its result
    is_evaluation_task = (
        "model-evaluation" in task.task_name
        and "model-evaluation-import" not in task.task_name
    )
    if not is_evaluation_task or task.state not in accepted_task_states:
        continue

    # A matching task may carry no "evaluation_metrics" output (presumably the
    # case for some skipped tasks); skip it instead of failing with an
    # AttributeError or IndexError.
    metrics_output = task.outputs.get("evaluation_metrics")
    if metrics_output is None or len(metrics_output.artifacts) == 0:
        continue

    # No early exit: if several tasks match, the last one wins
    evaluation_metrics = metrics_output.artifacts[0]
    evaluation_metrics_gcs_uri = evaluation_metrics.uri

if evaluation_metrics is None:
    raise RuntimeError(
        "No evaluation metrics artifact was found in the pipeline tasks. "
        "Check that the evaluation pipeline completed successfully."
    )

print(evaluation_metrics)
print(evaluation_metrics_gcs_uri)

### Visualize the metrics

After the evaluation pipeline is finished, run the below cell to visualize the evaluation metrics.

In [None]:
# Split the metric metadata into parallel lists of metric names and values.
# NOTE: meanAbsolutePercentageError (MAPE) is not filtered out. MAPE is infinite
# when a ground truth value is 0; skip that key here if it distorts the chart.
metrics, values = [], []
for metric_name, metric_value in evaluation_metrics.metadata.items():
    metrics.append(metric_name)
    values.append(metric_value)

# Draw one bar per evaluation metric
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(x=metrics, height=values)
ax.set_title("Evaluation Metrics")
ax.set_ylabel("Value")
plt.show()

### Get the Feature Attributions (Optional)

**If you have configured explanations for the model, run the cell below. Otherwise, skip it.**


Feature attributions indicate how much each feature in your model contributed to the predictions for each given instance.

Learn more about [Feature attributions](https://cloud.google.com/vertex-ai/docs/explainable-ai/overview#feature_attributions).

Run the below cell to get the feature attributions. 

In [None]:
# Locate the feature attributions artifact among the pipeline tasks.
#
# Reset the results first, so that a re-run of this cell can never silently
# reuse values left in the kernel by a previous execution.
feat_attrs = None
feat_attrs_gcs_uri = None

# Task states whose outputs are inspected
accepted_task_states = (
    aiplatform_v1.types.PipelineTaskDetail.State.SUCCEEDED,
    aiplatform_v1.types.PipelineTaskDetail.State.SKIPPED,
)

# NOTE: the task details are reached through private attributes of the SDK
# object, so this access path may change between SDK versions.
pipeline_tasks = (
    model_evaluation._backing_pipeline_job._gca_resource.job_detail.task_details
)

for task in pipeline_tasks:
    # Only the task named exactly "feature-attribution" is of interest
    if task.task_name != "feature-attribution":
        continue
    if task.state not in accepted_task_states:
        continue

    # A matching task may carry no "feature_attributions" output (presumably
    # the case for some skipped tasks); skip it instead of failing with an
    # AttributeError or IndexError.
    attributions_output = task.outputs.get("feature_attributions")
    if attributions_output is None or len(attributions_output.artifacts) == 0:
        continue

    # No early exit: if several tasks match, the last one wins
    feat_attrs = attributions_output.artifacts[0]
    feat_attrs_gcs_uri = feat_attrs.uri

if feat_attrs is None:
    raise RuntimeError(
        "No feature attributions artifact was found in the pipeline tasks. "
        "Check that the evaluation ran with generate_feature_attributions=True "
        "and that the pipeline completed successfully."
    )

print(feat_attrs)
print(feat_attrs_gcs_uri)

From the obtained Cloud Storage URI for the feature attributions, get the attribution values.

In [None]:
# Load the results
attributions = !gsutil cat $feat_attrs_gcs_uri

# Print the results obtained
attributions = json.loads(attributions[0])
print(attributions)

### Visualize the Feature Attributions

Visualize the obtained attributions for each feature using a bar-chart.

In [None]:
# Mapping of feature name -> list of attribution values, taken from the first
# attribution entry of the explanation
feature_attributions = attributions["explanation"]["attributions"][0][
    "featureAttributions"
]

# Parallel lists: feature names and the first attribution value of each feature
features = list(feature_attributions)
attr_values = [feature_attributions[name][0] for name in features]

# Draw one bar per feature, with vertical tick labels so the names stay readable
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(x=features, height=attr_values)
ax.set_title("Feature Attributions")
plt.xticks(rotation=90)
ax.set_ylabel("Attribution value")
plt.show()

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

Set `delete_bucket` to **True** to delete the Cloud Storage bucket created in this notebook.

In [None]:
# Delete model resource
model.delete()

# Delete the training job
train_job.delete()

# Delete the evaluation pipeline
job.delete()

# Delete the batch prediction jobs
batch_prediction_jobs = aiplatform.BatchPredictionJob.list()
for batch_prediction_job in batch_prediction_jobs:
    if any(
        keyword in batch_prediction_job.display_name
        for keyword in [
            "model-registry-batch-predict-evaluation",
            "model-registry-batch-explain-evaluation",
        ]
    ):
        batch_prediction_job.delete()

# Delete locally generated files
! rm -rf custom custom.tar.gz instance_schema.yaml prediction_schema.yaml

# Delete Cloud Storage objects
delete_bucket = False
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI