In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI: Track parameters and metrics for custom training jobs

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fml_metadata%2Fsdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/ml_metadata/sdk-metric-parameter-tracking-for-custom-jobs.ipynb" target='_blank'>
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
     </a>
   </td>
</table>
<br/><br/><br/>

## Overview

This notebook demonstrates how to track metrics and parameters for Vertex AI custom training jobs, and how to perform detailed analysis using this data.

Learn more about [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata),
[Custom training](https://cloud.google.com/vertex-ai/docs/training/custom-training), and 
[Vertex AI Experiments](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments).

### Objective

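In this notebook, you learn how to use the Vertex AI SDK for Python to track the parameters and metrics of a custom training job and analyze them with Vertex AI Experiments.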

This tutorial uses the following Google Cloud ML services and resources:
- Vertex AI Dataset
- Vertex AI Model
- Vertex AI Endpoint
- Vertex AI Custom Training Job
- Vertex AI Experiments

The steps performed include:
- Track training parameters and prediction metrics for a custom training job.
- Extract and perform analysis for all parameters and metrics within an Experiment.

### Dataset

This example uses the Abalone Dataset. For more information about this dataset please visit: https://archive.ics.uci.edu/ml/datasets/abalone

### Costs 


This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and 
[Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the 
[Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

### Get Started
Install Vertex AI SDK for Python and other required packages

In [None]:
! pip3 install --upgrade tensorflow \
                         google-cloud-aiplatform \
                         scikit-learn -q \
                         pandas

### Restart runtime (Colab only)
To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only Colab needs this step: restarting the kernel makes the packages
# installed above importable.
if "google.colab" in sys.modules:
    import IPython

    # do_shutdown(True) requests a restart rather than a plain shutdown.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)
Authenticate your environment on Google Colab.

In [None]:
import sys

# The interactive login is only attempted when the notebook runs on Colab;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information
Learn more about [setting up a project and a development environment.](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)

In [None]:
# Replace the placeholder with the ID of your own Google Cloud project.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region used both when creating the bucket and when initializing the Vertex AI SDK.
LOCATION = "us-central1"

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage URI used as the SDK staging bucket and as the upload target for
# the training CSV. The name embeds PROJECT_ID; edit it if that name is taken.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in LOCATION under PROJECT_ID. Skip this cell if the bucket already exists.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### Import libraries and define constants

Import required libraries.


In [None]:
import pandas as pd
from google.cloud import aiplatform
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.python.keras.utils import data_utils

## Initialize Vertex AI and set an experiment


Define experiment name.

In [None]:
# Name of the Vertex AI experiment; passed to aiplatform.init() and used again
# in the cleanup cell to delete the experiment.
EXPERIMENT_NAME = "my-experiment-unique"

Initialize the *client* for Vertex AI.

In [None]:
# Set SDK-wide defaults for the rest of the notebook. Passing `experiment`
# associates the run, parameters and metrics logged later with EXPERIMENT_NAME.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

### Tracking parameters and metrics in Vertex AI custom training jobs

### Download the dataset to Cloud Storage

In [None]:
# Fetch the training CSV into the working directory, then copy it into the
# bucket's data/ folder.
!wget https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv
!gsutil cp abalone_train.csv {BUCKET_URI}/data/

# Cloud Storage path of the uploaded CSV; used as the dataset source below.
gcs_csv_path = f"{BUCKET_URI}/data/abalone_train.csv"

### Create a Vertex AI Tabular dataset from CSV data

A Vertex AI dataset can be used to create an AutoML model or a custom model. 

In [None]:
# Register the uploaded CSV as a Vertex AI tabular dataset with display name "abalone".
ds = aiplatform.TabularDataset.create(display_name="abalone", gcs_source=[gcs_csv_path])

# Show the new dataset's resource name as the cell output.
ds.resource_name

### Write the training script

Next, you create the training script that is used in the sample custom training job.

In [None]:
%%writefile training_script.py

import pandas as pd
import argparse
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters are supplied as command-line flags; both fall back to the
# defaults below when a flag is omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--num_units', dest='num_units',
                    default=64, type=int,
                    help='Number of unit for first layer.')
args = parser.parse_args()

# Column names applied to every CSV file that is read (via names=col_names).
col_names = ["Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Age"]
# Label column; every other column is used as a feature.
target = "Age"

def aip_data_to_dataframe(wild_card_path):
    """Read every CSV file matching ``wild_card_path`` into one DataFrame.

    Args:
        wild_card_path: File pattern handed to ``tf.data.Dataset.list_files``.

    Returns:
        The concatenation of all matching files, each parsed with ``col_names``
        as its column names.
    """
    frames = []
    for matched_path in tf.data.Dataset.list_files([wild_card_path]):
        # Each element is a string tensor; turn it into a plain Python str.
        csv_path = matched_path.numpy().decode()
        frames.append(pd.read_csv(csv_path, names=col_names))
    return pd.concat(frames)

def get_features_and_labels(df):
    """Split ``df`` into a feature array and a label array.

    Args:
        df: DataFrame that contains the ``target`` column.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays: every column except
        ``target``, and the ``target`` column itself.
    """
    labels = df[target].values
    features = df.drop(columns=[target]).values
    return features, labels

def data_prep(wild_card_path):
    """Load the CSV files matching ``wild_card_path`` and return ``(features, labels)``."""
    frame = aip_data_to_dataframe(wild_card_path)
    return get_features_and_labels(frame)


# Two stacked Dense layers: args.num_units units followed by a single output unit.
model = tf.keras.Sequential([layers.Dense(args.num_units), layers.Dense(1)])
model.compile(loss='mse', optimizer='adam')

# The training, validation and test data locations are read from AIP_*
# environment variables (presumably set by Vertex AI for the job - confirm
# against the service documentation).
model.fit(*data_prep(os.environ["AIP_TRAINING_DATA_URI"]),
          epochs=args.epochs ,
          validation_data=data_prep(os.environ["AIP_VALIDATION_DATA_URI"]))
print(model.evaluate(*data_prep(os.environ["AIP_TEST_DATA_URI"])))

# Export the trained model in SavedModel format to the directory named by AIP_MODEL_DIR.
tf.saved_model.save(model, os.environ["AIP_MODEL_DIR"])

### Launch a custom training job and track its training parameters on Vertex ML Metadata

In [None]:
# Define (but do not yet run) a custom training job around training_script.py.
job = aiplatform.CustomTrainingJob(
    display_name="train-abalone-dist-1-replica",
    script_path="training_script.py",
    # Prebuilt TensorFlow 2.8 CPU image in which the script runs.
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest",
    # Extra package for the training environment; presumably needed so pandas
    # can read gs:// paths - verify before changing the pin.
    requirements=["gcsfs==0.7.1"],
    # Prebuilt TensorFlow 2.8 CPU image used to serve the resulting model.
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
)

Start a new experiment run to track training parameters and start the training job. Note that this operation takes around 10 minutes.

In [None]:
# Open an experiment run; the parameters logged here and the metrics logged
# after prediction are recorded against it.
aiplatform.start_run(
    "custom-training-run-unique"
)  # Change this to your desired run name
# Single source of truth for the hyperparameters: the same values are logged to
# the run and forwarded to the training script as command-line flags.
parameters = {"epochs": 10, "num_units": 64}
aiplatform.log_params(parameters)

# Run the job on the tabular dataset; the returned object is the trained model.
model = job.run(
    ds,
    replica_count=1,
    model_display_name="abalone-model",
    args=[f"--epochs={parameters['epochs']}", f"--num_units={parameters['num_units']}"],
)

### Deploy model and calculate prediction metrics

Next, deploy your Vertex AI Model resource to a Vertex AI endpoint resource. This operation takes 10-20 minutes.

In [None]:
# Deploy the trained model for online prediction on an n1-standard-4 machine;
# the returned endpoint is used for prediction and deleted during cleanup.
endpoint = model.deploy(machine_type="n1-standard-4")

### Prediction dataset preparation and online prediction

Once the model is deployed, perform online prediction using the `abalone_test` dataset and calculate prediction metrics.

Prepare the prediction dataset.

In [None]:
def read_data(uri):
    """Load an Abalone CSV file into a DataFrame.

    The file is read directly from ``uri``. The earlier implementation went
    through ``data_utils.get_file`` with the fixed cache name
    ``"abalone_test.data"``, so once a file had been cached, a call with a
    different ``uri`` silently returned the first file's contents. It also
    depended on the private ``tensorflow.python.keras`` module.

    Args:
        uri: Local path or URL of a header-less, comma-separated file with
            eight columns.

    Returns:
        pandas.DataFrame with the eight named columns listed below; ``"?"``
        entries are parsed as missing values.
    """
    col_names = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]
    # pandas accepts both local paths and http(s) URLs here, so no separate
    # download step or cache file is needed.
    return pd.read_csv(
        uri,
        names=col_names,
        na_values="?",
        comment="\t",
        sep=",",
        skipinitialspace=True,
    )


def get_features_and_labels(df):
    """Split ``df`` into a feature array and an ``"Age"`` label array.

    Args:
        df: DataFrame containing an ``"Age"`` column.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays: all columns except
        ``"Age"``, and the ``"Age"`` column itself.
    """
    label_column = "Age"
    labels = df[label_column].values
    features = df.drop(columns=[label_column]).values
    return features, labels


# Load the held-out Abalone test CSV and split it into features (sent to the
# endpoint) and labels (used to score the predictions).
test_dataset, test_labels = get_features_and_labels(
    read_data(
        "https://storage.googleapis.com/download.tensorflow.org/data/abalone_test.csv"
    )
)

Perform online prediction.

In [None]:
# The feature array is converted to nested Python lists before being sent to the endpoint.
prediction = endpoint.predict(test_dataset.tolist())
# Display the response as the cell output.
prediction

Calculate and track prediction evaluation metrics.

In [None]:
# Score the endpoint's predictions against the true labels.
mse = mean_squared_error(test_labels, prediction.predictions)
mae = mean_absolute_error(test_labels, prediction.predictions)

# Log both metrics so they appear next to the parameters logged before training.
aiplatform.log_metrics({"mse": mse, "mae": mae})

### Extract all parameters and metrics created during this experiment.

In [None]:
# Display the experiment's logged parameters and metrics as a DataFrame.
aiplatform.get_experiment_df()

### View data in the Cloud Console

Parameters and metrics can also be viewed in the Cloud Console. 


In [None]:
# Print a heading followed by the console link for this project's experiments.
print(
    "Vertex AI Experiments:",
    f"https://console.cloud.google.com/ai/platform/experiments/experiments?folder=&organizationId=&project={PROJECT_ID}",
    sep="\n",
)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

* Vertex AI Dataset
* Vertex AI Experiment
* Training Job
* Model
* Endpoint
* Cloud Storage Bucket


In [None]:
# Tear down the resources created by this notebook. The bucket is only removed
# when delete_bucket is switched to True.

# Warning: Setting this to true deletes everything in your bucket
delete_bucket = False

# Delete dataset
ds.delete()

# Delete experiment
experiment = aiplatform.Experiment(
    experiment_name=EXPERIMENT_NAME, project=PROJECT_ID, location=LOCATION
)
experiment.delete()

# Delete the training job
job.delete()

# Undeploy model from endpoint (done before the endpoint and model are deleted)
endpoint.undeploy_all()

# Delete the endpoint
endpoint.delete()

# Delete the model
model.delete()


# Recursively remove the bucket and all of its contents.
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI