In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluating a T5 1.1 XL model fine-tuned for abstractive summarization


---


This notebook demonstrates how to evaluate a T5X XL model fine-tuned for text summarization on the abstractive summarization task. The model is evaluated on the `test` split of the [XSum dataset](https://www.tensorflow.org/datasets/catalog/xsum).

During fine-tuning, evaluations are run on the `validation` split of a dataset. Because running evaluations on the full split may be computationally intensive, you often limit the number of evaluation steps during fine-tuning. This is how the example job that fine-tunes T5 XL on XSum (`finetune-t511-xl-xsum.ipynb`) is configured.

After fine-tuning is completed, you can run evaluations on the full `test` split to obtain the test performance metrics. You can run evaluations using a few checkpoints that showed the best performance on the `validation` split. For example, you could run evaluations
on the 1006000 and 1007000 checkpoints created during the following fine-tuning run.

![Metrics](../images/metrics.png)

## Imports and initialization

Please refer to the [environment setup](../README.md) section in the README
file to set up the development environment and install the required libraries
before importing them.

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except
# those excluded with %aimport) before each cell is executed, so edits to
# local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import libraries

In [None]:
import os
import time
from datetime import datetime
import pandas as pd

import utils

# import vertex ai sdk for python
from google.cloud import aiplatform as vertex_ai

### Configure environment settings

- **`PROJECT_ID`:** Google Cloud Project ID
- **`REGION`:** [Region](https://cloud.google.com/vertex-ai/docs/general/locations) 
  to be used for Vertex AI operations throughout the rest of this notebook
- **`BUCKET`:** Google Cloud Storage bucket name to be used by Vertex AI for
  operations such as staging the code and saving any generated artifacts.
- **`TFDS_DATA_DIR`:** GCS location of the prestaged datasets. 


In [None]:
# Cloud Storage bucket used throughout this notebook (replace with your bucket name).
BUCKET = '<YOUR BUCKET NAME>'

# Google Cloud project and region (replace with your own values).
PROJECT_ID = '<YOUR PROJECT ID>'
REGION = '<YOUR REGION>'

In [None]:
# NOTE(review): this cell re-assigns PROJECT_ID, REGION and BUCKET with concrete
# values. When it runs after the placeholder cell above, these are the values in
# effect for the rest of the notebook. Edit or skip this cell so that your own
# project, region and bucket are used.
BUCKET = 'jk-t5x-staging'

PROJECT_ID = 'jk-mlops-dev'
REGION = 'us-central1'


### Configure dataset location

Configure the Cloud Storage location where the TFDS XSum dataset is 
staged. If you followed the [environment setup](../README.md) instructions, the dataset is in `gs://<YOUR BUCKET>/datasets`.

In [None]:
# GCS location of the pre-staged TFDS datasets: the `datasets` prefix of the bucket.
TFDS_DATA_DIR = 'gs://{}/datasets'.format(BUCKET)

### Configure custom training container image

In this example, you use the base T5X custom training container.

In [None]:
# Name of the custom training container image; change it if yours differs.
IMAGE_NAME = 't5x-base'
# Full image URI under the project's gcr.io registry path.
IMAGE_URI = 'gcr.io/{}/{}'.format(PROJECT_ID, IMAGE_NAME)

Validate that the image exists in Container Registry.

In [None]:
! gcloud container images describe $IMAGE_URI

### Configure experiment settings

Choose a Vertex AI experiment to store the evaluation results. If you have an existing experiment that contains a run with fine tuned checkpoints you want to evaluate, set `EXPERIMENT_NAME` to that experiment's name.  

In [None]:
# Vertex AI experiment that stores the evaluation results; change to your own.
EXPERIMENT_NAME = '<YOUR EXPERIMENT NAME>'

# Per-experiment workspace in the bucket, and the prefix under which the
# individual run directories live.
EXPERIMENT_WORKSPACE = 'gs://{}/experiments/{}'.format(BUCKET, EXPERIMENT_NAME)
EXPERIMENT_RUNS = EXPERIMENT_WORKSPACE + '/runs'

### Initialize Vertex AI SDK for Python


In [None]:
# Initialise the SDK for the configured project and region, and associate the
# session with the experiment; the experiment workspace is used as the
# staging bucket.
vertex_ai.init(
    experiment=EXPERIMENT_NAME,
    staging_bucket=EXPERIMENT_WORKSPACE,
    project=PROJECT_ID,
    location=REGION,
)

## Run model evaluation job

### Define the job's gin file


In [None]:
# Relative path of the gin configuration file used for the evaluation job.
JOB_GIN_FILE = '../configs/eval_t511_xl_xsum.gin'

# Print the gin file so its settings can be reviewed before the job is submitted.
!cat {JOB_GIN_FILE}

The default evaluation configuration as defined in `t5x/t5x/configs/runs/eval.gin` does not put any constraints on the length of input and target features. Their dimensions are computed by looking for the longest sequences in the data split used for evaluation. In the preprocessed  `test` split of the XSum dataset, the longest sequence is 15861 tokens so the feature dimension of input batches would be set to 15861. This can lead to out of memory errors during evaluation. To mitigate this,  the maximum length of inputs and targets is set to 2048 and 64 respectively. These numbers were set based on the statistics of the `test` split. The 99th percentile of inputs is 1988 and the 99th percentile of targets is 54.

This configuration has been tested on a v2-32 TPU slice using 4-way model parallelism.

### Select the checkpoint for evaluation

If your experiment contains one or more previous runs, you can retrieve the location of a checkpoint from the run's record. It is recommended to review the TensorBoard logs generated during the run to select a checkpoint or checkpoints for full evaluation.




In [None]:
# get all experiment runs and run directories
utils.get_all_experiment_run_directories(EXPERIMENT_NAME)

**NOTE**:  Model checkpoint path is a sub-folder under the `RUN_DIR` with prefix `checkpoint_`. There could be multiple checkpoint folders. Pick the model checkpoint you would like to evaluate the model with. For example:
```
! gsutil ls -r $RUN_DIR
      gs://{BUCKET_NAME}/experiments/{EXPERIMENT_NAME}/runs/{RUN_ID}/checkpoint_1000000/
      gs://{BUCKET_NAME}/experiments/{EXPERIMENT_NAME}/runs/{RUN_ID}/checkpoint_1005000/
```

In [None]:
# GCS path of the checkpoint to evaluate, i.e. one of the `checkpoint_*`
# folders under a run directory. Change to your checkpoint path.
CHECKPOINT_PATH = '<YOUR CHECKPOINT PATH>'

### Configure Vertex AI CustomJob 


In [None]:
# Short, human-readable name for this evaluation run; change to your own.
EVAL_RUN_NAME = '<YOUR RUN NAME>'
# Run id: experiment name, run name and a minute-resolution timestamp taken
# from the local clock at the moment this cell is executed.
EVAL_RUN_ID = f'{EXPERIMENT_NAME}-{EVAL_RUN_NAME}-{datetime.now().strftime("%Y%m%d%H%M")}'
# Directory for this run, under the experiment's runs prefix.
EVAL_RUN_DIR = f'{EXPERIMENT_RUNS}/{EVAL_RUN_ID}'
# Run mode passed to the job helpers later in this notebook.
RUN_MODE = 'eval'

In [None]:
# Gin files for the job: only the evaluation config selected earlier.
GIN_FILES = [JOB_GIN_FILE]

# Gin bindings passed alongside the gin file. The two path bindings embed
# their values in double quotes.
GIN_OVERWRITES = ['USE_CACHED_TASKS=False'] + [
    '{}="{}"'.format(binding, value)
    for binding, value in (
        ('CHECKPOINT_PATH', CHECKPOINT_PATH),
        ('EVAL_OUTPUT_DIR', EVAL_RUN_DIR),
    )
]

Print the variables defined so far to help with troubleshooting.

In [None]:
# Print every configuration variable as NAME=value for troubleshooting.
# Each value is looked up by name in the notebook's global namespace, which
# avoids evaluating strings as code.
for key in [
    "PROJECT_ID", "REGION", "BUCKET",
    "IMAGE_NAME", "IMAGE_URI",
    "EXPERIMENT_NAME", "EXPERIMENT_WORKSPACE", "EXPERIMENT_RUNS",
    "TFDS_DATA_DIR", "GIN_FILES", "GIN_OVERWRITES",
    "EVAL_RUN_NAME", "EVAL_RUN_ID", "EVAL_RUN_DIR", "RUN_MODE",
    "CHECKPOINT_PATH"]:
    print(f"{key}={globals()[key]}")

Configure the Cloud TPU topology. Double-check that your [region](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators) supports the configured TPU topology.

In [None]:
# Accelerator settings for the job; they correspond to the v2-32 TPU slice
# this configuration was tested on.
ACCELERATOR_TYPE = 'TPU_V2'
ACCELERATOR_COUNT = 32
# Vertex AI machine type paired with the TPU accelerator above.
MACHINE_TYPE = 'cloud-tpu'

Create the custom job spec

In [None]:
# Build the CustomJob for the evaluation run. Arguments are grouped as:
# identity and mode, container and hardware, then gin configuration and
# data/output locations.
job = utils.create_t5x_custom_job(
    display_name=EVAL_RUN_ID,
    run_mode=RUN_MODE,
    image_uri=IMAGE_URI,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    gin_files=GIN_FILES,
    gin_overwrites=GIN_OVERWRITES,
    tfds_data_dir=TFDS_DATA_DIR,
    model_dir=EVAL_RUN_DIR,
)

In [None]:
# Display the generated job specification for review before submitting it.
job.job_spec

### Submit the custom job to Vertex AI and track the experiment


In [None]:
# Submit the job and record it under the experiment. The evaluation run id is
# reused as the job display name, the experiment run name and the execution name.
utils.submit_and_track_t5x_vertex_job(
    vertex_ai=vertex_ai,
    custom_job=job,
    experiment_name=EXPERIMENT_NAME,
    job_display_name=EVAL_RUN_ID,
    run_name=EVAL_RUN_ID,
    execution_name=EVAL_RUN_ID,
    run_mode=RUN_MODE,
    model_dir=EVAL_RUN_DIR,
    tfds_data_dir=TFDS_DATA_DIR,
)

## Explore and log metrics

After the job is completed, you can explore the metrics and log them to the 
experiment. 

Run the utility function below to parse the run logs and push the metrics to Vertex AI Experiments.

In [None]:
# `inference_eval` sub-directory of the run directory, passed to the parser as
# its summary directory. GCS URIs always use '/', so the path is built with
# string formatting rather than os.path.join, which inserts the platform
# separator (a backslash on Windows).
GCS_EVAL_DIR = f"{EVAL_RUN_DIR.rstrip('/')}/inference_eval"

# Parse the evaluation output and log the metrics to the experiment run.
results = utils.parse_and_log_eval_metrics(
    summary_dir=GCS_EVAL_DIR,
    run_name=EVAL_RUN_ID,
    vertex_ai=vertex_ai
)
# Show the parsed metrics as the cell output.
results