In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fine-tuning T5 1.1 large on SQuAD dataset


---

## Objective

This notebook demonstrates how to fine-tune the T5 1.1 large model on a question answering task using the [SQuAD dataset](https://www.tensorflow.org/datasets/catalog/squad).

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (such as `utils`) before each cell is executed, so edits to them take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Import libraries

Please refer to the [environment setup](../README.md) section in the README 
file to set up the development environment and install the required libraries 
before importing them.

In [None]:
import os
import time
from datetime import datetime
import pandas as pd

import utils

# import vertex ai sdk for python
from google.cloud import aiplatform as vertex_ai

## Configure environment settings

Based on the [environment setup](../README.md) done previously, configure the 
following environment settings:

- **`PROJECT_ID`:** Configure the Google Cloud Project ID
- **`REGION`:** Configure the [region](https://cloud.google.com/vertex-ai/docs/general/locations) 
  to be used for Vertex AI operations throughout the rest of this notebook
- **`BUCKET`:** Google Cloud Storage bucket name to be used by Vertex AI for 
  operations such as staging the code and saving any generated artifacts.
- **`TENSORBOARD_NAME`:** Configure the managed TensorBoard instance name 
  created during the environment setup.

In [None]:
# --- Environment settings: replace every placeholder before running ---

# Google Cloud project and the region used for Vertex AI operations.
PROJECT_ID = "<YOUR PROJECT ID>"  # Change to your project id.
REGION = "<YOUR REGION>"  # Change to your region.

# Cloud Storage bucket name (without the gs:// prefix).
BUCKET = "<YOUR BUCKET NAME>"  # Change to your bucket.

# Display name of the managed TensorBoard instance.
TENSORBOARD_NAME = "<YOUR TENSORBOARD NAME>"  # Change to your TensorBoard instance name

Get Vertex AI TensorBoard ID based on name.

In [None]:
TENSORBOARD_ID = ! gcloud ai tensorboards list --filter="displayName={TENSORBOARD_NAME}" --format="value(name)" --region={REGION} 2>/dev/null 
TENSORBOARD_ID = TENSORBOARD_ID[0]

print(f"TENSORBOARD_ID = {TENSORBOARD_ID}")

### Configure custom container image

In this example, you use the base T5X custom training container.

In [None]:
# Name of the T5X training image and its URI under the project's gcr.io registry.
IMAGE_NAME = "t5x-base"
IMAGE_URI = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME}"

Validate that the image exists in Container Registry.

In [None]:
! gcloud container images describe $IMAGE_URI

## Configure experiment settings


In [None]:
# Name of the experiment; it is also used as a folder name in Cloud Storage.
EXPERIMENT_NAME = "<YOUR EXPERIMENT>"  # Change to your experiment name

# Cloud Storage layout: gs://<bucket>/experiments/<experiment>/runs
EXPERIMENT_WORKSPACE = f"gs://{BUCKET}/experiments/{EXPERIMENT_NAME}"
EXPERIMENT_RUNS = f"{EXPERIMENT_WORKSPACE}/runs"

### Initialize Vertex AI SDK for Python


In [None]:
# Initialize the Vertex AI SDK with the project, region, staging location and
# experiment configured above.
vertex_ai_defaults = dict(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=EXPERIMENT_WORKSPACE,
    experiment=EXPERIMENT_NAME,
)
vertex_ai.init(**vertex_ai_defaults)

## Configure dataset location


In [None]:
# Cloud Storage directory used as the TFDS data location for the job.
TFDS_DATA_DIR = f"gs://{BUCKET}/datasets"

## Configure T5X fine tuning job 

This job is configured using the following Gin file.

In [None]:
# Gin configuration file for the fine-tuning job (path relative to this notebook).
JOB_GIN_FILE = '../configs/finetune_t511_large_squad.gin'

# Print the file contents for reference.
!cat {JOB_GIN_FILE}

This configuration has been tested on a v2-128 TPU slice using 8-way model parallelism and 16-way data parallelism. The batch size is set to 128. If you want to run it on a different slice topology, make sure to adjust the global batch size and the number of model-parallel partitions.

The job uses the custom `squad` SeqIO Task.

The default settings for finetuning do not set any constraints on the length of an input sequence when computing metrics defined in the SeqIO Task. This may lead to out of memory errors when using a dataset with long input sequences. To avoid the errors, the `task_feature_lengths` property for the inference evaluation dataset config is set to the same value as for training and validation datasets.

In [None]:
# Gin files for the job, plus extra Gin bindings passed alongside them.
GIN_FILES = [JOB_GIN_FILE]
GIN_OVERWRITES = ["USE_CACHED_TASKS=False"]

## Configure and run job on Vertex AI

### Configure Vertex AI CustomJob

In [None]:
RUN_NAME = '<YOUR RUN NAME>'  # Change to your run name for the custom job

# The run id combines experiment name, run name and a minute-resolution
# local timestamp (YYYYmmddHHMM).
run_timestamp = datetime.now().strftime("%Y%m%d%H%M")
RUN_ID = f"{EXPERIMENT_NAME}-{RUN_NAME}-{run_timestamp}"

# Each run gets its own directory under the experiment's runs folder.
RUN_DIR = f"{EXPERIMENT_RUNS}/{RUN_ID}"
RUN_MODE = "train"

Log the variables defined so far to help with troubleshooting.

In [None]:
# Print the notebook settings defined so far, one `NAME=value` line each.
# Values are looked up by name in the notebook namespace rather than through
# eval(), and a setting that has not been defined yet is reported as
# "<undefined>" instead of aborting the whole listing with a NameError.
SETTING_NAMES = [
    "PROJECT_ID", "REGION", "BUCKET", "TENSORBOARD_NAME", "TENSORBOARD_ID",
    "IMAGE_NAME", "IMAGE_URI",
    "EXPERIMENT_NAME", "EXPERIMENT_WORKSPACE", "EXPERIMENT_RUNS",
    "TFDS_DATA_DIR", "GIN_FILES", "GIN_OVERWRITES",
    "RUN_NAME", "RUN_ID", "RUN_DIR", "RUN_MODE",
]

logged_settings = {
    name: globals().get(name, "<undefined>") for name in SETTING_NAMES
}

for key, value in logged_settings.items():
    print(f"{key}={value}")

Configure a Cloud TPU slice for the job. Double check if your [region](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators) supports the specified TPU topology.

In [None]:
# Worker configuration for the job: Cloud TPU machine type, TPU v2
# accelerators, and an accelerator count of 128.
MACHINE_TYPE = "cloud-tpu"
ACCELERATOR_TYPE = "TPU_V2"
ACCELERATOR_COUNT = 128

Create the custom job spec

In [None]:
# Collect the keyword arguments for the job spec, then build the job.
job_settings = {
    # Job identity.
    "display_name": RUN_ID,
    # Hardware.
    "machine_type": MACHINE_TYPE,
    "accelerator_type": ACCELERATOR_TYPE,
    "accelerator_count": ACCELERATOR_COUNT,
    # Container image and run mode.
    "image_uri": IMAGE_URI,
    "run_mode": RUN_MODE,
    # Gin configuration and storage locations.
    "gin_files": GIN_FILES,
    "model_dir": RUN_DIR,
    "tfds_data_dir": TFDS_DATA_DIR,
    "gin_overwrites": GIN_OVERWRITES,
}
job = utils.create_t5x_custom_job(**job_settings)

# Display the generated job spec for inspection.
job.job_spec

### Submit the custom job to Vertex AI and track the experiment


In [None]:
# Submit the custom job and track it under the experiment, using RUN_ID as the
# job display name, run name and execution name.
# NOTE(review): the keyword below is spelled `run_model` although it receives
# RUN_MODE — confirm it matches the parameter name declared by
# utils.submit_and_track_t5x_vertex_job.
tracking_settings = dict(
    custom_job=job,
    job_display_name=RUN_ID,
    run_name=RUN_ID,
    experiment_name=EXPERIMENT_NAME,
    execution_name=RUN_ID,
    tfds_data_dir=TFDS_DATA_DIR,
    model_dir=RUN_DIR,
    run_model=RUN_MODE,
    vertex_ai=vertex_ai,
)
utils.submit_and_track_t5x_vertex_job(**tracking_settings)

### Monitor the job with Vertex AI TensorBoard

Currently, Vertex AI Training does not support native integration with Vertex AI 
TensorBoard for TPU-based training jobs. As a workaround, you can run the 
`tb-gcp-uploader` command-line utility to manually [upload 
TensorBoard logs](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview#uploading_logs) 
to Vertex AI TensorBoard. This lets you monitor the training 
in near real time, because the uploader streams the logs to Vertex AI TensorBoard 
as they are written to the Cloud Storage bucket.

**Execute the following command from the terminal window to sync logs to Vertex 
AI TensorBoard**

In [None]:
# Build the single-line uploader command, surrounded by blank lines when
# printed, for copying into a terminal.
uploader_args = [
    "tb-gcp-uploader",
    f"--tensorboard_resource_name {TENSORBOARD_ID}",
    f"--logdir {EXPERIMENT_RUNS}",
    f"--experiment_name {EXPERIMENT_NAME}",
]
cmd = "\n" + " ".join(uploader_args) + "\n"

print(cmd)

To access the TensorBoard instance for the experiment, click the URL below.

In [None]:
# The experiment path segment is the TensorBoard resource name with "/"
# replaced by "+", followed by "+experiments+<experiment name>".
tensorboard_experiment = f"{TENSORBOARD_ID.replace('/', '+')}+experiments+{EXPERIMENT_NAME}"
TENSORBOARD_URL = (
    f"https://{REGION}.tensorboard.googleusercontent.com"
    f"/experiment/{tensorboard_experiment}/"
)
print(f"TensorBoard URL for the experiment is located at {TENSORBOARD_URL}")

Alternatively, you can access the Vertex AI TensorBoard experiment from the [console](https://console.cloud.google.com/vertex-ai/experiments/).

### Explore and log metrics

In [None]:
# Path from which the inference eval metrics are read.
# The gs:// URI is built with an explicit "/" (like the other Cloud Storage
# paths in this notebook) because os.path.join uses the local OS separator,
# which would produce a backslash and an invalid URI on Windows.
GCS_VAL_DIR = f'{RUN_DIR}/inference_eval/'

In [None]:
# Parse the eval metrics found under GCS_VAL_DIR and log them for this run
# (per the helper's name), keeping the returned value for display.
eval_metrics_args = dict(
    summary_dir=GCS_VAL_DIR,
    run_name=RUN_ID,
    vertex_ai=vertex_ai,
)
results = utils.parse_and_log_eval_metrics(**eval_metrics_args)

# Show the returned value as the cell output.
results