In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex client library: TF Hub image classification model for online prediction

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/community/gapic/custom/showcase_tfhub_image_classification_online.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/community/gapic/custom/showcase_tfhub_image_classification_online.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use the Vertex client library for Python to deploy a pretrained TensorFlow Hub image classification model for online prediction.

### Dataset

The dataset used for this tutorial is the [Flowers dataset](https://www.tensorflow.org/datasets/catalog/tf_flowers) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). The version of the dataset you will use in this tutorial is stored in a public Cloud Storage bucket. The trained model predicts the type of flower an image is from a class of five flowers: daisy, dandelion, rose, sunflower, or tulip.

### Objective

In this tutorial, you will deploy a TensorFlow Hub pretrained model, and then do a prediction on the deployed model by sending data.

The steps performed include:

- Download a TensorFlow Hub pretrained model.
- Retrieve and load the model artifacts.
- Upload the model as a Vertex `Model` resource.
- Deploy the `Model` resource to a serving `Endpoint` resource.
- Make a prediction.
- Undeploy the `Model` resource.

### Costs

This tutorial uses billable components of Google Cloud (GCP):

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the latest version of the Vertex client library.

In [None]:
import os
import sys

# Google Cloud Notebook
if os.path.exists("/opt/deeplearning/metadata/env_version"):
    USER_FLAG = "--user"
else:
    USER_FLAG = ""

! pip3 install -U google-cloud-aiplatform $USER_FLAG

Install the latest GA version of *google-cloud-storage* library as well.

In [None]:
# Install/upgrade the Cloud Storage client, passing the USER_FLAG set earlier.
! pip3 install -U google-cloud-storage $USER_FLAG

### Restart the kernel

Once you've installed the Vertex client library and Google *cloud-storage*, you need to restart the notebook kernel so it can find the packages.

In [None]:
# The restart is skipped whenever the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    # Shut the kernel down; the True argument presumably requests a restart
    # rather than a plain shutdown -- confirm against the ipykernel docs.
    app.kernel.do_shutdown(True)

## Before you begin

### GPU runtime

*Make sure you're running this notebook in a GPU runtime if you have that option. In Colab, select* **Runtime > Change Runtime Type > GPU**

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the Vertex AI and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component)

4. [The Google Cloud SDK](https://cloud.google.com/sdk) is already installed in Google Cloud Notebook.

5. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

In [None]:
# Placeholder value: replace it with your project ID, or leave it unchanged to
# have the following cell read the project from the gcloud configuration.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project for the Cloud SDK commands that follow.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex. Not all regions provide support for all Vertex services. For the latest support per region, see the [Vertex locations documentation](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
# Region used for the bucket location, the API endpoint host name, and the
# Vertex parent resource path later in this notebook.
REGION = "us-central1"  # @param {type: "string"}

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append onto the name of resources which will be created in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS) appended to resource names to keep them
# unique between users sharing a project.
TIMESTAMP = "{:%Y%m%d%H%M%S}".format(datetime.now())

### Authenticate your Google Cloud account

**If you are using Google Cloud Notebook**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

In the Cloud Console, go to the [Create service account key](https://console.cloud.google.com/apis/credentials/serviceaccountkey) page.

**Click Create service account**.

In the **Service account name** field, enter a name, and click **Create**.

In the **Grant this service account access to project** section, click the Role drop-down list. Type "Vertex" into the filter box, and select **Vertex Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

Click Create. A JSON file that contains your key downloads to your local environment.

Enter the path to your service account key as the GOOGLE_APPLICATION_CREDENTIALS variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you send requests to Vertex (model upload,
# deployment and prediction).

# If on Google Cloud Notebook, then don't execute this code
if not os.path.exists("/opt/deeplearning/metadata/env_version"):
    # Colab is detected by the google.colab module already being loaded.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account. This branch is skipped when the IS_TESTING environment variable
    # is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

In this tutorial, you save the pretrained TensorFlow Hub model in SavedModel
format to a Cloud Storage bucket. Vertex reads the model artifacts from
this bucket when you upload the model as a Vertex `Model` resource. You can then
create an `Endpoint` resource and deploy the `Model` resource to it in order to serve
online predictions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
# Placeholder value: replace it with your bucket URI, or leave it unchanged to
# have the following cell derive a name from the project ID and timestamp.
BUCKET_NAME = "gs://[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When BUCKET_NAME is empty, None, or still the placeholder, build a bucket
# name from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "gs://[your-bucket-name]"):
    BUCKET_NAME = "".join(["gs://", PROJECT_ID, "aip-", TIMESTAMP])

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in REGION (only needed if the bucket doesn't exist yet).
! gsutil mb -l $REGION $BUCKET_NAME

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that the bucket is accessible.
! gsutil ls -al $BUCKET_NAME

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

#### Import Vertex client library

Import the Vertex client library into our Python environment.

In [None]:
import time

from google.cloud.aiplatform import gapic as aip
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson, ParseDict
from google.protobuf.struct_pb2 import Struct, Value

#### Vertex constants

Set up the following constants for Vertex:

- `API_ENDPOINT`: The Vertex API service endpoint for dataset, model, job, pipeline and endpoint services.
- `PARENT`: The Vertex location root path for dataset, model, job, pipeline and endpoint resources.

In [None]:
# Regional host name of the Vertex API service
API_ENDPOINT = REGION + "-aiplatform.googleapis.com"

# Location root path under which the model and endpoint resources are created
PARENT = "/".join(["projects", PROJECT_ID, "locations", REGION])

#### Hardware Accelerators

Set the hardware accelerators (e.g., GPU), if any, for prediction.

Set the variable `DEPLOY_GPU/DEPLOY_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each VM, you would specify:

    (aip.AcceleratorType.NVIDIA_TESLA_K80, 4)

For GPU, available accelerators include:
   - aip.AcceleratorType.NVIDIA_TESLA_K80
   - aip.AcceleratorType.NVIDIA_TESLA_P100
   - aip.AcceleratorType.NVIDIA_TESLA_P4
   - aip.AcceleratorType.NVIDIA_TESLA_T4
   - aip.AcceleratorType.NVIDIA_TESLA_V100

Otherwise specify `(None, None)` to use a container image to run on a CPU.

In [None]:
# Deploy on CPU only, unless the IS_TESTING_DEPOLY_GPU environment variable is
# set; its integer value is then used as the number of K80 GPUs.
DEPLOY_GPU, DEPLOY_NGPU = (None, None)
if os.getenv("IS_TESTING_DEPOLY_GPU"):
    DEPLOY_GPU, DEPLOY_NGPU = (
        aip.AcceleratorType.NVIDIA_TESLA_K80,
        int(os.getenv("IS_TESTING_DEPOLY_GPU")),
    )

#### Container (Docker) image

Next, we will set the Docker container images for prediction

- Set the variable `TF` to the TensorFlow version of the container image. For example, `2-1` would be version 2.1, and `1-15` would be version 1.15. The following list shows some of the pre-built images available:

 - TensorFlow 1.15
   - `gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf-gpu.1-15:latest`
 - TensorFlow 2.1
   - `gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-1:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-1:latest`
 - TensorFlow 2.2
   - `gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-2:latest`
 - TensorFlow 2.3
   - `gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-3:latest`
   - `gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-3:latest`
 - XGBoost
   - `gcr.io/cloud-aiplatform/prediction/xgboost-cpu.1-2:latest`
   - `gcr.io/cloud-aiplatform/prediction/xgboost-cpu.1-1:latest`
   - `gcr.io/cloud-aiplatform/prediction/xgboost-cpu.0-90:latest`
   - `gcr.io/cloud-aiplatform/prediction/xgboost-cpu.0-82:latest`
 - Scikit-learn
   - `gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-23:latest`
   - `gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-22:latest`
   - `gcr.io/cloud-aiplatform/prediction/sklearn-cpu.0-20:latest`

For the latest list, see [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers)

In [None]:
# TensorFlow version of the serving container; the IS_TESTING_TF environment
# variable overrides the default of "2-1".
TF = os.getenv("IS_TESTING_TF") or "2-1"

# The image name is "<tf|tf2>-<cpu|gpu>.<version>": "tf2" for versions that
# start with "2", and the GPU variant whenever DEPLOY_GPU is set.
DEPLOY_VERSION = "{}-{}.{}".format(
    "tf2" if TF[0] == "2" else "tf",
    "gpu" if DEPLOY_GPU else "cpu",
    TF,
)

DEPLOY_IMAGE = "gcr.io/cloud-aiplatform/prediction/" + DEPLOY_VERSION + ":latest"

print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU)

#### Machine Type

Next, set the machine type to use for prediction.

- Set the variable `DEPLOY_COMPUTE` to configure the compute resources for the VM you will use for prediction.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

*Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs*

In [None]:
# Machine family for serving; the IS_TESTING_DEPLOY_MACHINE environment
# variable overrides the default of "n1-standard".
MACHINE_TYPE = os.getenv("IS_TESTING_DEPLOY_MACHINE") or "n1-standard"

# The full machine type is "<family>-<number of vCPUs>".
VCPU = "4"
DEPLOY_COMPUTE = "-".join([MACHINE_TYPE, VCPU])
print("Deploy machine type", DEPLOY_COMPUTE)

# Tutorial

Now you are ready to deploy a TensorFlow Hub pretrained image classification model.

## Set up clients

The Vertex client library works as a client/server model. On your side (the Python script) you will create a client that sends requests and receives responses from the Vertex server.

You will use different clients in this tutorial for different steps in the workflow. So set them all up upfront.

- Model Service for `Model` resources.
- Endpoint Service for deployment.
- Prediction Service for serving.

In [None]:
# Every service client talks to the same regional API endpoint.
client_options = {"api_endpoint": API_ENDPOINT}


def create_model_client():
    """Return a client for the Vertex Model service."""
    return aip.ModelServiceClient(client_options=client_options)


def create_endpoint_client():
    """Return a client for the Vertex Endpoint service."""
    return aip.EndpointServiceClient(client_options=client_options)


def create_prediction_client():
    """Return a client for the Vertex Prediction service."""
    return aip.PredictionServiceClient(client_options=client_options)


# Create all clients up front; later cells look them up by service name.
clients = {
    "model": create_model_client(),
    "endpoint": create_endpoint_client(),
    "prediction": create_prediction_client(),
}

for client in clients.items():
    print(client)

## Get pretrained model from TFHub

Next, you download a pre-trained model from TensorFlow Hub.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

IMAGE_SHAPE = (224, 224)

# Wrap the TF Hub ResNet-50 v1 feature-vector module in a single-layer Keras
# model whose input is an IMAGE_SHAPE image with three channels.
model = tf.keras.Sequential()
model.add(
    hub.KerasLayer(
        "https://tfhub.dev/google/imagenet/resnet_v1_50/feature_vector/4",
        input_shape=(*IMAGE_SHAPE, 3),
    )
)

# Cloud Storage location the SavedModel is written to and uploaded from.
model_path_to_deploy = "/".join([BUCKET_NAME, "resnet"])

## Upload the model for serving

Next, you will upload your TF.Keras model to the Vertex `Model` service, which will create a Vertex `Model` resource for your custom model. During upload, you need to define a serving function to convert data to the format your model expects. If you send encoded data to Vertex, your serving function ensures that the data is decoded on the model server before it is passed as input to your model.

### How does the serving function work

When you send a request to an online prediction server, the request is received by a HTTP server. The HTTP server extracts the prediction request from the HTTP request content body. The extracted prediction request is forwarded to the serving function. For Google pre-built prediction containers, the request content is passed to the serving function as a `tf.string`.

The serving function consists of two parts:

- `preprocessing function`:
  - Converts the input (`tf.string`) to the input shape and data type of the underlying model (dynamic graph).
  - Performs the same preprocessing of the data that was done during training the underlying model -- e.g., normalizing, scaling, etc.
- `post-processing function`:
  - Converts the model output to the format expected by the receiving application -- e.g., compresses the output.
  - Packages the output for the receiving application -- e.g., add headings, make JSON object, etc.

Both the preprocessing and post-processing functions are converted to static graphs which are fused to the model. The output from the underlying model is passed to the post-processing function. The post-processing function passes the converted/packaged output back to the HTTP server. The HTTP server returns the output as the HTTP response content.

One thing you need to consider when building serving functions for TF.Keras models is that they run as static graphs. That means you cannot use TF graph operations that require a dynamic graph. If you do, you will get an error when the serving function is compiled, indicating that you are using an EagerTensor, which is not supported.

### Serving function for image data

To pass images to the prediction service, you encode the compressed (e.g., JPEG) image bytes into base 64 -- which makes the content safe from modification while transmitting binary data over the network. Since this deployed model expects input data as raw (uncompressed) bytes, you need to ensure that the base 64 encoded data gets converted back to raw bytes before it is passed as input to the deployed model.

To resolve this, define a serving function (`serving_fn`) and attach it to the model as a preprocessing step. Add a `@tf.function` decorator so the serving function is fused to the underlying model (instead of upstream on a CPU).

When you send a prediction or explanation request, the content of the request is base 64 decoded into a Tensorflow string (`tf.string`), which is passed to the serving function (`serving_fn`). The serving function preprocesses the `tf.string` into raw (uncompressed) numpy bytes (`preprocess_fn`) to match the input requirements of the model:
- `io.decode_jpeg`- Decompresses the JPG image which is returned as a Tensorflow tensor with three channels (RGB).
- `image.convert_image_dtype` - Changes integer pixel values to float 32.
- `image.resize` - Resizes the image to match the input shape for the model.
- `resized / 255.0` - Rescales (normalization) the pixel data between 0 and 1.

At this point, the data can be passed to the model (`m_call`).

In [None]:
CONCRETE_INPUT = "numpy_inputs"


def _preprocess(bytes_input):
    """Decode one JPEG-compressed image into a float32 tensor sized for the model.

    Args:
        bytes_input: Scalar `tf.string` tensor holding the compressed JPEG bytes.

    Returns:
        A float32 tensor of shape IMAGE_SHAPE + (3,) with pixel values in [0, 1].
    """
    decoded = tf.io.decode_jpeg(bytes_input, channels=3)
    # convert_image_dtype rescales the uint8 pixels into [0, 1] as part of the
    # conversion, so an additional division by 255 would normalize twice.
    decoded = tf.image.convert_image_dtype(decoded, tf.float32)
    # Resize to the input shape the model was built with (IMAGE_SHAPE) instead
    # of a hard-coded size that does not match the model.
    resized = tf.image.resize(decoded, size=IMAGE_SHAPE)
    return tf.cast(resized, tf.float32)


@tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
def preprocess_fn(bytes_inputs):
    """Preprocess a batch of JPEG byte strings into the model's input dictionary.

    Args:
        bytes_inputs: 1-D `tf.string` tensor, one compressed image per element.

    Returns:
        A dictionary mapping CONCRETE_INPUT to the batch of decoded images.
    """
    decoded_images = tf.map_fn(
        _preprocess, bytes_inputs, dtype=tf.float32, back_prop=False
    )
    return {
        CONCRETE_INPUT: decoded_images
    }  # User needs to make sure the key matches model's input


@tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
def serving_fn(bytes_inputs):
    """Serving signature: preprocess the raw image bytes, then run the model.

    Args:
        bytes_inputs: 1-D `tf.string` tensor, one compressed image per element.

    Returns:
        The output of the underlying model for the batch.
    """
    images = preprocess_fn(bytes_inputs)
    prob = m_call(**images)
    return prob


# Concrete function of the model's call; its input spec must agree with what
# _preprocess produces, so both are derived from IMAGE_SHAPE. It is defined
# after serving_fn, which only looks it up when it is traced during the save.
m_call = tf.function(model.call).get_concrete_function(
    [
        tf.TensorSpec(
            shape=[None, IMAGE_SHAPE[0], IMAGE_SHAPE[1], 3],
            dtype=tf.float32,
            name=CONCRETE_INPUT,
        )
    ]
)

# Save the model with serving_fn attached as the default serving signature.
tf.saved_model.save(
    model, model_path_to_deploy, signatures={"serving_default": serving_fn}
)

## Get the serving function signature

You can get the signatures of your model's input and output layers by reloading the model into memory, and querying it for the signatures corresponding to each layer.

For your purpose, you need the signature of the serving function. Why? Well, when we send our data for prediction as a HTTP request packet, the image data is base64 encoded, and our TF.Keras model takes numpy input. Your serving function will do the conversion from base64 to a numpy array.

When making a prediction request, you need to route the request to the serving function instead of the model, so you need to know the input layer name of the serving function -- which you will use later when you make a prediction request.

In [None]:
# Reload the SavedModel to inspect the signature that was saved with it.
loaded = tf.saved_model.load(model_path_to_deploy)

# Element [1] of the structured input signature holds the keyword arguments;
# the first keyword name is the serving function's input name.
serving_input = list(
    loaded.signatures["serving_default"].structured_input_signature[1]
)[0]
print("Serving function input:", serving_input)

### Upload the model

Use this helper function `upload_model` to upload your model, stored in SavedModel format, up to the `Model` service, which will instantiate a Vertex `Model` resource instance for your model. Once you've done that, you can use the `Model` resource instance in the same way as any other Vertex `Model` resource instance, such as deploying to an `Endpoint` resource for serving predictions.

The helper function takes the following parameters:

- `display_name`: A human readable name for the `Model` resource.
- `image_uri`: The container image for the model deployment.
- `model_uri`: The Cloud Storage path to our SavedModel artifact. For this tutorial, this is the Cloud Storage location where you saved the model artifacts, which you specified in the variable `model_path_to_deploy`.

The helper function calls the `Model` client service's method `upload_model`, which takes the following parameters:

- `parent`: The Vertex location root path for `Dataset`, `Model` and `Endpoint` resources.
- `model`: The specification for the Vertex `Model` resource instance.

Let's now dive deeper into the Vertex model specification `model`. This is a dictionary object that consists of the following fields:

- `display_name`: A human readable name for the `Model` resource.
- `metadata_schema_uri`: Since your model was built without a Vertex `Dataset` resource, you will leave this blank (`''`).
- `artifact_uri`: The Cloud Storage path where the model is stored in SavedModel format.
- `container_spec`: This is the specification for the Docker container that will be installed on the `Endpoint` resource, from which the `Model` resource will serve predictions. Use the variable you set earlier `DEPLOY_GPU != None` to use a GPU; otherwise only a CPU is allocated.

Uploading a model into a Vertex Model resource returns a long running operation, since it may take a few moments. You call response.result(), which is a synchronous call and will return when the Vertex Model resource is ready.

The helper function returns the Vertex fully qualified identifier for the corresponding Vertex Model instance upload_model_response.model. You will save the identifier for subsequent steps in the variable model_to_deploy_id.

In [None]:
IMAGE_URI = DEPLOY_IMAGE


def upload_model(display_name, image_uri, model_uri):
    """Upload a SavedModel as a Vertex `Model` resource and wait for completion.

    Args:
        display_name: Human readable name for the `Model` resource.
        image_uri: Container image used to serve the model.
        model_uri: Cloud Storage path of the SavedModel artifacts.

    Returns:
        The `model` field of the upload response, which identifies the
        created `Model` resource.
    """
    container_spec = {
        "image_uri": image_uri,
        "command": [],
        "args": [],
        "env": [{"name": "env_name", "value": "env_value"}],
        "ports": [{"container_port": 8080}],
        "predict_route": "",
        "health_route": "",
    }
    model_spec = {
        "display_name": display_name,
        "metadata_schema_uri": "",
        "artifact_uri": model_uri,
        "container_spec": container_spec,
    }

    operation = clients["model"].upload_model(parent=PARENT, model=model_spec)
    print("Long running operation:", operation.operation.name)

    # Block until the upload finishes (at most 180 seconds).
    uploaded = operation.result(timeout=180)
    print("upload_model_response")
    print(" model:", uploaded.model)
    return uploaded.model


model_to_deploy_id = upload_model(
    "flowers-" + TIMESTAMP, IMAGE_URI, model_path_to_deploy
)

### Get `Model` resource information

Now let's get the model information for just your model. Use this helper function `get_model`, with the following parameter:

- `name`: The Vertex unique identifier for the `Model` resource.

This helper function calls the Vertex `Model` client service's method `get_model`, with the following parameter:

- `name`: The Vertex unique identifier for the `Model` resource.

In [None]:
def get_model(name):
    """Fetch the `Model` resource identified by `name` and print it."""
    print(clients["model"].get_model(name=name))


get_model(model_to_deploy_id)

## Deploy the `Model` resource

Now deploy the trained Vertex custom `Model` resource. This requires two steps:

1. Create an `Endpoint` resource for deploying the `Model` resource to.

2. Deploy the `Model` resource to the `Endpoint` resource.

### Create an `Endpoint` resource

Use this helper function `create_endpoint` to create an endpoint to deploy the model to for serving predictions, with the following parameter:

- `display_name`: A human readable name for the `Endpoint` resource.

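The helper function uses the endpoint client service's `create_endpoint` method, which takes the following parameters:

- `parent`: The Vertex location root path for `Dataset`, `Model` and `Endpoint` resources.
- `endpoint`: The specification for the `Endpoint` resource, which here contains only the `display_name` (a human readable name for the `Endpoint` resource).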

Creating an `Endpoint` resource returns a long running operation, since it may take a few moments to provision the `Endpoint` resource for serving. You call `response.result()`, which is a synchronous call and will return when the Endpoint resource is ready. The helper function returns the resulting `Endpoint` resource; its Vertex fully qualified identifier is available as `result.name`.

In [None]:
ENDPOINT_NAME = "flowers_endpoint-" + TIMESTAMP


def create_endpoint(display_name):
    """Create an `Endpoint` resource and wait until it is ready.

    Args:
        display_name: Human readable name for the `Endpoint` resource.

    Returns:
        The result of the completed create operation (the created endpoint).
    """
    operation = clients["endpoint"].create_endpoint(
        parent=PARENT, endpoint={"display_name": display_name}
    )
    print("Long running operation:", operation.operation.name)

    # Block until the long running operation finishes (at most 300 seconds).
    created = operation.result(timeout=300)
    print("result")
    for field in (
        "name",
        "display_name",
        "description",
        "labels",
        "create_time",
        "update_time",
    ):
        print(" {}:".format(field), getattr(created, field))
    return created


result = create_endpoint(ENDPOINT_NAME)

Now get the unique identifier for the `Endpoint` resource you created.

In [None]:
# Fully qualified resource name of the endpoint
endpoint_id = result.name
# Short ID: the final path segment of the fully qualified name
endpoint_short_id = endpoint_id.rsplit("/", 1)[-1]

print(endpoint_id)

### Compute instance scaling

You have several choices on scaling the compute instances for handling your online prediction requests:

- Single Instance: The online prediction requests are processed on a single compute instance.
  - Set the minimum (`MIN_NODES`) and maximum (`MAX_NODES`) number of compute instances to one.

- Manual Scaling: The online prediction requests are split across a fixed number of compute instances that you manually specified.
  - Set the minimum (`MIN_NODES`) and maximum (`MAX_NODES`) number of compute instances to the same number of nodes. When a model is first deployed to the instance, the fixed number of compute instances are provisioned and online prediction requests are evenly distributed across them.

- Auto Scaling: The online prediction requests are split across a scaleable number of compute instances.
  - Set the minimum (`MIN_NODES`) number of compute instances to provision when a model is first deployed and to de-provision, and set the maximum (`MAX_NODES`) number of compute instances to provision, depending on load conditions.

The minimum number of compute instances corresponds to the field `min_replica_count` and the maximum number of compute instances corresponds to the field `max_replica_count`, in your subsequent deployment request.

In [None]:
# Single-instance serving: minimum and maximum replica counts are both one.
MIN_NODES = 1
MAX_NODES = 1

### Deploy `Model` resource to the `Endpoint` resource

Use this helper function `deploy_model` to deploy the `Model` resource to the `Endpoint` resource you created for serving predictions, with the following parameters:

- `model`: The Vertex fully qualified model identifier of the uploaded model to deploy.
- `deployed_model_display_name`: A human readable name for the deployed model.
- `endpoint`: The Vertex fully qualified endpoint identifier to deploy the model to.

The helper function calls the `Endpoint` client service's method `deploy_model`, which takes the following parameters:

- `endpoint`: The Vertex fully qualified `Endpoint` resource identifier to deploy the `Model` resource to.
- `deployed_model`: The requirements specification for deploying the model.
- `traffic_split`: Percent of traffic at the endpoint that goes to this model, which is specified as a dictionary of one or more key/value pairs.
   - If only one model, then specify as **{ "0": 100 }**, where "0" refers to this model being uploaded and 100 means 100% of the traffic.
   - If there are existing models on the endpoint, for which the traffic will be split, then use `model_id` to specify as **{ "0": percent, model_id: percent, ... }**, where `model_id` is the model id of an existing model to the deployed endpoint. The percents must add up to 100.

Let's now dive deeper into the `deployed_model` parameter. This parameter is specified as a Python dictionary with the minimum required fields:

- `model`: The Vertex fully qualified model identifier of the (upload) model to deploy.
- `display_name`: A human readable name for the deployed model.
- `disable_container_logging`: This disables logging of container events, such as execution failures (default is container logging is enabled). Container logging is typically enabled when debugging the deployment and then disabled when deployed for production.
- `dedicated_resources`: This refers to how many compute instances (replicas) that are scaled for serving prediction requests.
  - `machine_spec`: The compute instance to provision. Use the variable you set earlier `DEPLOY_GPU != None` to use a GPU; otherwise only a CPU is allocated.
  - `min_replica_count`: The number of compute instances to initially provision, which you set earlier as the variable `MIN_NODES`.
  - `max_replica_count`: The maximum number of compute instances to scale to, which you set earlier as the variable `MAX_NODES`.

#### Traffic Split

Let's now dive deeper into the `traffic_split` parameter. This parameter is specified as a Python dictionary. This might at first be a bit confusing. To explain: you can deploy more than one instance of your model to an endpoint, and then set how much (percent) of the traffic goes to each instance.

Why would you do that? Perhaps you already have a previous version deployed in production -- let's call that v1. You got better model evaluation on v2, but you don't know for certain that it is really better until you deploy to production. So in the case of traffic split, you might want to deploy v2 to the same endpoint as v1, but it only gets, say, 10% of the traffic. That way, you can monitor how well it does without disrupting the majority of users -- until you make a final decision.

#### Response

The method returns a long running operation `response`. We will wait synchronously for the operation to complete by calling `response.result()`, which will block until the model is deployed. If this is the first time a model is deployed to the endpoint, it may take a few additional minutes to complete provisioning of resources.

In [None]:
DEPLOYED_NAME = "flowers_deployed-" + TIMESTAMP


def deploy_model(
    model, deployed_model_display_name, endpoint, traffic_split={"0": 100}
):
    """Deploy a model to an endpoint and block until the deployment finishes.

    Args:
        model: Identifier of the model to deploy; used as the ``model`` field
            of the deployed-model specification.
        deployed_model_display_name: Display name given to the deployment.
        endpoint: Identifier of the endpoint that receives the deployment.
        traffic_split: Mapping forwarded unchanged to the endpoint service's
            ``deploy_model`` call.

    Returns:
        The ``id`` of the deployed model reported by the completed operation.
    """
    # Accelerator fields are filled in only when a GPU was configured; a
    # CPU-only deployment explicitly requests zero accelerators.
    machine_spec = {"machine_type": DEPLOY_COMPUTE}
    if DEPLOY_GPU:
        machine_spec["accelerator_type"] = DEPLOY_GPU
        machine_spec["accelerator_count"] = DEPLOY_NGPU
    else:
        machine_spec["accelerator_count"] = 0

    dedicated_resources = {
        "min_replica_count": MIN_NODES,
        "max_replica_count": MAX_NODES,
        "machine_spec": machine_spec,
    }
    deployment_spec = {
        "model": model,
        "display_name": deployed_model_display_name,
        "dedicated_resources": dedicated_resources,
        "disable_container_logging": False,
    }

    endpoint_client = clients["endpoint"]
    operation = endpoint_client.deploy_model(
        endpoint=endpoint, deployed_model=deployment_spec, traffic_split=traffic_split
    )

    print("Long running operation:", operation.operation.name)
    # result() blocks until the long-running operation has completed.
    outcome = operation.result()
    print("result")
    deployed = outcome.deployed_model
    print(" deployed_model")
    print("  id:", deployed.id)
    print("  model:", deployed.model)
    print("  display_name:", deployed.display_name)
    print("  create_time:", deployed.create_time)

    return deployed.id


# Deploy the model and keep the returned deployment id for the undeploy step.
deployed_model_id = deploy_model(
    model=model_to_deploy_id,
    deployed_model_display_name=DEPLOYED_NAME,
    endpoint=endpoint_id,
)

## Make an online prediction request

Now make an online prediction request to your deployed model.

### Get test item

You will use an example image from your dataset as a test item.

In [None]:
FLOWERS_CSV = "gs://cloud-ml-data/img/flower_photos/all_data.csv"

test_images = ! gsutil cat $FLOWERS_CSV | head -n1
test_image = test_images[0].split(",")[0]
print(test_image)

### Prepare the request content

You are going to send the flowers image as a compressed JPG image, instead of the raw uncompressed bytes:

- `tf.io.read_file`: Read the compressed JPG image into memory as raw bytes.
- `base64.b64encode`: Encode the raw bytes into a base 64 encoded string.

In [None]:
import base64

# Read the file at `test_image` as raw bytes, then base64-encode them into a
# UTF-8 string. The intermediate is named `image_bytes` (not `bytes`) so the
# built-in `bytes` type is not shadowed for the rest of the kernel session.
image_bytes = tf.io.read_file(test_image)
b64str = base64.b64encode(image_bytes.numpy()).decode("utf-8")

### Send the prediction request

Ok, now you have a test image. Use this helper function `predict_image`, which takes the following parameters:

- `image`: The test image data as a base64 encoded string.
- `endpoint`: The Vertex fully qualified identifier for the `Endpoint` resource where the `Model` resource was deployed to.
- `parameters_dict`: Additional parameters for serving.

This function calls the prediction client service `predict` method with the following parameters:

- `endpoint`: The Vertex fully qualified identifier for the `Endpoint` resource where the `Model` resource was deployed to.
- `instances`: A list of instances (encoded images) to predict.
- `parameters`: Additional parameters for serving.

To pass the image data to the prediction service, in the previous step you encoded the bytes into base64 -- which makes the content safe from modification when transmitting binary data over the network. You need to tell the serving binary, where your model is deployed, that the content has been base64 encoded, so that it decodes the content on the other end.

Each instance in the prediction request is a dictionary entry of the form:

                        {serving_input: {'b64': content}}

- `serving_input`: The name of the input layer of the underlying model.
- `'b64'`: A key that indicates the content is base64 encoded.
- `content`: The compressed JPG image bytes as a base64 encoded string.

Since the `predict()` service can take multiple images (instances), you will send your single image as a list of one image. As a final step, you package the instances list into Google's protobuf format -- which is what we pass to the `predict()` service.

The `response` object contains a list of predictions, where each element corresponds to the image at the same position in the request. You will see in the output for each prediction:

- `predictions`: Confidence level for the prediction, between 0 and 1, for each of the classes.

In [None]:
def predict_image(image, endpoint, parameters_dict):
    """Send one image to the endpoint for prediction and print the response.

    Args:
        image: Image content placed under the ``"b64"`` key of the instance.
        endpoint: Endpoint identifier forwarded to the prediction client.
        parameters_dict: Forwarded as ``parameters`` to the prediction client.
    """
    # The instance must conform to the deployed model's prediction input
    # schema: the serving input name maps to the b64-tagged content.
    instance = {serving_input: {"b64": image}}
    request_instances = [json_format.ParseDict(instance, Value())]

    prediction_client = clients["prediction"]
    reply = prediction_client.predict(
        endpoint=endpoint, instances=request_instances, parameters=parameters_dict
    )

    print("response")
    print(" deployed_model_id:", reply.deployed_model_id)
    results = reply.predictions
    print("predictions")
    for item in results:
        print(" prediction:", item)


# No additional serving parameters are passed for this request.
predict_image(image=b64str, endpoint=endpoint_id, parameters_dict=None)

## Undeploy the `Model` resource

Now undeploy your `Model` resource from the serving `Endpoint` resource. Use this helper function `undeploy_model`, which takes the following parameters:

- `deployed_model_id`: The model deployment identifier returned by the endpoint service when the `Model` resource was deployed to.
- `endpoint`: The Vertex fully qualified identifier for the `Endpoint` resource where the `Model` is deployed to.

This function calls the endpoint client service's method `undeploy_model`, with the following parameters:

- `deployed_model_id`: The model deployment identifier returned by the endpoint service when the `Model` resource was deployed.
- `endpoint`: The Vertex fully qualified identifier for the `Endpoint` resource where the `Model` resource is deployed.
- `traffic_split`: How to split traffic among the remaining deployed models on the `Endpoint` resource.

Since this is the only deployed model on the `Endpoint` resource, you can simply leave `traffic_split` empty by setting it to `{}`.

In [None]:
def undeploy_model(deployed_model_id, endpoint):
    """Undeploy a deployed model from an endpoint and wait for completion.

    Args:
        deployed_model_id: Deployment identifier of the model to undeploy.
        endpoint: Identifier of the endpoint the model is deployed to.

    Raises:
        Whatever the operation's ``result()`` raises if the undeploy fails.
    """
    response = clients["endpoint"].undeploy_model(
        endpoint=endpoint, deployed_model_id=deployed_model_id, traffic_split={}
    )
    print(response)
    # The call returns a long-running operation. Block until it finishes so
    # that later steps (e.g. deleting the model or endpoint) do not run while
    # the deployment still exists.
    response.result()


# Remove the deployment created earlier from its endpoint.
undeploy_model(deployed_model_id=deployed_model_id, endpoint=endpoint_id)

# Cleaning up

To clean up all GCP resources used in this project, you can [delete the GCP
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Dataset
- Pipeline
- Model
- Endpoint
- Batch Job
- Custom Job
- Hyperparameter Tuning Job
- Cloud Storage Bucket

In [None]:
delete_dataset = True
delete_pipeline = True
delete_model = True
delete_endpoint = True
delete_batchjob = True
delete_customjob = True
delete_hptjob = True
delete_bucket = True

# Delete the dataset using the Vertex fully qualified identifier for the dataset
try:
    if delete_dataset and "dataset_id" in globals():
        clients["dataset"].delete_dataset(name=dataset_id)
except Exception as e:
    print(e)

# Delete the training pipeline using the Vertex fully qualified identifier for the pipeline
try:
    if delete_pipeline and "pipeline_id" in globals():
        clients["pipeline"].delete_training_pipeline(name=pipeline_id)
except Exception as e:
    print(e)

# Delete the model using the Vertex fully qualified identifier for the model
try:
    if delete_model and "model_to_deploy_id" in globals():
        clients["model"].delete_model(name=model_to_deploy_id)
except Exception as e:
    print(e)

# Delete the endpoint using the Vertex fully qualified identifier for the endpoint
try:
    if delete_endpoint and "endpoint_id" in globals():
        clients["endpoint"].delete_endpoint(name=endpoint_id)
except Exception as e:
    print(e)

# Delete the batch job using the Vertex fully qualified identifier for the batch job
try:
    if delete_batchjob and "batch_job_id" in globals():
        clients["job"].delete_batch_prediction_job(name=batch_job_id)
except Exception as e:
    print(e)

# Delete the custom job using the Vertex fully qualified identifier for the custom job
try:
    if delete_customjob and "job_id" in globals():
        clients["job"].delete_custom_job(name=job_id)
except Exception as e:
    print(e)

# Delete the hyperparameter tuning job using the Vertex fully qualified identifier for the hyperparameter tuning job
try:
    if delete_hptjob and "hpt_job_id" in globals():
        clients["job"].delete_hyperparameter_tuning_job(name=hpt_job_id)
except Exception as e:
    print(e)

if delete_bucket and "BUCKET_NAME" in globals():
    ! gsutil rm -r $BUCKET_NAME