In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build a fraud detection model on Vertex AI

<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/workbench/fraud_detection/fraud-detection-model.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="
https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fworkbench%2Ffraud_detection%2Ffraud-detection-model.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
   <td style="text-align: center">
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/workbench/fraud_detection/fraud-detection-model.ipynb" target='_blank'>
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/workbench/fraud_detection/fraud-detection-model.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.9

## Overview

This tutorial shows you how to build, deploy, and analyze predictions from a simple [Random Forest](https://en.wikipedia.org/wiki/Random_forest) model using tools like scikit-learn, Vertex AI, and the [What-If Tool (WIT)](https://cloud.google.com/ai-platform/prediction/docs/using-what-if-tool) on a synthetic fraud transaction dataset to solve a financial fraud detection problem.

**Note:** The What-If tool widget used in this notebook only runs in a Colab environment. It isn't explicitly supported for Vertex AI user-managed notebooks instances. 

Learn more about [Vertex AI Workbench](https://cloud.google.com/vertex-ai/docs/workbench/introduction) and [Custom training](https://cloud.google.com/vertex-ai/docs/training/overview).

### Objective

This tutorial demonstrates data analysis and model-building using a synthetic financial dataset. The model is trained to identify fraudulent cases among the transactions. Then, the trained model is deployed to a Vertex AI Endpoint and analyzed using the What-If Tool.

This tutorial uses the following Google Cloud ML services and resources:

- Vertex AI Model
- Vertex AI Endpoint

The steps performed include:

- Installation of required libraries
- Reading the dataset from a Cloud Storage bucket
- Performing exploratory analysis on the dataset
- Preprocessing the dataset
- Training a random forest model using scikit-learn
- Saving the model to a Cloud Storage bucket
- Creating a Vertex AI model resource and deploying to an endpoint
- Running the What-If Tool on test data
- Un-deploying the model and cleaning up the model resources

### Dataset


The dataset used in this tutorial is publicly available at Kaggle. See [Synthetic Financial Datasets For Fraud Detection](https://www.kaggle.com/ealaxi/paysim1).

### Costs


This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage. 

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
# Install or upgrade the Vertex AI SDK, the What-If Tool widget, and the
# fsspec/gcsfs packages (presumably what lets pandas read gs:// paths — TODO confirm).
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                witwidget \
                                fsspec \
                                gcsfs
# Pin scikit-learn to 1.2 (the serving image used later in this notebook is
# sklearn-cpu.1-2) and protobuf to 3.20.1.
! pip3 install --quiet scikit-learn==1.2 \
                        protobuf==3.20.1

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# On Colab, restart the kernel so the freshly installed package versions are
# the ones that get imported. Any other environment skips this cell's body.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Only Colab needs the interactive sign-in; other environments skip it.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK for Python

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project used for every resource created in this notebook
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# make this project the default for the gcloud CLI
! gcloud config set project $PROJECT_ID

# region passed to `gsutil mb` and `aiplatform.init` later in the notebook
LOCATION = "us-central1"  # @param {type: "string"}

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# gs:// URI of the bucket used as the SDK staging bucket and for the model artifact
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# create the bucket in LOCATION under PROJECT_ID; skip this cell if it already exists
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### Import libraries

In [None]:
import os
import pickle
import posixpath
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.cloud import aiplatform, storage
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (average_precision_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from witwidget.notebook.visualization import WitConfigBuilder, WitWidget

# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and data-conversion warnings.
warnings.filterwarnings("ignore")

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# set the default project, region, and staging bucket for the SDK calls that follow
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

## Load dataset

Load the dataset from the public CSV file in Cloud Storage using pandas.

In [None]:
# Cloud Storage path of the CSV file that holds the transaction dataset
DATASET_SOURCE_PATH = "gs://cloud-samples-data/vertex-ai/managed_notebooks/fraud_detection/fraud_detection_data.csv"
# read the whole CSV into a pandas DataFrame; `df` is the raw input for the cells below
df = pd.read_csv(DATASET_SOURCE_PATH)

## Analyze the dataset
Take a quick look at the dataset and the number of rows.

In [None]:
# report (rows, columns) before previewing the data
n_rows_cols = df.shape
print("shape : ", n_rows_cols)
# preview the first five rows
df.head(5)

Check for null values.

In [None]:
# count the missing values in each column (`isna` is the canonical spelling of `isnull`)
df.isna().sum()

Check the type of transactions involved and total amount associated with each type.

In [None]:
# how many transactions fall into each type
print(df["type"].value_counts())
# total transacted amount per type, drawn as a bar chart
amount_by_type = df.groupby("type")[["amount"]].sum()
bar_ax = amount_by_type.plot.bar()
bar_ax.set_title("Total amount per transaction type")
bar_ax.set_xlabel("Type of Transaction")
bar_ax.set_ylabel("Amount")
plt.show()

## Working with imbalanced data

Although the outcome variable "isFraud" appears to be very imbalanced, a base model can still be trained on it to check the quality of the fraudulent transactions in the data. If needed, countermeasures such as undersampling the majority class or oversampling the minority class can be considered.

In [None]:
# class balance of the label: rows per value of "isFraud"
df["isFraud"].value_counts()

Show the percentage of fraudulent transactions detected as a pie chart.

In [None]:
# number of fraudulent transactions for each value of the "isFlaggedFraud" flag
fraud_by_flag = df.groupby("isFlaggedFraud")[["isFraud"]].sum()

# draw the share of each group on a single square axes
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.set_title("% of fraud transaction detected")
fraud_by_flag.plot.pie(
    y="isFraud", ax=pie_ax, fontsize=14, shadow=False, autopct="%1.1f%%"
)
# the y-label would only repeat the column name, so blank it out
pie_ax.set_ylabel("")
pie_ax.legend(loc="upper left", labels=["Not Detected", "Detected"])
plt.show()

## Prepare data for modeling
To prepare the dataset for training, a few columns need to be dropped that contain unique data ('nameOrig','nameDest') and redundant fields ('isFlaggedFraud'). The categorical field "type" which describes the type of transaction and is important for fraud detection needs to be one-hot encoded.

In [None]:
# Drop the identifier columns and the redundant flag WITHOUT mutating `df`.
# An in-place drop made this cell fail with a KeyError when re-run, and also
# broke re-running the exploratory cells above that read "isFlaggedFraud".
transactions = df.drop(columns=["nameOrig", "nameDest", "isFlaggedFraud"])
# one-hot encode the categorical "type" field and append it to the remaining columns
X = pd.concat(
    [transactions.drop(columns="type"), pd.get_dummies(transactions["type"])],
    axis=1,
)
X.head()

Remove the outcome variable from the training data.

In [None]:
# keep the label as a single-column DataFrame (later cells index it by column name)
y = X.loc[:, ["isFraud"]]
# everything except the label is a model input
X = X.drop(columns="isFraud")

Split the data and assign 70% for training and 30% for testing. 

For splitting, you specify the following parameters to Sklearn's `train_test_split` method:

- `*arrays`: The feature array(X) and the target array(y).
- `test_size`: Percentage(float) or number(integer) of test samples.
- `random_state`: Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls.
- `stratify`: If none, no stratified sampling is performed.

As the data is imbalanced, you use stratified sampling while splitting. Learn more about [stratified sampling and other parameters for train-test-splitting](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

In [None]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
TEST_FRACTION = 0.3
SPLIT_SEED = 42

# stratify on the label so both parts keep the same fraud ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,
)
# sanity-check the resulting shapes
print(X_train.shape, X_test.shape)

## Fit a Random Forest model

Fit a simple Random Forest classifier on the preprocessed training dataset.

Note: Setting `n_jobs` to -1 while defining the `RandomForestClassifier` object allows it to parallelize the training process using all processors. 

Learn more about [Random Forest algorithm](https://en.wikipedia.org/wiki/Random_forest) and Sklearn's [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).

In [None]:
# Create the classifier. n_jobs=-1 trains on all processors; the fixed
# random_state makes the fitted model, and every metric derived from it,
# reproducible across runs (the train/test split above is already seeded).
forest = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1)
# Fit on a 1-D target: y_train is a single-column DataFrame, and passing it
# as-is makes scikit-learn emit a DataConversionWarning about a column-vector y.
forest.fit(X_train, y_train["isFraud"])

## Analyze the results

Generate the predicted classes and the probability scores on the test data.

Evaluate the model using the following metrics:

- `AP`: Average precision summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight.
- F1 Score: The F1 score is the harmonic mean of the precision and recall.
- Confusion matrix: Matrix indicating the true positives, true negatives, false positives and false negatives predicted.
- Classification report: Sklearn's classification report is a text report showing the main classification metrics like precision, recall, f1score, accuracy, and weighted and macro averages of those metrics.

Learn more about [Sklearn metrics](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).

In [None]:
# per-class probabilities and hard class predictions for the held-out set
y_prob = forest.predict_proba(X_test)
y_pred = forest.predict(X_test)

# threshold-free metric, computed from the positive-class (column 1) probabilities
ap_value = average_precision_score(y_test, y_prob[:, 1])
# threshold-dependent metric, computed from the predicted classes
f1_value = f1_score(y_test, y_pred)

print("AP :", ap_value)
print("F1 - score :", f1_value)
# confusion matrix of true labels versus predicted labels
print("Confusion_matrix : ")
print(confusion_matrix(y_test, y_pred))
# per-class precision / recall / f1 summary
print("classification_report")
print(classification_report(y_test, y_pred))

Use `RandomForestClassifier`'s `feature_importances_` attribute to get a better understanding of which features are the most useful to the model.

In [None]:
# spread of each feature's importance across the individual trees (used as error bars)
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in forest.estimators_]
)
std = per_tree_importances.std(axis=0)

# forest-level importances, labelled with the training column names
forest_importances = pd.Series(
    forest.feature_importances_, index=X_train.columns.tolist()
)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(
    title="Feature Importance for Fraud Transaction Detection Model",
    ylabel="Importance",
)
fig.tight_layout()

## Save the model to Cloud Storage

Save your model to a pickle file, and then upload it to your Cloud Storage bucket. The uploaded model path is later used for creating a model in the Vertex AI Model Registry.

Note: You can also upload the model to Vertex AI Model Registry from your local environment using the latest Vertex AI SDK for Python.

In [None]:
# save the trained model to a local pickle file
LOCAL_FILE_NAME = "model.pkl"
with open(LOCAL_FILE_NAME, "wb") as file:
    pickle.dump(forest, file)

# Upload the saved model file to Cloud Storage
BLOB_PATH = "fraud-detect-model-path-unique"  # @param {type:"string"}
# Cloud Storage object names always use "/" as the separator. os.path.join would
# insert "\" on Windows and upload to a different object than the
# f"{BUCKET_URI}/{BLOB_PATH}" directory that the model-upload cell points to.
BLOB_NAME = posixpath.join(BLOB_PATH, LOCAL_FILE_NAME)

# the storage client expects a bare bucket name, so strip the "gs://" scheme
GCS_SCHEME = "gs://"
bucket = storage.Client(PROJECT_ID).bucket(BUCKET_URI[len(GCS_SCHEME) :])
blob = bucket.blob(BLOB_NAME)
blob.upload_from_filename(LOCAL_FILE_NAME)

## Create a model in Vertex AI

Set the parameters required for model creation in Vertex AI Model Registry.

In [None]:
# display name for the model resource created by the upload cell
MODEL_DISPLAY_NAME = "fraud-detection-model-unique"  # @param {type:"string"}
# Cloud Storage directory (not the file itself) that the model pickle was uploaded to
ARTIFACT_GCS_PATH = f"{BUCKET_URI}/{BLOB_PATH}"
# Vertex AI prediction container image for scikit-learn 1.2 on CPU;
# the version matches the scikit-learn==1.2 pin in the install cell
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest"
)

Create a model resource in Vertex AI using the `Model.upload` method.

Learn more about [Vertex AI Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction).

In [None]:
# arguments for the upload: display name, artifact directory, and serving image
upload_settings = {
    "display_name": MODEL_DISPLAY_NAME,
    "artifact_uri": ARTIFACT_GCS_PATH,
    "serving_container_image_uri": SERVING_CONTAINER_IMAGE_URI,
}
# create a Vertex AI model resource from the uploaded artifact
model = aiplatform.Model.upload(**upload_settings)

# report the model's display name and resource name
for heading, value in (
    ("Display name:\n", model.display_name),
    ("Resource name:\n", model.resource_name),
):
    print(heading, value)

## Create an endpoint

Set the display name and create an endpoint for deploying the model.


In [None]:
# display name for the endpoint
ENDPOINT_DISPLAY_NAME = "fraud-detect-endpoint-unique"  # @param {type:"string"}

# create the endpoint that the model is deployed to in the next step
endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_DISPLAY_NAME)

# report the endpoint's display name and resource name
for heading, value in (
    ("Display name:\n", endpoint.display_name),
    ("Resource name:\n", endpoint.resource_name),
):
    print(heading, value)

### Deploy model to the endpoint

Set the following parameters for endpoint deployment:

- `endpoint`: The Vertex AI Endpoint resource created in the last step.
- `deployed_model_display_name`: Display name for the model. If not provided, model's display name is used.
- `machine_type`: Machine type required for serving the model on the endpoint.

In [None]:
# display name given to the model once it is deployed on the endpoint
DEPLOYED_MODEL_NAME = "fraud-detection-deployed-model"
# machine type passed to `model.deploy` for serving predictions
MACHINE_TYPE = "n1-standard-2"

Deploy the model to the created endpoint.

In [None]:
# deployment settings: target endpoint, deployed-model name, and serving machine type
deployment_settings = {
    "endpoint": endpoint,
    "deployed_model_display_name": DEPLOYED_MODEL_NAME,
    "machine_type": MACHINE_TYPE,
}
# deploy the model to the endpoint
model.deploy(**deployment_settings)

# report the model's display name and resource name, one per line
print(model.display_name, model.resource_name, sep="\n")

## What-If Tool 

The What-If Tool can be used to analyze the model predictions on test data. See a [brief introduction to the What-If Tool](https://pair-code.github.io/what-if-tool/). 

In this tutorial, the What-If Tool is configured and run with the locally trained model as well as the model deployed on Vertex AI Endpoint.

[WitConfigBuilder](https://github.com/PAIR-code/what-if-tool/blob/master/witwidget/notebook/visualization.py#L30) provides the `set_ai_platform_model()` method to configure the What-If Tool with a model deployed as a version on AI Platform models. This method currently supports only AI Platform models, not Vertex AI models. However, you can instead pass a custom prediction function through the `set_custom_predict_fn()` method: either the locally trained model's prediction function or a function that returns predictions from a Vertex AI model.

### Prepare test samples

Set some samples aside from the test data for both the available classes (Fraud/not-Fraud) to analyze the model using the What-If Tool.

In [None]:
# number of examples to draw from each class
SAMPLE = 10
# Seed for the draws below. Without it every run picks different rows, so the
# What-If Tool results cannot be reproduced or compared between runs.
SAMPLE_SEED = 42

# collect samples for each class-label from the test data
pos_samples = (
    y_test[y_test["isFraud"] == 1].sample(SAMPLE, random_state=SAMPLE_SEED).index
)
neg_samples = (
    y_test[y_test["isFraud"] == 0].sample(SAMPLE, random_state=SAMPLE_SEED).index
)
# labels first (fraud rows, then non-fraud rows), then the matching feature rows
test_samples_y = pd.concat([y_test.loc[pos_samples], y_test.loc[neg_samples]])
test_samples_X = X_test.loc[test_samples_y.index].copy()

### Running the What-If Tool on the local model

Build the What-If Tool widget using the local model's `predict_proba` function.

The following step generates an interactive widget for analyzing the predictions. 

Note: The following cell only runs in a Colab environment, where the What-If Tool is supported.

In [None]:
# check for Colab environment
IS_COLAB = "google.colab" in sys.modules

# run what-if tool
if IS_COLAB:
    # name of the label column and the display names for classes 0 and 1
    TARGET_FEATURE = "isFraud"
    LABEL_VOCAB = ["not-fraud", "fraud"]

    # Combine the features and labels into one array for the What-If Tool
    test_examples = np.hstack(
        (test_samples_X.to_numpy(), test_samples_y.to_numpy().reshape(-1, 1))
    )

    # Configure the WIT to run on the locally trained model. predict_proba
    # already returns one [P(class 0), P(class 1)] row per example, so it is
    # passed directly (the previously defined but unused adapter was removed).
    config_builder = (
        WitConfigBuilder(
            test_examples.tolist(),
            # the label column name comes from TARGET_FEATURE, not a repeated literal
            test_samples_X.columns.tolist() + [TARGET_FEATURE],
        )
        .set_custom_predict_fn(forest.predict_proba)
        .set_target_feature(TARGET_FEATURE)
        .set_label_vocab(LABEL_VOCAB)
    )

    # display the WIT widget
    display(WitWidget(config_builder, height=600))

### Running the What-If Tool on the deployed Vertex AI model

In this step, you define a function that sends requests to the model deployed on the endpoint and returns the formatted predictions. The function is then used to build the What-If Tool widget for analyzing the predictions.

In [None]:
# run the what-if tool
if IS_COLAB:
    # label column and the display names for its two classes
    TARGET_FEATURE = "isFraud"
    LABEL_VOCAB = ["not-fraud", "fraud"]

    def endpoint_predict_sample(instances: list):
        """Send `instances` to the deployed endpoint and format the response.

        Args:
            instances: Examples to predict on, forwarded unchanged to
                `endpoint.predict`.

        Returns:
            One `[1 - p, p]` pair per returned prediction `p`.
            NOTE(review): presumably `p` is the predicted class (0 or 1)
            rather than a probability — confirm against the serving container.
        """
        response = endpoint.predict(instances=instances)
        return [[1 - value, value] for value in response.predictions]

    # features and the label column side by side, one row per example
    feature_matrix = test_samples_X.to_numpy()
    label_column = test_samples_y.to_numpy().reshape(-1, 1)
    test_examples = np.hstack((feature_matrix, label_column))
    column_names = test_samples_X.columns.tolist() + ["isFraud"]

    # configure the WIT to call the endpoint-backed prediction function
    config_builder = WitConfigBuilder(test_examples.tolist(), column_names)
    config_builder = config_builder.set_custom_predict_fn(endpoint_predict_sample)
    config_builder = config_builder.set_target_feature(TARGET_FEATURE)
    config_builder = config_builder.set_label_vocab(LABEL_VOCAB)

    # render the WIT widget
    display(WitWidget(config_builder, height=400))

## Cleaning up


To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Vertex AI Endpoint
- Vertex AI Model
- Cloud Storage bucket

In [None]:
# undeploy every model from the endpoint; this is done before deleting the endpoint
endpoint.undeploy_all()

# delete the endpoint
endpoint.delete()

# delete the model
model.delete()

# delete the bucket
# The bucket is kept by default; set delete_bucket to True to remove it and
# everything in it (`gsutil rm -r` is recursive).
delete_bucket = False
if delete_bucket:
    ! gsutil rm -r $BUCKET_URI

# delete the local files
# removes the local pickle file written by the save-model cell
! rm $LOCAL_FILE_NAME