In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Visualizing embedding similarity from text documents using t-SNE plots

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/embeddings/embedding-similarity-visualization.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
|Author(s) | [Gabe Rives-Corbett](https://github.com/grivescorbett) |

This notebook demonstrates how vector similarity is relevant to LLM-generated embeddings. You will embed a collection of labelled documents and then plot the embeddings on a two-dimensional t-SNE plot to observe how similar documents tend to cluster together based on their embeddings.

## Getting started

### Install libraries

In [None]:
# Install or upgrade the Gen AI SDK plus the data and plotting libraries used below.
# NOTE(review): later cells also import tqdm, matplotlib, numpy and google.api_core,
# which are not listed here — presumably supplied by the runtime or pulled in as
# transitive dependencies; verify on a clean environment.
%pip install --upgrade google-genai scikit-learn pandas seaborn

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the cell below. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# The Colab package shows up in sys.modules only when running on Colab.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Imported lazily because the package does not exist outside Colab.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

from google import genai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# Fall back to the environment when the form field is blank or still the placeholder.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Keep a missing/empty variable as None: wrapping the lookup in str() would
    # turn it into the literal project name "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or None

# Region for Vertex AI requests; defaults to us-central1 when the variable is unset.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Shared Gen AI client, configured to talk to Vertex AI.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Import libraries

In [None]:
import re

from google.api_core import retry
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects (used later when embedding each document).
tqdm.pandas()

## Fetch and clean the data

In this example, you will use the open source [20 Newsgroups](http://qwone.com/~jason/20Newsgroups/) dataset, a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

In [None]:
# Restrict the dataset to four topically distinct newsgroups.
NEWSGROUP_CATEGORIES = ["comp.graphics", "sci.space", "sci.med", "rec.sport.hockey"]

newsgroups = fetch_20newsgroups(categories=NEWSGROUP_CATEGORIES)

In [None]:
# Translate each integer class id into its newsgroup name.
target_labels = [newsgroups.target_names[class_id] for class_id in newsgroups.target]

# One row per document: the raw text and its newsgroup name.
raw_data = pd.DataFrame({"text": newsgroups.data, "target": target_labels})

Because of the 8k input token limit, in this example you will exclude all documents that have a length outside this limit.

Because a token is typically one or more characters long, a document with at most 8000 characters will also contain at most 8000 tokens. For simplicity, you can therefore just filter for documents that have <= 8000 _characters_.

In [None]:
# Character budget used as a simple stand-in for the token limit.
MAX_CHARACTERS = 8000

# Keep only the documents whose text fits within the character budget.
fits_limit = raw_data["text"].str.len() <= MAX_CHARACTERS
raw_data = raw_data.loc[fits_limit]

Subsample the dataset into 500 data points, stratified on the label

In [None]:
# Draw a 500-document sample stratified on the newsgroup label. Only the "train"
# side of the split is kept; the remainder is discarded.
# A fixed seed keeps the sample (and therefore the final plot) reproducible
# across kernel restarts.
x_subsample, _, y_subsample, _ = train_test_split(
    raw_data["text"],
    raw_data["target"],
    stratify=raw_data["target"],
    train_size=500,
    random_state=0,
)

Clean the text by removing emails, names, etc. This will help improve the data that will then be converted into embeddings.

In [None]:
_EMAIL_PATTERN = re.compile(r"[\w\.-]+@[\w\.-]+")
_PARENTHESISED_PATTERN = re.compile(r"\([^()]*\)")


def _scrub(document):
    """Strip e-mail addresses, parenthesised text and header labels from one document."""
    document = _EMAIL_PATTERN.sub("", document)  # Remove email
    document = _PARENTHESISED_PATTERN.sub("", document)  # Remove names given in parentheses
    document = document.replace("From: ", "")  # Remove "From: "
    return document.replace("\nSubject: ", "")  # Remove "\nSubject: "


x_subsample = [_scrub(document) for document in x_subsample]

In [None]:
# Pair every cleaned document with its newsgroup label in a fresh, 0-indexed frame.
sampled_labels = list(y_subsample)
df = pd.DataFrame({"text": x_subsample, "target": sampled_labels})

You now have 500 data points roughly evenly distributed across the categories:

In [None]:
# Number of sampled documents per newsgroup.
target_counts = df["target"].value_counts()
target_counts

## Create and visualize the embeddings using a t-SNE plot

Load the text embedding model from Vertex AI ([documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings)).

Since we are using these embeddings for visualization, we will set the [task type](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types) to clustering.

In [None]:
# Identifier of the text embedding model passed to every embedding request below.
MODEL_ID = "text-embedding-005"

In [None]:
# Retrieve embeddings from the specified model with retry logic
def get_embeddings(task_type: str = "CLUSTERING"):
    """Build a function that embeds a single text with `MODEL_ID`.

    Args:
        task_type: Embedding task type sent with every request. Defaults to
            "CLUSTERING" because the embeddings are only used to visualize how
            documents group together.

    Returns:
        A callable that maps one text to its embedding vector (a list of
        floats). Rate-limit (429) and service-unavailable (503) errors are
        retried for up to 300 seconds.
    """

    def is_retriable(exc: Exception) -> bool:
        # The Gen AI SDK raises its own APIError rather than google.api_core
        # exceptions, so the default Retry predicate would never match.
        return isinstance(exc, genai.errors.APIError) and exc.code in {429, 503}

    @retry.Retry(predicate=is_retriable, timeout=300.0)
    def embed_fn(contents: str) -> list[float]:
        response = client.models.embed_content(
            model=MODEL_ID,
            contents=contents,
            config=genai.types.EmbedContentConfig(task_type=task_type),
        )
        # A single input yields a single embedding.
        return response.embeddings[0].values

    return embed_fn

Create the embeddings. This may take a minute or two.

In [None]:
# Embed each document in turn (one request per row) while showing a progress bar.
embed_text = get_embeddings()
df["embeddings"] = df["text"].progress_apply(embed_text)

In [None]:
# Preview the first five rows, now including the embeddings column.
df.head(n=5)

The vectors generated by our model have 768 dimensions, so visualizing them directly is impossible. Instead, you can use [t-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) to reduce them to 2 dimensions.

In [None]:
# `n_iter` was renamed to `max_iter` in scikit-learn 1.5 and later removed, so
# passing it fails on current releases. 1000 iterations is the default under
# either name, so the argument is omitted to work across versions.
tsne = TSNE(random_state=0)

# Stack the per-document embedding lists into one (documents x dimensions) matrix.
embedding_matrix = np.array(df["embeddings"].to_list(), dtype=np.float32)
tsne_results = tsne.fit_transform(embedding_matrix)

In [None]:
# Wrap the 2-D coordinates in a frame and attach the newsgroup labels from df
# (both frames share the same default row index, so rows line up one-to-one).
df_tsne = pd.DataFrame(tsne_results, columns=["TSNE1", "TSNE2"]).assign(
    target=df["target"]
)

In [None]:
# Preview the first five projected points with their labels.
df_tsne.head(n=5)

Plot the data points. It should now be visually clear how the documents from the same newsgroup show up close to each other in the vector space with text embeddings.

In [None]:
# Apply the style before creating the figure: axes pick up style settings when
# they are created, so styling afterwards would not affect this plot.
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8, 6))  # Set figsize

# Draw explicitly on `ax` rather than relying on the implicit current axes.
sns.scatterplot(
    data=df_tsne, x="TSNE1", y="TSNE2", hue="target", palette="hls", ax=ax
)
# Place the legend outside the plotting area so it does not cover any points.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.set_title("Scatter plot of news using t-SNE")
ax.set_xlabel("TSNE1")
ax.set_ylabel("TSNE2")
ax.axis("equal")
plt.show()