# Get Embeddings

In [1]:
# Enable the autoreload extension in mode 2 so that imported modules are
# re-imported before each cell executes (handy while editing local .py files).
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. This is why later cells show several outputs.
InteractiveShell.ast_node_interactivity = "all"

## Set up Azure OpenAI

In [2]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# Load variables from a local .env file (if present) into the environment so
# the API key does not have to be written into the notebook.
load_dotenv()
# Point the openai client at an Azure OpenAI resource.
openai.api_type = "azure"
# NOTE(review): the endpoint is hard-coded to one specific Azure resource;
# change it (or read it from the environment) when using another resource.
openai.api_base = "https://tutorial-openai-01-2023.openai.azure.com/"
openai.api_version = "2022-12-01"
# os.getenv returns None when OPENAI_API_KEY is not set; in that case the key
# stays unset here and the API calls further down are expected to fail.
openai.api_key = os.getenv("OPENAI_API_KEY")

True

## Load Data

In [3]:
import pandas as pd

# Read the tab-separated review dataset. `df_orig` keeps the data exactly as
# loaded; `df` is an independent copy that the rest of the notebook works on.
df_orig = pd.read_csv(
    "../data/rottentomatoes-20movies-wordcount.csv",
    sep='\t',
)
df = df_orig.copy(deep=True)
df

Unnamed: 0,Movie,Publish,Review,Date,Score,Word_Count
0,SOLO: A STAR WARS STORY,Stuff.co.nz,The formula is strong with this one.,2018-05-24,70.0,7
1,BLACK PANTHER,Gone With The Twins,Just about the same as every other Marvel title.,2020-05-12,50.0,9
2,DUNKIRK,Screen Zealots,This is one heck of a stunning war picture.,2018-12-20,80.0,9
3,KNIVES OUT,Student Edge,Don't fear: No spoilers here. All you need to ...,2019-11-26,80.0,17
4,KNIVES OUT,Deep Focus Review,"Sharp and funny, Knives Out exceeds expectatio...",2022-02-23,100.0,29
...,...,...,...,...,...,...
6635,ROGUE ONE: A STAR WARS STORY,Movie Nation,This is more like it...the 'Star Wars' movie J...,2016-12-13,75.0,13
6636,ROGUE ONE: A STAR WARS STORY,Newsday,"This ""Star Wars"" spinoff doesn't spin very far...",2016-12-13,75.0,19
6637,ROGUE ONE: A STAR WARS STORY,Metro,Boasts thin characters played by great actors ...,2016-12-13,40.0,37
6638,ROGUE ONE: A STAR WARS STORY,Den of Geek,Rogue One builds to one of the best third acts...,2016-12-13,80.0,14


## Deploy a model
References:
- https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models
- https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models#text-search-embedding


In [4]:
# Id of the model to embed with; suitable for search, context relevance and
# information retrieval.
desired_model = "text-search-davinci-doc-001"

# Look for an existing deployment of the desired model whose status is "succeeded".
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    # Ignore deployments that did not reach the "succeeded" status.
    if deployment["status"] != "succeeded":
        continue

    model = openai.Model.retrieve(deployment["model"])
    if model["id"] == desired_model:
        deployment_id = deployment["id"]
        # One usable deployment is enough: stop here so no further
        # Model.retrieve requests are made for the remaining deployments.
        break

# If the desired model is not deployed yet, deploy it.
if not deployment_id:
    # This branch is also reached when succeeded deployments exist but none of
    # them serves the desired model, so the message names the model explicitly.
    print(f'No succeeded deployment of "{desired_model}" found.')

    # Now let's create the deployment
    print(f'Creating a new deployment with model: {desired_model}')
    result = openai.Deployment.create(model=desired_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    # NOTE(review): the status of the new deployment is not checked here; it
    # may still be provisioning when the next cell runs. Confirm before use.
    print(f'Successfully created {desired_model} with deployment_id {deployment_id}')
else:
    print(f'Found a succeeded deployment of "{desired_model}" that supports text search with id: {deployment_id}.')

Found a succeeded deployment of "text-search-davinci-doc-001" that supports text search with id: deployment-c9c3379c45bc4f29bd6082f8f36ab23c.


## Get Embeddings
Reference (Azure OpenAI embeddings tutorial): https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings?tabs=bash

In [5]:
# Build the text for the first review (title line + review line) to try the
# deployment on a single example before embedding the whole dataset.
# The variable is deliberately not called `input`: a global with that name
# would shadow the built-in input() for the rest of the kernel session.
sample_text = 'Movie title: ' + df['Movie'][0] + '\n' + df['Review'][0]
sample_text

embedding = openai.Embedding.create(
    input=sample_text,
    deployment_id=deployment_id)

# Show only the length of the returned vector rather than the full response.
len(embedding["data"][0]["embedding"])

'Movie title: SOLO: A STAR WARS STORY\nThe formula is strong with this one.'

12288

In [6]:
from ratelimiter import RateLimiter

# Published limit is 120 requests per minute; at the time of development only
# 50 requests per minute were possible, hence max_calls=50 per 60 seconds.
@RateLimiter(max_calls=50, period=60)
def request_api(df, deployment_id, i):
    """Request the embedding for the review at row position ``i`` and store it in ``df``.

    The text sent to the API is the movie title line followed by the review.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Movie', 'Review' and 'embedding'. It is
        modified in place: the returned vector is written into the 'embedding'
        cell of row ``i``. The 'embedding' column presumably needs object
        dtype to hold a list per cell (verify against the caller).
    deployment_id : str
        Deployment id passed through to ``openai.Embedding.create``.
    i : int
        Zero-based row position. For the default RangeIndex this is the same
        row that label-based access would select.

    Returns
    -------
    None
        Any exception is caught and reported with ``print`` (row position and
        error), so one failing row does not abort a long-running loop. The
        'embedding' cell of a failed row is left unchanged.
    """
    try:
        # Read by position so that reading and writing address the same row
        # even when the index is not 0..n-1.
        text = 'Movie title: ' + df['Movie'].iloc[i] + '\n' + df['Review'].iloc[i]
        embedding = openai.Embedding.create(input=text, deployment_id=deployment_id)
        # Single-step positional write on the frame itself. The previous chained
        # form (df['embedding'].iloc[i] = ...) assigns to an intermediate Series,
        # which raises SettingWithCopyWarning and does not update `df` under
        # pandas Copy-on-Write.
        df.iat[i, df.columns.get_loc('embedding')] = embedding['data'][0]['embedding']
    except Exception as err:
        print(i)
        print(f"Unexpected {err=}, {type(err)=}")

In [None]:
# Give every row an empty-string placeholder; rows whose request fails keep it.
df['embedding'] = ''

# One rate-limited request per row. The full run takes over 133 minutes; for a
# quick trial, temporarily iterate over range(0, 2) instead.
for i in range(df.shape[0]):
    request_api(df, deployment_id, i)

In [None]:
# Display the frame, which now carries the 'embedding' column added above.
df

## Save data

In [10]:
# Write the reviews together with their 'embedding' column to a tab-separated
# file; the DataFrame index is not written.
# NOTE(review): list-valued cells are presumably stored as their text form in
# the CSV and need parsing when the file is read back. Confirm in the consumer.
df.to_csv(
    "../data/rottentomatoes-20movies-embeddings.csv",
    index=False,
    sep='\t',
)