use_cases/archive/movie_reviews/notebooks/01-get-embeddings.ipynb (450 lines of code) (raw):

# %% [markdown]
# # Get Embeddings

# %%
# Notebook conveniences, written as plain calls (the script form of the
# `%load_ext autoreload` / `%autoreload 2` line magics) so the cell is valid Python.
get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("autoreload", "2")

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# %% [markdown]
# ## Set up Azure OpenAI

# %%
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
# load_dotenv() is left as a bare expression so its boolean result is still
# displayed as the cell output.
load_dotenv()
openai.api_type = "azure"
openai.api_base = "https://tutorial-openai-01-2023.openai.azure.com/"
openai.api_version = "2022-12-01"

# Fail fast when the key is missing: os.getenv() returns None in that case and
# the problem would otherwise only surface later, on the first API request.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )
openai.api_key = api_key

# %% [markdown]
# ## Load Data
<th>1</th>\n", " <td>BLACK PANTHER</td>\n", " <td>Gone With The Twins</td>\n", " <td>Just about the same as every other Marvel title.</td>\n", " <td>2020-05-12</td>\n", " <td>50.0</td>\n", " <td>9</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>DUNKIRK</td>\n", " <td>Screen Zealots</td>\n", " <td>This is one heck of a stunning war picture.</td>\n", " <td>2018-12-20</td>\n", " <td>80.0</td>\n", " <td>9</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>KNIVES OUT</td>\n", " <td>Student Edge</td>\n", " <td>Don't fear: No spoilers here. All you need to ...</td>\n", " <td>2019-11-26</td>\n", " <td>80.0</td>\n", " <td>17</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>KNIVES OUT</td>\n", " <td>Deep Focus Review</td>\n", " <td>Sharp and funny, Knives Out exceeds expectatio...</td>\n", " <td>2022-02-23</td>\n", " <td>100.0</td>\n", " <td>29</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>6635</th>\n", " <td>ROGUE ONE: A STAR WARS STORY</td>\n", " <td>Movie Nation</td>\n", " <td>This is more like it...the 'Star Wars' movie J...</td>\n", " <td>2016-12-13</td>\n", " <td>75.0</td>\n", " <td>13</td>\n", " </tr>\n", " <tr>\n", " <th>6636</th>\n", " <td>ROGUE ONE: A STAR WARS STORY</td>\n", " <td>Newsday</td>\n", " <td>This \"Star Wars\" spinoff doesn't spin very far...</td>\n", " <td>2016-12-13</td>\n", " <td>75.0</td>\n", " <td>19</td>\n", " </tr>\n", " <tr>\n", " <th>6637</th>\n", " <td>ROGUE ONE: A STAR WARS STORY</td>\n", " <td>Metro</td>\n", " <td>Boasts thin characters played by great actors ...</td>\n", " <td>2016-12-13</td>\n", " <td>40.0</td>\n", " <td>37</td>\n", " </tr>\n", " <tr>\n", " <th>6638</th>\n", " <td>ROGUE ONE: A STAR WARS STORY</td>\n", " <td>Den of Geek</td>\n", " <td>Rogue One builds to one of the best third acts...</td>\n", " <td>2016-12-13</td>\n", " <td>80.0</td>\n", " <td>14</td>\n", " 
</tr>\n", " <tr>\n", " <th>6639</th>\n", " <td>ROGUE ONE: A STAR WARS STORY</td>\n", " <td>We Got This Covered</td>\n", " <td>Rogue One makes up for a shaky first act by pu...</td>\n", " <td>2016-12-13</td>\n", " <td>70.0</td>\n", " <td>26</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>6640 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " Movie Publish \\\n", "0 SOLO: A STAR WARS STORY Stuff.co.nz \n", "1 BLACK PANTHER Gone With The Twins \n", "2 DUNKIRK Screen Zealots \n", "3 KNIVES OUT Student Edge \n", "4 KNIVES OUT Deep Focus Review \n", "... ... ... \n", "6635 ROGUE ONE: A STAR WARS STORY Movie Nation \n", "6636 ROGUE ONE: A STAR WARS STORY Newsday \n", "6637 ROGUE ONE: A STAR WARS STORY Metro \n", "6638 ROGUE ONE: A STAR WARS STORY Den of Geek \n", "6639 ROGUE ONE: A STAR WARS STORY We Got This Covered \n", "\n", " Review Date Score \\\n", "0 The formula is strong with this one. 2018-05-24 70.0 \n", "1 Just about the same as every other Marvel title. 2020-05-12 50.0 \n", "2 This is one heck of a stunning war picture. 2018-12-20 80.0 \n", "3 Don't fear: No spoilers here. All you need to ... 2019-11-26 80.0 \n", "4 Sharp and funny, Knives Out exceeds expectatio... 2022-02-23 100.0 \n", "... ... ... ... \n", "6635 This is more like it...the 'Star Wars' movie J... 2016-12-13 75.0 \n", "6636 This \"Star Wars\" spinoff doesn't spin very far... 2016-12-13 75.0 \n", "6637 Boasts thin characters played by great actors ... 2016-12-13 40.0 \n", "6638 Rogue One builds to one of the best third acts... 2016-12-13 80.0 \n", "6639 Rogue One makes up for a shaky first act by pu... 2016-12-13 70.0 \n", "\n", " Word_Count \n", "0 7 \n", "1 9 \n", "2 9 \n", "3 17 \n", "4 29 \n", "... ... 
# %%
import pandas as pd

# Keep the frame exactly as read from disk in df_orig; every later change
# (the added 'embedding' column) is made on the working copy df.
df_orig = pd.read_csv("../data/rottentomatoes-20movies-wordcount.csv", sep='\t')
df = df_orig.copy()
df

# %% [markdown]
# ## Deploy a model
# ref:
# - https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models
# - https://learn.microsoft.com/en-us/azure/cognitive-services/openai/concepts/models#text-search-embedding

# %%
# id of desired_model
desired_model = "text-search-davinci-doc-001"  # suitable for Search, context relevance, information retrieval

# Look for an existing deployment of the desired model whose status is "succeeded".
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    if deployment["status"] != "succeeded":
        continue

    model = openai.Model.retrieve(deployment["model"])
    if model["id"] == desired_model:
        deployment_id = deployment["id"]
        # One usable deployment is enough; stop here instead of issuing a
        # Model.retrieve request for every remaining deployment.
        break

# If no matching model is deployed, deploy one.
if not deployment_id:
    print('No deployment with status: succeeded found.')

    # Now let's create the deployment
    print(f'Creating a new deployment with model: {desired_model}')
    result = openai.Deployment.create(model=desired_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    print(f'Successfully created {desired_model} with deployment_id {deployment_id}')
else:
    print(f'Found a succeeded deployment of "{desired_model}" that supports text search with id: {deployment_id}.')

# %% [markdown]
# ## Get Embeddings
# ref: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings?tabs=bash

# %%
def _embedding_text(df, i):
    """Build the text that is sent to the embedding endpoint for one review.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'Movie' and 'Review'.
    i : int
        Positional (0-based) row number, independent of the frame's index labels.

    Returns
    -------
    str
        'Movie title: <Movie>' followed by a newline and the review text.
    """
    return 'Movie title: ' + df['Movie'].iloc[i] + '\n' + df['Review'].iloc[i]

# %%
# Smoke test on the first review: show the request text and the length of the
# returned vector. The text is held in `sample_text` so the builtin `input`
# is not shadowed for the rest of the kernel session.
sample_text = _embedding_text(df, 0)
sample_text

embedding = openai.Embedding.create(
    input=sample_text,
    deployment_id=deployment_id)

# embedding
len(embedding["data"][0]["embedding"])

# %%
from ratelimiter import RateLimiter

@RateLimiter(max_calls=50, period=60)  # Published limit is 120 requests per minute, at the time of development, only 50 requests per minute is possible.
def request_api(df, deployment_id, i):
    """Embed the review in positional row ``i`` and store the vector in ``df['embedding']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Movie' and 'Review' columns and an existing object-dtype
        'embedding' column. It is modified in place. Index labels are assumed
        to be unique (true for the default RangeIndex produced by read_csv).
    deployment_id : str
        Id of the Azure OpenAI deployment to send the request to.
    i : int
        Positional (0-based) row number to process.

    Returns
    -------
    bool
        True if the embedding was stored, False if any step failed. Failures
        are printed and never raised, so a long batch keeps going.
    """
    try:
        text = _embedding_text(df, i)
        embedding = openai.Embedding.create(input=text, deployment_id=deployment_id)
        # Assign with one .at call on the frame itself. The previous
        # df['embedding'].iloc[i] = ... form is chained assignment, which may
        # write to a temporary object and leave df unchanged.
        df.at[df.index[i], 'embedding'] = embedding['data'][0]['embedding']
    except Exception as err:
        # Deliberately best-effort: report the row and carry on with the batch.
        print(i)
        print(f"Unexpected {err=}, {type(err)=}")
        return False
    return True

# %%
# Create the destination column only once, so that re-running this cell after
# an interruption resumes the batch instead of discarding finished rows.
# dtype=object lets a cell hold a whole embedding list; '' marks "not embedded yet".
if 'embedding' not in df.columns:
    df['embedding'] = pd.Series('', index=df.index, dtype=object)

row_limit = None  # set to a small integer (e.g. 2) for a quick trial run
n_rows = len(df) if row_limit is None else min(row_limit, len(df))

failed_rows = []
for i in range(n_rows):  # The full run takes over 133 minutes.
    if not isinstance(df['embedding'].iloc[i], str):
        continue  # a vector was already stored for this row by an earlier run
    if not request_api(df, deployment_id, i):
        failed_rows.append(i)

if failed_rows:
    print(f"{len(failed_rows)} row(s) failed and still need an embedding: {failed_rows}")

# %%
df

# %% [markdown]
# ## Save data

# %%
# Write the working frame, including its 'embedding' column, to a
# tab-separated file next to the input data. The index is not written.
df.to_csv(
    path_or_buf="../data/rottentomatoes-20movies-embeddings.csv",
    sep='\t',
    index=False,
)