colab-enterprise/Oracle-Data-Generation.ipynb (2,135 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "S4eg4Ohk_87K" }, "source": [ "### <font color='#4285f4'>Overview</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "QUVVFi55_-Mr" }, "source": [ "This notebook will generate menu data and vector embeddings for the menu items and ingredients tables. This data is already loaded by the intialize stored procedure. You can drop the table to run those cells.\n", "\n", "The Search Embeddings and Menu Search with re-ranking will search the data using the embeddings. The Re-ranking will combine two different searches into one and then re-rank the search results so we get the best menu options at the top of the webpage showing the menu items.\n", "\n", "Cost:\n", "* Low: Gemini, BigQuery\n", "* Medium: Remember to stop your Colab Enterprise Notebook Runtime\n", "\n", "Author:\n", "* Adam Paternostro" ] }, { "cell_type": "markdown", "metadata": { "id": "MlFEqJmbAAFj" }, "source": [ "### <font color='#4285f4'>License</font>" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PnLIFRWNAB9E" }, "outputs": [], "source": [ "##################################################################################\n", "# Copyright 2024 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "#\n", "# https://www.apache.org/licenses/LICENSE-2.0\n", "#\n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License.\n", "###################################################################################" ] }, { "cell_type": "markdown", "metadata": { "id": "q8_XhJi8AElm" }, "source": [ "### <font color='#4285f4'>Pip Installs</font>" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "k4tF3Q0tAGFH" }, "outputs": [], "source": [ "# PIP Installs\n", "import sys\n", "\n", "# https://PLACEHOLDER.com/index.html\n", "#!{sys.executable} -m pip install PLACEHOLDER" ] }, { "cell_type": "markdown", "metadata": { "id": "1_yr2zSVAILi" }, "source": [ "### <font color='#4285f4'>Initialize</font>" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZpelyE8M_4KC" }, "outputs": [], "source": [ "from PIL import Image\n", "import IPython.display\n", "import google.auth\n", "import requests\n", "import json\n", "import uuid\n", "import base64\n", "import os\n", "import cv2\n", "import random\n", "import time\n", "import datetime\n", "import base64\n", "import random\n", "\n", "import logging\n", "from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5Aq6Q4aa_4mJ" }, "outputs": [], "source": [ "# Set these (run this cell to verify the output)\n", "\n", "bigquery_location = \"${bigquery_location}\"\n", "region = \"${region}\"\n", "location = \"${location}\"\n", "dataset_name = \"${bigquery_chocolate_ai_dataset}\"\n", "\n", "# Get the current date and time\n", "now = datetime.datetime.now()\n", "\n", "# Format the date and time as desired\n", "formatted_date = now.strftime(\"%Y-%m-%d-%H-%M\")\n", "\n", "# Get some values using gcloud\n", "project_id = 
!(gcloud config get-value project)\n", "user = !(gcloud auth list --filter=status:ACTIVE --format=\"value(account)\")\n", "\n", "if len(project_id) != 1:\n", " raise RuntimeError(f\"project_id is not set: {project_id}\")\n", "project_id = project_id[0]\n", "\n", "if len(user) != 1:\n", " raise RuntimeError(f\"user is not set: {user}\")\n", "user = user[0]\n", "\n", "print(f\"project_id = {project_id}\")\n", "print(f\"user = {user}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "U9W0pz36AUg1" }, "source": [ "### <font color='#4285f4'>Helper Methods</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "NeiSib02AW29" }, "source": [ "#### restAPIHelper\n", "Calls the Google Cloud REST API using the current users credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HUUrDGG__4oz" }, "outputs": [], "source": [ "def restAPIHelper(url: str, http_verb: str, request_body: str, project_id=None) -> str:\n", " \"\"\"Calls the Google Cloud REST API passing in the current users credentials\"\"\"\n", "\n", " import requests\n", " import google.auth\n", " import json\n", "\n", " # Get an access token based upon the current user\n", " creds, project = google.auth.default()\n", " auth_req = google.auth.transport.requests.Request()\n", " creds.refresh(auth_req)\n", " access_token=creds.token\n", "\n", " headers = {\n", " \"Content-Type\" : \"application/json\",\n", " \"Authorization\" : \"Bearer \" + access_token\n", " }\n", "\n", " # Required by some API calls\n", " if project_id != None:\n", " headers[\"x-goog-user-project\"] = project_id\n", "\n", " if http_verb == \"GET\":\n", " response = requests.get(url, headers=headers)\n", " elif http_verb == \"POST\":\n", " response = requests.post(url, json=request_body, headers=headers)\n", " elif http_verb == \"PUT\":\n", " response = requests.put(url, json=request_body, headers=headers)\n", " elif http_verb == \"PATCH\":\n", " response = requests.patch(url, json=request_body, headers=headers)\n", " elif http_verb == \"DELETE\":\n", " response = requests.delete(url, headers=headers)\n", " else:\n", " raise RuntimeError(f\"Unknown HTTP verb: {http_verb}\")\n", "\n", " if response.status_code == 200:\n", " return json.loads(response.content)\n", " #image_data = json.loads(response.content)[\"predictions\"][0][\"bytesBase64Encoded\"]\n", " else:\n", " error = f\"Error restAPIHelper -> ' Status: '{response.status_code}' Text: '{response.text}'\"\n", " raise RuntimeError(error)" ] }, { "cell_type": "markdown", "metadata": { "id": "HnBwJylyAbSo" }, "source": [ "#### RetryCondition (for retrying LLM calls)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nR1Xu--2_4xA" }, "outputs": [], "source": [ "def RetryCondition(error):\n", " error_string = str(error)\n", " print(error_string)\n", "\n", " retry_errors = [\n", " \"RESOURCE_EXHAUSTED\",\n", " \"No content in candidate\",\n", " # Add more error messages here as needed\n", " ]\n", "\n", " for retry_error in retry_errors:\n", " if retry_error in error_string:\n", " print(\"Retrying...\")\n", " return True\n", "\n", " return False" ] }, { "cell_type": "markdown", "metadata": { "id": "xK1kVO4ZAeWS" }, "source": [ "#### Gemini LLM (Pro 1.0 , Pro 1.5, Flash 2.0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "W4kkhoC7_40I" }, "outputs": [], "source": [ "@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), 
before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))\n", "def GeminiLLM(prompt, model = \"gemini-2.0-flash-001\", response_schema = None,\n", " temperature = 1, topP = 1, topK = 32):\n", "\n", " # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models\n", " # model = \"gemini-2.0-flash\"\n", "\n", " llm_response = None\n", " if temperature < 0:\n", " temperature = 0\n", "\n", " creds, project = google.auth.default()\n", " auth_req = google.auth.transport.requests.Request() # required to refresh the access token\n", " creds.refresh(auth_req)\n", " access_token=creds.token\n", "\n", " headers = {\n", " \"Content-Type\" : \"application/json\",\n", " \"Authorization\" : \"Bearer \" + access_token\n", " }\n", "\n", " # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference\n", " url = f\"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent\"\n", "\n", " generation_config = {\n", " \"temperature\": temperature,\n", " \"topP\": topP,\n", " \"maxOutputTokens\": 8192,\n", " \"candidateCount\": 1,\n", " \"responseMimeType\": \"application/json\",\n", " }\n", "\n", " # Add in the response schema when it is provided\n", " if response_schema is not None:\n", " generation_config[\"responseSchema\"] = response_schema\n", "\n", " if model == \"gemini-2.0-flash\":\n", " generation_config[\"topK\"] = topK\n", "\n", " payload = {\n", " \"contents\": {\n", " \"role\": \"user\",\n", " \"parts\": {\n", " \"text\": prompt\n", " },\n", " },\n", " \"generation_config\": {\n", " **generation_config\n", " },\n", " \"safety_settings\": {\n", " \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n", " \"threshold\": \"BLOCK_LOW_AND_ABOVE\"\n", " }\n", " }\n", "\n", " response = requests.post(url, json=payload, headers=headers)\n", "\n", " if response.status_code == 200:\n", " try:\n", " json_response = json.loads(response.content)\n", " except Exception as error:\n", " raise RuntimeError(f\"An error occurred parsing the JSON: {error}\")\n", "\n", " if \"candidates\" in json_response:\n", " candidates = json_response[\"candidates\"]\n", " if len(candidates) > 0:\n", " candidate = candidates[0]\n", " if \"content\" in candidate:\n", " content = candidate[\"content\"]\n", " if \"parts\" in content:\n", " parts = content[\"parts\"]\n", " if len(parts):\n", " part = parts[0]\n", " if \"text\" in part:\n", " text = part[\"text\"]\n", " llm_response = text\n", " else:\n", " raise RuntimeError(f\"No text in part: {response.content}\")\n", " else:\n", " raise RuntimeError(f\"Empty parts list in content: {response.content}\")\n", " else:\n", " raise RuntimeError(f\"No parts in content: {response.content}\")\n", " else:\n", " raise RuntimeError(f\"No content in candidate: {response.content}\")\n", " else:\n", " raise RuntimeError(f\"Empty candidates list: {response.content}\")\n", " else:\n", " raise RuntimeError(f\"No candidates: {response.content}\")\n", "\n", " # Remove some typical response characters (if asking for a JSON reply)\n", " llm_response = llm_response.replace(\"```json\",\"\")\n", " llm_response = llm_response.replace(\"```\",\"\")\n", " llm_response = llm_response.replace(\"\\n\",\"\")\n", "\n", " return llm_response\n", "\n", " else:\n", " raise RuntimeError(f\"Error with prompt:'{prompt}' Status:'{response.status_code}' Text:'{response.text}'\")" ] }, { "cell_type": "markdown", "metadata": { "id": "QtCLU0quRFye" }, "source": [ "#### Vertex Re-Ranking API" ] }, { 
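"cell_type": "markdown", "metadata": {}, "source": [ "Before the re-ranking helper below, a minimal usage sketch of the `GeminiLLM` helper defined above. The schema and prompt here are illustrative only and assume the Initialize cell has already set `project_id` and `location`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch only: call GeminiLLM with a small response schema and parse the JSON reply.\n", "example_schema = {\n", " \"type\": \"object\",\n", " \"required\": [\"answer\"],\n", " \"properties\": { \"answer\": { \"type\": \"string\" } }\n", "}\n", "\n", "example_response = GeminiLLM(\"Return a JSON object whose only field, answer, names one cocoa producing country.\",\n", " response_schema=example_schema, temperature=0)\n", "\n", "print(json.loads(example_response)[\"answer\"])" ] }, {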
"cell_type": "code", "execution_count": null, "metadata": { "id": "-webMRuQRGPO" }, "outputs": [], "source": [ "def vertex_ai_reranking(project_id, query, records):\n", " \"\"\"Re-ranks search results\"\"\"\n", "\n", "\n", " url = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/global/rankingConfigs/default_ranking_config:rank\"\n", "\n", " request_body = {\n", " \"model\": \"semantic-ranker-512@latest\",\n", " \"query\": query,\n", " \"records\": records\n", " }\n", "\n", " print(request_body)\n", "\n", " json_result = restAPIHelper(url, \"POST\", request_body, project_id)\n", "\n", " return json_result" ] }, { "cell_type": "markdown", "metadata": { "id": "HSVBUNPdA7rY" }, "source": [ "#### Helper Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R653dOrf_422" }, "outputs": [], "source": [ "def RunQuery(sql):\n", " import time\n", " from google.cloud import bigquery\n", " client = bigquery.Client()\n", "\n", " if (sql.startswith(\"SELECT\") or sql.startswith(\"WITH\")):\n", " df_result = client.query(sql).to_dataframe()\n", " return df_result\n", " else:\n", " job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n", " query_job = client.query(sql, job_config=job_config)\n", "\n", " # Check on the progress by getting the job's updated state.\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " while query_job.state != \"DONE\":\n", " time.sleep(2)\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " if query_job.error_result == None:\n", " return True\n", " else:\n", " raise Exception(query_job.error_result)" ] }, { "cell_type": "markdown", "metadata": { "id": "rTWhutEjBAza" }, "source": [ "### <font color='#4285f4'>BigQuery Tables (for Generated Data)</font>" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MW48pQErBBLM" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create an ingredients table which will reside in Oracle\n", "-- Drop the table to re-run the GenAI in this notebooks\n", "-- DROP TABLE IF EXISTS `chocolate_ai.generated_ingredients`;\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.generated_ingredients` AS\n", "SELECT GENERATE_UUID() AS ingredient_id,\n", " menu_id,\n", " menu_name,\n", " menu_description,\n", "\n", " CAST(NULL AS STRING) AS ingredient_prompt,\n", " CAST(NULL AS STRING) AS ingredient_information,\n", " CAST(NULL AS STRING) AS ingredient_explanation,\n", "\n", " CAST(NULL AS STRING) AS allergy_prompt,\n", " CAST(NULL AS STRING) AS allergy_information,\n", " CAST(NULL AS STRING) AS allergy_explanation,\n", "\n", " FROM `chocolate_ai.menu`;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RHabdmmyb9w4" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create an inventory table which will reside in 
Oracle\n", "-- This will show low inventory on the website\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.generated_inventory` AS\n", "SELECT GENERATE_UUID() AS inventory_id,\n", " menu_id,\n", " CAST(ROUND(RAND() * (25 - 1)) AS INT64) AS inventory_quantity,\n", " FROM `chocolate_ai.generated_ingredients`;\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l2S_jXyiglfo" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create customer allergy table that contains \"complex allergy sentenance\" which will reside in Oracle\n", "-- This will show the customer allery preferences on the website\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.generated_customer_allergy_information` AS\n", "SELECT GENERATE_UUID() AS customer_allergy_information_id,\n", " customer_id,\n", " CAST(NULL AS STRING) AS allergy_information_prompt,\n", " CAST(NULL AS STRING) AS allergy_information,\n", " CAST(NULL AS STRING) AS allergy_information_explanation,\n", " FROM `chocolate_ai.customer`;" ] }, { "cell_type": "markdown", "metadata": { "id": "DludUQ1xZwr0" }, "source": [ "### <font color='#4285f4'>Generate Data</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "-PwEXyN_ieG-" }, "source": [ "#### Create ingredients based upon menu items" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rpnEmqQqC-tu" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the data\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_ingredients` ORDER BY menu_id LIMIT 10;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OycfWxOZDMFh" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a prompt that will create a list of ingredients based upon each menu item (name and description)\n", "----------------------------------------------------------------------------------------------------------\n", "UPDATE `chocolate_ai.generated_ingredients`\n", " SET ingredient_prompt = CONCAT(\"\"\"You are a master chocolatier and expert at understanding the ingredients needed to create finely crafted chocolates. Read the below description of our chocolate and think through the ingredients step by step. 
The goal is to be able to determine allergy information so keep this in mind, but do not generate the allergy information at this time.\n", "\n", "<menu-name>\"\"\",\n", "menu_name,\n", "\"</menu-name>\",\n", "\"<menu-description>\",\n", "menu_description,\n", "\"</menu-description>\")\n", "WHERE ingredient_prompt IS NULL;\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KkIC52PvHa3e" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the generated prompts\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_ingredients` ORDER BY menu_id;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "A12whAH_I6cd" }, "outputs": [], "source": [ "%%bigquery bq_dataframe\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Select the BigQuery data into a dataframe so we can loop over in Python code\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_ingredients` WHERE ingredient_information IS NULL;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AnOfHVJCH6rs" }, "outputs": [], "source": [ "##########################################################################################################\n", "# For each item in the dataframe of ingredients call Vertex AI and execute the prompt\n", "# Save the results into the ingredients table\n", "# We could also used the BigQuery (BQML) ML.GENERATE_TEXT\n", "##########################################################################################################\n", "\n", "# Write me the json in  OpenAPI 3.0 schema object for the below object.\n", "# Make all fields required.\n", "# {\n", "# \"ingredient_information\" : \"text\",\n", "# \"ingredient_explanation\" : \"text\"\n", "# }\n", "\n", "response_schema = {\n", " \"type\": \"object\",\n", " \"required\": [\n", " \"ingredient_information\",\n", " \"ingredient_explanation\"\n", " ],\n", " \"properties\": {\n", " \"ingredient_information\": {\n", " \"type\": \"string\"\n", " },\n", " \"ingredient_explanation\": {\n", " \"type\": \"string\"\n", " }\n", " }\n", "}\n", "\n", "for index, row in bq_dataframe.iterrows():\n", " menu_id = row[\"menu_id\"]\n", " print(f\"menu id: {menu_id}\")\n", " prompt = row[\"ingredient_prompt\"]\n", "\n", " # Use LLM to generate data\n", " llm_response = GeminiLLM(prompt, response_schema=response_schema, temperature=.5)\n", "\n", " # Parse response (we know the JSON since we passed it to our LLM)\n", " llm_json_response = json.loads(llm_response)\n", " print(json.dumps(llm_json_response, indent=2))\n", " ingredient_information = llm_json_response[\"ingredient_information\"]\n", " ingredient_explanation = llm_json_response[\"ingredient_explanation\"]\n", "\n", " # Update BigQuery\n", " sql = f'''UPDATE `chocolate_ai.generated_ingredients`\n", " SET ingredient_information = \"\"\"{ingredient_information}\"\"\",\n", " ingredient_explanation = \"\"\"{ingredient_explanation}\"\"\"\n", " WHERE menu_id = {menu_id}'''\n", "\n", " print(f\"sql: {sql}\")\n", " RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "x1J7dxMXKN0T" }, "outputs": [], "source": [ "%%bigquery\n", "\n", 
"----------------------------------------------------------------------------------------------------------\n", "-- See our results from Gemini\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_ingredients` WHERE ingredient_information IS NOT NULL ORDER BY menu_id;" ] }, { "cell_type": "markdown", "metadata": { "id": "0W3fdsnvipL8" }, "source": [ "#### Create allergy information based upon the menu ingredients" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IYXk6KtfWLLt" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a prompt that will take the ingredients and extract allergy information.\n", "-- This is done to show artifical data generation and in real life you would use the actual ingredients.\n", "----------------------------------------------------------------------------------------------------------\n", "UPDATE `chocolate_ai.generated_ingredients`\n", " SET allergy_prompt = CONCAT(\"\"\"Role: You are an expert food scientist specializing in chocolate and allergen identification. You possess an encyclopedic knowledge of ingredients, including their common names, scientific names, and derived components. You are meticulous and never miss a potential allergen.\n", "\n", "Objective: Analyze the provided chocolate ingredients and identify *ALL* potential allergens based on a comprehensive allergy list. This includes direct matches, semantic matches (synonyms and related terms), and *implicit* allergens based on common chocolate-making practices. Assume nothing is safe unless proven otherwise.\n", "\n", "Output: Generate a JSON object with a single field: `allergy_information`. .\n", "\n", "Output: The value of this field should be a comma-separated string containing all identified allergens from the provided list. 
If no allergens are found, the value should be an empty string (\"\")\n", "\n", "Input Data:\n", "\n", "<allergy_list>\n", "{\n", " \"allergy\": [\n", " \"Milk\", \"Dairy\", \"Lactose\", \"Casein\", \"Whey\",\n", " \"Soy\", \"Soybean Oil\", \"Lecithin\",\n", " \"Tree Nuts\", \"Almonds\", \"Hazelnuts\", \"Walnuts\", \"Pecans\", \"Cashews\", \"Pistachios\", \"Macadamia Nuts\", \"Brazil Nuts\",\n", " \"Peanuts\",\n", " \"Wheat\", \"Gluten\", \"Oats\", \"Barley\", \"Rye\",\n", " \"Eggs\",\n", " \"Sesame\",\n", " \"Coconut\",\n", " \"Sunflower Seeds\", \"Sunflower Oil\",\n", " \"Mustard\",\n", " \"Sulfites\",\n", " \"Corn\", \"High Fructose Corn Syrup\", \"Dextrose\", \"Maltodextrin\", \"Corn Starch\",\n", " \"Food Dyes\", \"Red 40\", \"Yellow 5\", \"Blue 1\",\n", " \"Chocolate\", \"Theobromine\", \"Cocoa Mass\", \"Cocoa Butter\",\n", " \"Cinnamon\",\n", " \"Vanilla\",\n", " \"Artificial Sweeteners\", \"Aspartame\", \"Sucralose\", \"Saccharin\",\n", " \"Benzoates\", \"Sodium Benzoate\",\n", " \"Sorbates\", \"Potassium Sorbate\",\n", " \"Rice\",\n", " \"Gelatin\",\n", " \"Honey\", \"Royal Jelly\", \"Propolis\", \"Bee Pollen\",\n", " \"Shellac\",\n", " \"Palm Oil\",\n", " \"Canola Oil\",\n", " \"Citric Acid\",\n", " \"MSG (Monosodium Glutamate)\",\n", " \"Xanthan Gum\",\n", " \"Guar Gum\",\n", " \"Carrageenan\",\n", " \"Inulin\",\n", " \"Fructose\", \"Galactose\",\n", " \"Mannitol\", \"Sorbitol\", \"Xylitol\", \"Erythritol\",\n", " \"Isomalt\",\n", " \"Stevia\",\n", " \"Monk Fruit Extract\",\n", " \"Agave\",\n", " \"Tapioca\",\n", " \"Potato Starch\",\n", " \"Modified Food Starch\",\n", " \"Dextrin\",\n", " \"Cellulose\",\n", " \"Pectin\",\n", " \"Agar-Agar\",\n", " \"Locust Bean Gum\",\n", " \"Tara Gum\",\n", " \"Acacia Gum\",\n", " \"Tragacanth Gum\",\n", " \"Karaya Gum\",\n", " \"Gellan Gum\",\n", " \"Konjac Gum\",\n", " \"Rennet\",\n", " \"Annatto\",\n", " \"Carmine\", \"Cochineal Extract\",\n", " \"Beetroot Red\",\n", " \"Turmeric\",\n", " \"Saffron\",\n", " \"Paprika\",\n", " \"Spirulina\",\n", " \"Chlorella\",\n", " \"Algae\",\n", " \"Quinoa\",\n", " \"Amaranth\",\n", " \"Buckwheat\",\n", " \"Chia Seeds\",\n", " \"Flax Seeds\",\n", " \"Hemp Seeds\",\n", " \"Safflower Oil\",\n", " \"Grape Seed Oil\",\n", " \"Avocado Oil\",\n", " \"Olive Oil\",\n", " \"Coffee\",\n", " \"Tea\",\n", " \"Herbs\", \"Mint\", \"Lavender\", \"Rosemary\",\n", " \"Spices\", \"Clove\", \"Nutmeg\", \"Ginger\",\n", " \"Fruits\", \"Strawberries\", \"Raspberries\", \"Blackberries\", \"Blueberries\",\n", " \"Citrus Fruits\", \"Orange\", \"Lemon\", \"Lime\", \"Grapefruit\",\n", " \"Nightshades\", \"Tomato\", \"Potato\", \"Peppers\", \"Eggplant\",\n", " \"Legumes\", \"Beans\", \"Lentils\", \"Peas\",\n", " \"Seeds\", \"Poppy Seeds\", \"Pumpkin Seeds\", \"Squash Seeds\"\n", " ]\n", " }\n", "</allergy_list>\n", "<ingredient_information>\"\"\",\n", "ingredient_information,\n", "\"</ingredient_information>\",\n", "\"<ingredient_explanation>\",\n", "ingredient_explanation,\n", "\"</ingredient_explanation\")\n", "WHERE allergy_prompt IS NULL;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5FrGL8dEXpyE" }, "outputs": [], "source": [ "%%bigquery bq_dataframe\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Place the prompt for allergies into a dataframe so we can loop over in Python code\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM 
`chocolate_ai.generated_ingredients` WHERE ingredient_information IS NOT NULL AND allergy_information IS NULL;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Fi4WHJYfXrxJ" }, "outputs": [], "source": [ "##########################################################################################################\n", "# Run each allergy prompt and update the table\n", "##########################################################################################################\n", "\n", "# Write me the json in  OpenAPI 3.0 schema object for the below object.\n", "# Make all fields required.\n", "# {\n", "# \"allergy_information\" : \"text\",\n", "# \"allergy_explanation\" : \"text\"\n", "# }\n", "\n", "response_schema = {\n", " \"type\": \"object\",\n", " \"required\": [\n", " \"allergy_information\",\n", " \"allergy_explanation\"\n", " ],\n", " \"properties\": {\n", " \"allergy_information\": {\n", " \"type\": \"string\"\n", " },\n", " \"allergy_explanation\": {\n", " \"type\": \"string\"\n", " }\n", " }\n", "}\n", "\n", "for index, row in bq_dataframe.iterrows():\n", " menu_id = row[\"menu_id\"]\n", " print(f\"menu id: {menu_id}\")\n", " prompt = row[\"allergy_prompt\"]\n", "\n", " # Use LLM to generate data\n", " llm_response = GeminiLLM(prompt, response_schema=response_schema, temperature=.5)\n", "\n", " # Parse response (we know the JSON since we passed it to our LLM)\n", " llm_json_response = json.loads(llm_response)\n", " print(json.dumps(llm_json_response, indent=2))\n", " allergy_information = llm_json_response[\"allergy_information\"]\n", " allergy_explanation = llm_json_response[\"allergy_explanation\"]\n", "\n", " # Update BigQuery\n", " sql = f'''UPDATE `chocolate_ai.generated_ingredients`\n", " SET allergy_information = \"\"\"{allergy_information}\"\"\",\n", " allergy_explanation = \"\"\"{allergy_explanation}\"\"\"\n", " WHERE menu_id = {menu_id}'''\n", "\n", " print(f\"sql: {sql}\")\n", " RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V76EzFwjaS8c" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View our completed \"generated ingredients\" table\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_ingredients` WHERE allergy_information IS NOT NULL ORDER BY menu_id;" ] }, { "cell_type": "markdown", "metadata": { "id": "PFwdYJFcivIR" }, "source": [ "#### Create customer allergies as complex sentences" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JHLo4LutjAIK" }, "outputs": [], "source": [ "%%bigquery\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a prompt that will take the ingredients and extract allergy information.\n", "-- This is done to show artifical data generation and in real life you would use the actual ingredients.\n", "----------------------------------------------------------------------------------------------------------\n", "UPDATE `chocolate_ai.generated_customer_allergy_information`\n", " SET allergy_information_prompt = CONCAT(\"\"\"Role: You are a customer and need to set your allergy information so you do not purchase items you are allergic to.\n", " You are shopping at a chocolate shop that sells handmade specialty chocolates and serves various coffees.\n", " You are an 
expert food scientist specializing in chocolate and allergen identification.\n", " You possess an encyclopedic knowledge of ingredients, including their common names, scientific names, and derived components.\n", "\n", "Objective: Generate 2 to 4 sentences describing your allergies. This is for a demo and we want a large variety of words with similar meaning. This sentence will be processed\n", "by an LLM to extract keywords, removing all the noise. The below allergy list are the allergies we have identified in our chocolate, so you can use this for reference as\n", "as well as make up other allergies which will cause a non-match (which is good.)\n", "\n", "Example: Severe nut allergy, especially hazelnuts and almonds. Also, avoid any chocolate containing traces of soy lecithin. I have had anaphylactic reactions in the past. Please\n", "ensure all preparation surfaces are thoroughly cleaned. I also have a sensitivity to artificial sweeteners, particularly aspartame. I would prefer organic chocolate if possible.\n", "\n", "Output: Generate a JSON object with a single field: `allergy_information`.\n", "\n", "Output: The value of this field should be English sentences. If no allergens are generated, the value should be an empty string (\"\")\n", "\n", "Input Data:\n", "<allergy_list>\n", "{\n", " \"allergy\": [\n", " \"Milk\", \"Dairy\", \"Lactose\", \"Casein\", \"Whey\",\n", " \"Soy\", \"Soybean Oil\", \"Lecithin\",\n", " \"Tree Nuts\", \"Almonds\", \"Hazelnuts\", \"Walnuts\", \"Pecans\", \"Cashews\", \"Pistachios\", \"Macadamia Nuts\", \"Brazil Nuts\",\n", " \"Peanuts\",\n", " \"Wheat\", \"Gluten\", \"Oats\", \"Barley\", \"Rye\",\n", " \"Eggs\",\n", " \"Sesame\",\n", " \"Coconut\",\n", " \"Sunflower Seeds\", \"Sunflower Oil\",\n", " \"Mustard\",\n", " \"Sulfites\",\n", " \"Corn\", \"High Fructose Corn Syrup\", \"Dextrose\", \"Maltodextrin\", \"Corn Starch\",\n", " \"Food Dyes\", \"Red 40\", \"Yellow 5\", \"Blue 1\",\n", " \"Chocolate\", \"Theobromine\", \"Cocoa Mass\", \"Cocoa Butter\",\n", " \"Cinnamon\",\n", " \"Vanilla\",\n", " \"Artificial Sweeteners\", \"Aspartame\", \"Sucralose\", \"Saccharin\",\n", " \"Benzoates\", \"Sodium Benzoate\",\n", " \"Sorbates\", \"Potassium Sorbate\",\n", " \"Rice\",\n", " \"Gelatin\",\n", " \"Honey\", \"Royal Jelly\", \"Propolis\", \"Bee Pollen\",\n", " \"Shellac\",\n", " \"Palm Oil\",\n", " \"Canola Oil\",\n", " \"Citric Acid\",\n", " \"MSG (Monosodium Glutamate)\",\n", " \"Xanthan Gum\",\n", " \"Guar Gum\",\n", " \"Carrageenan\",\n", " \"Inulin\",\n", " \"Fructose\", \"Galactose\",\n", " \"Mannitol\", \"Sorbitol\", \"Xylitol\", \"Erythritol\",\n", " \"Isomalt\",\n", " \"Stevia\",\n", " \"Monk Fruit Extract\",\n", " \"Agave\",\n", " \"Tapioca\",\n", " \"Potato Starch\",\n", " \"Modified Food Starch\",\n", " \"Dextrin\",\n", " \"Cellulose\",\n", " \"Pectin\",\n", " \"Agar-Agar\",\n", " \"Locust Bean Gum\",\n", " \"Tara Gum\",\n", " \"Acacia Gum\",\n", " \"Tragacanth Gum\",\n", " \"Karaya Gum\",\n", " \"Gellan Gum\",\n", " \"Konjac Gum\",\n", " \"Rennet\",\n", " \"Annatto\",\n", " \"Carmine\", \"Cochineal Extract\",\n", " \"Beetroot Red\",\n", " \"Turmeric\",\n", " \"Saffron\",\n", " \"Paprika\",\n", " \"Spirulina\",\n", " \"Chlorella\",\n", " \"Algae\",\n", " \"Quinoa\",\n", " \"Amaranth\",\n", " \"Buckwheat\",\n", " \"Chia Seeds\",\n", " \"Flax Seeds\",\n", " \"Hemp Seeds\",\n", " \"Safflower Oil\",\n", " \"Grape Seed Oil\",\n", " \"Avocado Oil\",\n", " \"Olive Oil\",\n", " \"Coffee\",\n", " \"Tea\",\n", " \"Herbs\", \"Mint\", \"Lavender\", 
\"Rosemary\",\n", " \"Spices\", \"Clove\", \"Nutmeg\", \"Ginger\",\n", " \"Fruits\", \"Strawberries\", \"Raspberries\", \"Blackberries\", \"Blueberries\",\n", " \"Citrus Fruits\", \"Orange\", \"Lemon\", \"Lime\", \"Grapefruit\",\n", " \"Nightshades\", \"Tomato\", \"Potato\", \"Peppers\", \"Eggplant\",\n", " \"Legumes\", \"Beans\", \"Lentils\", \"Peas\",\n", " \"Seeds\", \"Poppy Seeds\", \"Pumpkin Seeds\", \"Squash Seeds\"\n", " ]\n", " }\n", "</allergy_list>\"\"\")\n", "WHERE allergy_information_prompt IS NULL;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mABDNbVblOGz" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the generated prompts\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_customer_allergy_information` WHERE allergy_information_prompt IS NOT NULL;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zUDNy1PLjAIL" }, "outputs": [], "source": [ "%%bigquery bq_dataframe\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Place the prompt for allergies into a dataframe so we can loop over in Python code\n", "-- NOTE: Change the LIMIT of 11000 if you want to test just a few rows\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT *\n", " FROM `chocolate_ai.generated_customer_allergy_information`\n", " WHERE allergy_information_prompt IS NOT NULL\n", " AND allergy_information IS NULL\n", " ORDER BY customer_id LIMIT 11000;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OGPfC63pjAIM" }, "outputs": [], "source": [ "##########################################################################################################\n", "# Run each customer allergy prompt and update the table\n", "##########################################################################################################\n", "\n", "# Write me the json in  OpenAPI 3.0 schema object for the below object.\n", "# Make all fields required.\n", "# {\n", "# \"allergy_information\" : \"text\",\n", "# \"allergy_information_explanation\" : \"text\"\n", "# }\n", "\n", "response_schema = {\n", " \"type\": \"object\",\n", " \"required\": [\n", " \"allergy_information\",\n", " \"allergy_information_explanation\"\n", " ],\n", " \"properties\": {\n", " \"allergy_information\": {\n", " \"type\": \"string\"\n", " },\n", " \"allergy_information_explanation\": {\n", " \"type\": \"string\"\n", " }\n", " }\n", "}\n", "\n", "for index, row in bq_dataframe.iterrows():\n", " customer_id = row[\"customer_id\"]\n", " print(f\"customer_id id: {customer_id}\")\n", " prompt = row[\"allergy_information_prompt\"]\n", "\n", " # Use LLM to generate data\n", " success = False\n", " while success == False:\n", " llm_response = GeminiLLM(prompt, response_schema=response_schema, temperature=.5)\n", "\n", " # Parse response (we know the JSON since we passed it to our LLM)\n", " llm_json_response = json.loads(llm_response)\n", " #print(json.dumps(llm_json_response, indent=2))\n", " allergy_information = llm_json_response[\"allergy_information\"]\n", " allergy_information_explanation = llm_json_response[\"allergy_information_explanation\"]\n", "\n", " # Update BigQuery\n", " sql = f'''UPDATE 
`chocolate_ai.generated_customer_allergy_information`\n", " SET allergy_information = \"\"\"{allergy_information}\"\"\",\n", " allergy_information_explanation = \"\"\"{allergy_information_explanation}\"\"\"\n", " WHERE customer_id = {customer_id}'''\n", "\n", " #print(f\"sql: {sql}\")\n", " try:\n", " RunQuery(sql)\n", " success = True\n", " except Exception:\n", " print(\"Retrying\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BaBQdNtXjAIM" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View our completed \"generated customer allergy information\" table\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.generated_customer_allergy_information` WHERE allergy_information IS NOT NULL ORDER BY customer_id DESC LIMIT 25;" ] }, { "cell_type": "markdown", "metadata": { "id": "VVm3b-A8udrc" }, "source": [ "### <font color='#4285f4'>BigQuery Tables (Final tables from the Generated Data tables)</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "Z6gpn1p7uT0k" }, "source": [ "#### Create Final Tables from the Generated Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pDRRA11duW9a" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create the \"Oracle tables\" in this BigQuery dataset\n", "-- These would be copied to Oracle (by hand)\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.oracle_customer` AS\n", "SELECT *\n", " FROM `chocolate_ai.generated_customer_allergy_information`;\n", "\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.oracle_ingredients` AS\n", "SELECT *\n", " FROM `chocolate_ai.generated_ingredients`;\n", "\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.oracle_inventory` AS\n", "SELECT *\n", " FROM `chocolate_ai.generated_inventory`;" ] }, { "cell_type": "markdown", "metadata": { "id": "Y7AoA-ZcqRVS" }, "source": [ "### <font color='#4285f4'>Create Embeddings</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "ngb3fVz1EFq5" }, "source": [ "#### Create the text embedding model so we can call text-embedding-005 directly from SQL" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7CNysjgCqTIV" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Add a reference to the text-embedding-005 model from Vertex AI\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE MODEL IF NOT EXISTS `chocolate_ai.textembedding_model`\n", " REMOTE WITH CONNECTION `us.vertex-ai`\n", " OPTIONS (endpoint = 'text-embedding-005');" ] }, { "cell_type": "markdown", "metadata": { "id": "ZJJuv9QkERY_" }, "source": [ "#### Create the Menu Embedding **Allergy** table which will hold each menu item and associated allergy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HFEx2A1iqYCs" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a new table for the ingredients allergy (search) text embeddings\n", "-- Split each allergy information string on the comma and embed each value (do not embed the whole string)\n", "-- We want to match each allergy, not the entire allergy string, so we split and embed each one separately.\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.menu_embeddings_allergy` AS\n", "WITH split_allergies AS\n", "(\n", " SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " SPLIT(LOWER(allergy_information), ',') AS allergy_array\n", " FROM `chocolate_ai.generated_ingredients`\n", "),\n", "allergies AS\n", "(\n", " SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " allergy\n", " FROM split_allergies\n", " JOIN UNNEST(allergy_array) AS allergy\n", ")\n", "SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " allergy,\n", " ml_generate_embedding_result AS menu_allergy_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " TRIM(LOWER(allergy)) AS allergy,\n", " TRIM(LOWER(allergy)) AS content -- make lowercase\n", " FROM allergies),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality));" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1XbgCOwisk8x" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the count. Each allergy is embedded separately.\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT COUNT(*) FROM `chocolate_ai.menu_embeddings_allergy`;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Bx4PFP9HsuAa" }, "outputs": [], "source": [ "%%bigquery\n", "SELECT * FROM `chocolate_ai.menu_embeddings_allergy` ORDER BY menu_id LIMIT 25;" ] }, { "cell_type": "markdown", "metadata": { "id": "528UBJdqEbP-" }, "source": [ "#### Create the Menu Embedding **Name** table which will hold each menu item and associated name" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZxMJHad3qiil" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a new table for the menu name (search) text embeddings\n", "-- Embed the menu name and the menu description separately\n", "----------------------------------------------------------------------------------------------------------\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.menu_embeddings_name` AS\n", "SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " ml_generate_embedding_result AS menu_name_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " LOWER(menu_name) AS content -- make lowercase\n", " FROM `chocolate_ai.menu`),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality));" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "plwYgRcMtw0X" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the menu embedding table\n", 
"----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.menu_embeddings_name` ORDER BY menu_id LIMIT 5;" ] }, { "cell_type": "markdown", "metadata": { "id": "OHkHiRwvEq8o" }, "source": [ "#### Create the Menu Embedding **Description** table which will hold each menu item and associated description" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T3aoduL1tyu-" }, "outputs": [], "source": [ "%%bigquery\n", "----------------------------------------------------------------------------------------------------------\n", "-- Create a new table for the menu description (search) text embeddings\n", "-- Embedding the title and seperately embed the description\n", "----------------------------------------------------------------------------------------------------------\n", "\n", "CREATE TABLE IF NOT EXISTS `chocolate_ai.menu_embeddings_description` AS\n", "SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " ml_generate_embedding_result AS menu_description_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT menu_id,\n", " menu_name,\n", " menu_description,\n", " LOWER(menu_description) AS content -- make lowercase\n", " FROM `chocolate_ai.menu`),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality));" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dmxFUamYt6NY" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- View the menu description embeddings.\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT * FROM `chocolate_ai.menu_embeddings_description` ORDER BY menu_id LIMIT 5;" ] }, { "cell_type": "markdown", "metadata": { "id": "67hGed9Eua-j" }, "source": [ "### <font color='#4285f4'>Search Embeddings</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "Q84uDIY2E4Aj" }, "source": [ "#### Basic search for a single term" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1w6MKXOsunnp" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- We can do simple searchs like \"milk\" or \"lactose\"\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " COUNT(base.menu_id) AS menu_count,\n", " --base.menu_name\n", " --base.menu_description,\n", " base.allergy,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_allergy` ,\n", " 'menu_allergy_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('Lactose') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10000)\n", " WHERE distance < .67\n", "GROUP BY ALL\n", "ORDER BY distance;" ] }, { "cell_type": "markdown", "metadata": { "id": "sTP0EgCEE8Fn" }, "source": [ "#### Basic search for a compound term" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "E-53DGPly-zA" }, 
"outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- We can do semi-simple search like \"lactose intolerance\"\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " COUNT(base.menu_id) AS menu_count,\n", " --base.menu_name\n", " --base.menu_description,\n", " --TRIM(base.allergy) as allergy,\n", " base.allergy,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_allergy` ,\n", " 'menu_allergy_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('lactose intolerance') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10000)\n", " WHERE distance < .76\n", "GROUP BY ALL\n", "ORDER BY distance;" ] }, { "cell_type": "markdown", "metadata": { "id": "iAO92XC_E-17" }, "source": [ "#### Searching a complex description of a customers allergies" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "i2s6kavwQd3V" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Show: Complex allergy information does not return anything.\n", "--\n", "-- The embeddings does not match a complex set of sentances:\n", "-- Severe nut allergy, especially hazelnuts and almonds.\n", "-- Also, avoid any chocolate containing traces of soy lecithin.\n", "-- I have had anaphylactic reactions in the past.\n", "-- Please ensure all preparation surfaces are thoroughly cleaned.\n", "-- I also have a sensitivity to artificial sweeteners, particularly aspartame.\n", "-- I would prefer organic chocolate if possible.\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " COUNT(base.menu_id) AS menu_count,\n", " --base.menu_name\n", " --base.menu_description,\n", " --TRIM(base.allergy) as allergy,\n", " base.allergy,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_allergy` ,\n", " 'menu_allergy_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('Severe nut allergy, especially hazelnuts and almonds. Also, avoid any chocolate containing traces of soy lecithin. I have had anaphylactic reactions in the past. Please ensure all preparation surfaces are thoroughly cleaned. I also have a sensitivity to artificial sweeteners, particularly aspartame. 
I would prefer organic chocolate if possible.') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10000)\n", " WHERE distance < .76\n", "GROUP BY ALL\n", "ORDER BY distance;" ] }, { "cell_type": "markdown", "metadata": { "id": "cn6inNHOFFN_" }, "source": [ "#### Breaking apart a complex description of a customer's allergies into individual allergy items" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UoZ8_Cn5x2zv" }, "outputs": [], "source": [ "%%bigquery customer_df\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Simulate an Oracle query by getting customer allergy information\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT *\n", " FROM `chocolate_ai.oracle_customer`\n", "WHERE customer_id = 1;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jJsao1Sl-Kjy" }, "outputs": [], "source": [ "##########################################################################################################\n", "## Extract just the customer food allergies using Gemini\n", "## In production you should double-check these results.\n", "## Use Gemini Pro \"gemini-2.0-pro-exp-02-05\" (not Flash), since we want deeper reasoning on this task.\n", "##########################################################################################################\n", "\n", "customer_allergy_information = customer_df.iloc[0][\"allergy_information\"]\n", "print(f\"customer_allergy_information: {customer_allergy_information}\")\n", "customer_allergy_information = customer_allergy_information.lower()\n", "\n", "# Write me the json in  OpenAPI 3.0 schema object for the below object.\n", "# Make all fields required.\n", "# {\n", "# \"allergies\" : \"text\",\n", "# }\n", "\n", "response_schema = {\n", " \"type\": \"object\",\n", " \"required\": [\n", " \"allergies\"\n", " ],\n", " \"properties\": {\n", " \"allergies\": {\n", " \"type\": \"string\"\n", " }\n", " }\n", "}\n", "\n", "prompt = f\"\"\"You are an expert in allergies and the processing of tokens in sentences.\n", "I need to search vector embeddings for matching food allergies.\n", "I only want items the customer is allergic to. Do not include their preferences in the result list.\n", "This is for a chocolate company, so most items contain chocolate; check twice if the customer said they are allergic to chocolate, otherwise we would exclude the entire menu.\n", "\n", "I need you to create a comma-separated list of words from the below customer preference and follow the below rules.\n", "\n", "Rules:\n", "- Preprocessing and Tokenization (with a Focus on Negation):\n", " - Tokenization: Break the user's sentence into individual words or phrases. This is similar to what we did with the allergy_information, but now we need to be a bit smarter. We can't just split on spaces; we need to handle punctuation and potentially multi-word allergens.\n", " - Stop Word Removal (Carefully!): Remove common words like \"I\", \"do\", \"the\", \"in\", \"my\", etc. However, be extremely careful about removing words related to negation (\"not\", \"no\", \"without\"). These are crucial for understanding the user's intent. 
We'll handle these in the next step.\n", " - Stemming/Lemmatization (optional but helpful): Consider stemming (reducing words to their root form, e.g., \"colors\" -> \"color\") or lemmatization (finding the dictionary form, e.g., \"better\" -> \"good\"). This can help match variations like \"sweetener\" and \"sweeteners\". SpaCy is great for this.\n", "\n", "- Negation Handling:\n", " - This is the most important part of processing this type of query. We need to distinguish between positive (\"I want milk\") and negative (\"I don't want milk\") statements about allergens. Here are several strategies, ordered from simpler to more complex:\n", " - Simple Negation Flag (Recommended for most cases):\n", " - Iterate through the tokenized sentence.\n", " - If you encounter a negation word (\"not\", \"no\", \"without\", \"avoid\", etc.), set a negation_flag to True.\n", " - When you encounter a known allergen, check the negation_flag.\n", " - If negation_flag is True, this allergen is excluded (the user doesn't want it).\n", " - If negation_flag is False, this allergen is included (the user does want it – though this is less common in allergy contexts).\n", " - Reset the negation_flag to False after processing each allergen or at the end of a clause (e.g., after a comma or \"and\").\n", "\n", "<customer_preference>\n", "{customer_allergy_information}\n", "</customer_preference>\n", "\n", "\"\"\"\n", "\n", "# Use LLM to generate data\n", "llm_response = GeminiLLM(prompt, response_schema=response_schema, model=\"gemini-2.0-pro-exp-02-05\", temperature=.5)\n", "\n", "# Parse response (we know the JSON since we passed it to our LLM)\n", "llm_json_response = json.loads(llm_response)\n", "print(json.dumps(llm_json_response, indent=2))\n", "allergies = llm_json_response[\"allergies\"]\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "ul9A1z1fFLMs" }, "source": [ "#### Searching a complex description using Gemini to extract allergy keywords" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mrR_w7yU4unq" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Take the above comma-separated list of allergies and split and embed each value\n", "-- Then search our menu allergy vector embeddings (all at once)\n", "-- We will end up with menu items that the customer should avoid (distance < .80) or we\n", "-- will end up with menu items that are okay (distance >= .80)\n", "----------------------------------------------------------------------------------------------------------\n", "WITH split_allergies AS\n", "(\n", " SELECT SPLIT(LOWER(\"nut, hazelnuts, almonds, soy lecithin, artificial sweeteners, aspartame\"), ',') AS allergy_array\n", "),\n", "allergies AS\n", "(\n", " SELECT DISTINCT allergy\n", " FROM split_allergies\n", " JOIN UNNEST(allergy_array) AS allergy\n", "),\n", "customer_vector_embeddings AS\n", "(\n", "SELECT allergy,\n", " ml_generate_embedding_result AS customer_allergy_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT TRIM(allergy) AS allergy,\n", " TRIM(allergy) AS content\n", " FROM allergies),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality))\n", ")\n", "SELECT query.allergy as customer_allergy,\n", " base.menu_id AS menu_id,\n", " base.menu_name AS menu_name,\n", " base.menu_description AS menu_description,\n", " 
{ "cell_type": "markdown", "metadata": { "id": "ul9A1z1fFLMs" }, "source": [ "#### Searching a complex description using Gemini to extract allergy keywords" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mrR_w7yU4unq" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Take the above comma-separated list of allergies and split and embed each value\n", "-- Then search our menu allergy vector embeddings (all at once)\n", "-- We will end up with menu items that the customer should avoid (distance < .80) or we\n", "-- will end up with menu items that are okay (distance >= .80)\n", "----------------------------------------------------------------------------------------------------------\n", "WITH split_allergies AS\n", "(\n", " SELECT SPLIT(LOWER(\"nut, hazelnuts, almonds, soy lecithin, artificial sweeteners, aspartame\"), ',') AS allergy_array\n", "),\n", "allergies AS\n", "(\n", " SELECT DISTINCT allergy\n", " FROM split_allergies\n", " JOIN UNNEST(allergy_array) AS allergy\n", "),\n", "customer_vector_embeddings AS\n", "(\n", "SELECT allergy,\n", " ml_generate_embedding_result AS customer_allergy_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT TRIM(allergy) AS allergy,\n", " TRIM(allergy) AS content\n", " FROM allergies),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality))\n", ")\n", "SELECT query.allergy as customer_allergy,\n", " base.menu_id AS menu_id,\n", " base.menu_name AS menu_name,\n", " base.menu_description AS menu_description,\n", " base.allergy as menu_allergy,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_allergy` ,\n", " 'menu_allergy_embedding', -- column in table to search\n", " TABLE customer_vector_embeddings, -- source table of multiple embeddings\n", " 'customer_allergy_embedding', -- column in the source table with the customer allergy embeddings\n", " top_k => 1000)\n", " WHERE distance < .80\n", "ORDER BY distance;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4ONcp8RE_yBY" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Same query as above, but just with distinct results\n", "----------------------------------------------------------------------------------------------------------\n", "WITH split_allergies AS\n", "(\n", " SELECT SPLIT(LOWER(\"hazelnuts, almonds, soy lecithin, artificial sweeteners, aspartame\"), ',') AS allergy_array\n", "),\n", "allergies AS\n", "(\n", " SELECT DISTINCT allergy\n", " FROM split_allergies\n", " JOIN UNNEST(allergy_array) AS allergy\n", "),\n", "customer_vector_embeddings AS\n", "(\n", "SELECT allergy,\n", " ml_generate_embedding_result AS customer_allergy_embedding\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT TRIM(allergy) AS allergy,\n", " TRIM(allergy) AS content\n", " FROM allergies),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality))\n", ")\n", "SELECT DISTINCT query.allergy as customer_allergy,\n", " --base.menu_id AS menu_id,\n", " --base.menu_name AS menu_name,\n", " --base.menu_description AS menu_description,\n", " base.allergy as menu_allergy,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_allergy` ,\n", " 'menu_allergy_embedding', -- column in table to search\n", " TABLE customer_vector_embeddings, -- source table of multiple embeddings\n", " 'customer_allergy_embedding', -- column in the source table with the customer allergy embeddings\n", " top_k => 1000)\n", " WHERE distance < .65\n", "ORDER BY distance;" ] },
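{ "cell_type": "markdown", "metadata": { "id": "dStThrMd0003" }, "source": [ "The two queries above classify matches by vector distance. The optional cell below is a small, self-contained sketch of how an application might turn that result into a set of menu_ids to hide from the customer; it uses a tiny hypothetical dataframe shaped like (a subset of) the first VECTOR_SEARCH result rather than re-running the query." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dStThrCd0004" }, "outputs": [], "source": [ "##########################################################################################################\n", "## OPTIONAL: sketch of applying the distance threshold from the queries above in application code.\n", "## The dataframe below contains hypothetical rows; in practice you would capture the real query\n", "## result with \"%%bigquery allergy_matches_df\" and reuse the same filtering logic.\n", "##########################################################################################################\n", "\n", "import pandas as pd\n", "\n", "# Hypothetical rows shaped like the VECTOR_SEARCH output (customer_allergy, menu_id, menu_name, distance)\n", "allergy_matches_df = pd.DataFrame(\n", "    {\n", "        \"customer_allergy\": [\"hazelnuts\", \"hazelnuts\", \"aspartame\"],\n", "        \"menu_id\": [1, 2, 3],\n", "        \"menu_name\": [\"menu item 1\", \"menu item 2\", \"menu item 3\"],\n", "        \"distance\": [0.41, 0.62, 0.78],\n", "    }\n", ")\n", "\n", "# distance < .80 means the menu item matched one of the customer's allergies and should be avoided\n", "menu_ids_to_avoid = set(allergy_matches_df.loc[allergy_matches_df[\"distance\"] < 0.80, \"menu_id\"])\n", "print(f\"menu_ids_to_avoid: {menu_ids_to_avoid}\")" ] },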
{ "cell_type": "markdown", "metadata": { "id": "eqkxiVnAQ5a2" }, "source": [ "### <font color='#4285f4'>Menu (Product) Search with Re-Ranking</font>" ] }, { "cell_type": "markdown", "metadata": { "id": "ufgk9jnLhpxd" }, "source": [ "#### Basic search for \"chocolate truffles\" in Menu **Name** and Menu **Description**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DFACuWTtRa5Z" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- We can do simple searches like \"chocolate truffles\"\n", "-- This is ONLY searching the menu name\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_name` ,\n", " 'menu_name_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", " --WHERE distance < .67\n", "ORDER BY distance;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LvXjFQgqReyM" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- We can do simple searches like \"chocolate truffles\"\n", "-- This is ONLY searching the menu description\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_description` ,\n", " 'menu_description_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", "ORDER BY distance;" ] }, { "cell_type": "markdown", "metadata": { "id": "5Dg1W8IIh0-N" }, "source": [ "#### Combine our Menu **Name** and Menu **Description** search results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uP-b4f0VRevn" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- UNION the menu name and menu description searches into one result\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_name` ,\n", " 'menu_name_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", "UNION ALL\n", "SELECT query.query AS search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_description` ,\n", " 'menu_description_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", "ORDER BY distance;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PMQjQWCvResv" }, "outputs": [], "source": [ "%%bigquery ranking_dataset\n", "\n", "----------------------------------------------------------------------------------------------------------\n", "-- Place the UNION of search results into a dataframe, so we can use it in Python code\n", "----------------------------------------------------------------------------------------------------------\n", "SELECT query.query AS
search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance,\n", " 'menu_name' AS source,\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_name` ,\n", " 'menu_name_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", "UNION ALL\n", "SELECT query.query AS search_string,\n", " base.menu_id,\n", " base.menu_name,\n", " base.menu_description,\n", " distance,\n", " 'menu_description' AS source,\n", " FROM VECTOR_SEARCH(TABLE `chocolate_ai.menu_embeddings_description` ,\n", " 'menu_description_embedding', -- column in table to search\n", " (SELECT ml_generate_embedding_result,\n", " content AS query\n", " FROM ML.GENERATE_EMBEDDING(MODEL `chocolate_ai.textembedding_model`,\n", " (SELECT LOWER('chocolate truffles') AS content),\n", " STRUCT(TRUE AS flatten_json_output,\n", " 'SEMANTIC_SIMILARITY' as task_type,\n", " 768 AS output_dimensionality) -- struct\n", " )),\n", " top_k => 10)\n", "ORDER BY distance;" ] }, { "cell_type": "markdown", "metadata": { "id": "4gg7289Gh7nH" }, "source": [ "#### **Rank** our search results across our two separate search results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AojI3aDqRhRY" }, "outputs": [], "source": [ "##########################################################################################################\n", "## Build up a list of our query results (id and content)\n", "## The id must be unique, so we concatenate the source (menu name or menu description) in case we get the same menu id from both queries.\n", "##########################################################################################################\n", "\n", "record_list = []\n", "for index, row in ranking_dataset.iterrows():\n", "    menu_id = row[\"menu_id\"]\n", "    source = row[\"source\"]\n", "    menu_name = row[\"menu_name\"].lower()\n", "    menu_description = row[\"menu_description\"].lower()\n", "\n", "    if source == \"menu_name\":\n", "        content = f\"{menu_name}\"\n", "    else:\n", "        content = f\"{menu_description}\"\n", "\n", "    record = {\n", "        \"id\": f\"{source}-{menu_id}\",\n", "        \"content\": content\n", "    }\n", "    record_list.append(record)\n", "\n", "print(f\"record_list: {record_list}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "M9ZdBm0hRmjv" }, "outputs": [], "source": [ "##########################################################################################################\n", "## Now we want to rank our menu name and menu description search results.\n", "## We might have menu names that are an exact match and menu descriptions that are an exact match.\n", "## We basically want to interweave our results into a single list with the most relevant items at the top.\n", "## So we can end up with menu names and menu descriptions as adjacent items.\n", "##########################################################################################################\n", "query = \"Which of the below items best match 'chocolate truffles'?\"\n", "\n", "json_result = vertex_ai_reranking(project_id, query, record_list)\n", "\n", "# Print the results (pretty print)\n", "json_formatted_str = json.dumps(json_result, indent=2)\n",
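"\n", "# Hedged sketch (optional): fold the reranked order back onto the ranking_dataset dataframe.\n", "# This assumes each reranked record still carries the \"id\" we built above (\"{source}-{menu_id}\") and that\n", "# json_result is either a list of records or a dict with a \"records\" list; adjust to the real response shape.\n", "reranked_records = json_result[\"records\"] if isinstance(json_result, dict) else json_result\n", "rank_by_id = {record[\"id\"]: position for position, record in enumerate(reranked_records, start=1)}\n", "ranking_dataset[\"rerank_position\"] = (ranking_dataset[\"source\"] + \"-\" + ranking_dataset[\"menu_id\"].astype(str)).map(rank_by_id)\n", "print(ranking_dataset.sort_values(\"rerank_position\").head(10))\n", "\n",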
"print(json_formatted_str)" ] } ], "metadata": { "colab": { "collapsed_sections": [ "S4eg4Ohk_87K", "MlFEqJmbAAFj", "q8_XhJi8AElm", "1_yr2zSVAILi", "U9W0pz36AUg1", "NeiSib02AW29", "HnBwJylyAbSo", "QtCLU0quRFye", "HSVBUNPdA7rY", "rTWhutEjBAza", "-PwEXyN_ieG-", "0W3fdsnvipL8", "VVm3b-A8udrc", "Z6gpn1p7uT0k", "Y7AoA-ZcqRVS", "ngb3fVz1EFq5", "ZJJuv9QkERY_", "528UBJdqEbP-", "OHkHiRwvEq8o", "Q84uDIY2E4Aj", "sTP0EgCEE8Fn", "iAO92XC_E-17", "ul9A1z1fFLMs", "ufgk9jnLhpxd", "5Dg1W8IIh0-N", "4gg7289Gh7nH" ], "name": "Oracle-Data-Generation", "private_outputs": true, "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }