colab-enterprise/synthetic_data_generation_using_llms.ipynb (2,456 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "clfviNVNXpqj" }, "source": [ "# <img src=\"https://lh3.googleusercontent.com/mUTbNK32c_DTSNrhqETT5aQJYFKok2HB1G2nk2MZHvG5bSs0v_lmDm_ArW7rgd6SDGHXo0Ak2uFFU96X6Xd0GQ=w160-h128\" width=\"45\" valign=\"top\" alt=\"BigQuery\"> Synthetic Data Generation Using LLMs" ] }, { "cell_type": "markdown", "metadata": { "id": "DszuLZoo9A7k" }, "source": [ "## Initialize Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bhKxJadjWa1R" }, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import bigframes.pandas as bf\n", "#from bigframesllm import BigFramesLLM\n", "from bigframes.ml.llm import GeminiTextGenerator" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "MSC6-rboip3h" }, "outputs": [], "source": [ "from google.cloud import bigquery\n", "client = bigquery.Client()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lpU1P_fAXviJ" }, "outputs": [], "source": [ "PROJECT_ID = \"data-analytics-preview\"\n", "REGION = \"us\"\n", "DATASET_ID = \"coffee_curated\"\n", "CONNECTION_NAME = \"vertex-ai\"\n", "\n", "connection = f\"{PROJECT_ID}.{REGION}.{CONNECTION_NAME}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qMijiOqUZmAp" }, "outputs": [], "source": [ "# bf.reset_session() # if you need to change the region\n", "bf.options.bigquery.project = PROJECT_ID\n", "bf.options.bigquery.location = REGION" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zy7lRW09Ws1J" }, "outputs": [], "source": [ "session = bf.get_global_session()\n", "\n", "llm_model = GeminiTextGenerator(session=session, connection_name=connection)" ] }, { "cell_type": "markdown", "metadata": { "id": "6YeExbVqf4ZE" }, "source": [ "## Supporting Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QtSLYNa_exfc" }, "outputs": [], "source": [ "def PrettyPrintJson(json_string):\n", " 
json_object = json.loads(json_string)\n", " json_formatted_str = json.dumps(json_object, indent=2)\n", " print(json_formatted_str)\n", " return json.dumps(json_object)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sthZI1h2fe8H" }, "outputs": [], "source": [ "def LLM(prompt, isOutputJson, max_output_tokens=1024, temperature=0, top_p=0, top_k=1):\n", " print()\n", " print(\"Prompt: \", prompt)\n", " print()\n", " df_prompt = pd.DataFrame(\n", " {\n", " \"prompt\": [prompt],\n", " })\n", " bf_df_prompt = bf.read_pandas(df_prompt)\n", " prediction = llm_model.predict(bf_df_prompt,\n", " max_output_tokens=max_output_tokens,\n", " temperature=temperature, # 0 to 1 (1 random)\n", " top_p=top_p, # 0 to 1 (1 random)\n", " top_k=top_k, # (1 to 40 random)\n", " ).to_pandas()\n", " try:\n", " # Remove common LLM output mistakes\n", " result = prediction['ml_generate_text_llm_result'][0]\n", "\n", " result = result.replace(\"```json\\n\",\"\")\n", " result = result.replace(\"```JSON\\n\",\"\")\n", " result = result.replace(\"```json\",\"\")\n", " result = result.replace(\"```JSON\",\"\")\n", " result = result.replace(\"```sql\\n\",\"\")\n", " result = result.replace(\"```SQL\\n\",\"\")\n", " result = result.replace(\"```sql\",\"\")\n", " result = result.replace(\"```SQL\",\"\")\n", " result = result.replace(\"```\",\"\")\n", "\n", " if isOutputJson:\n", " json_string = PrettyPrintJson(result)\n", " json_string = json_string.replace(\"'\",\"\\\\'\")\n", " json_string = json_string.strip()\n", " return json_string\n", " else:\n", " result = result.replace(\"'\",\"\\\\'\")\n", " result = result.strip()\n", " return result\n", "\n", " except:\n", " print(\"Error (raw): \", prediction['ml_generate_text_llm_result'][0])\n", " print(\"Error (result): \", result)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_8p8PkhFibYw" }, "outputs": [], "source": [ "def GetTableSchema(dataset_name, table_name):\n", " import io\n", "\n", " 
dataset_ref = client.dataset(dataset_name, project=PROJECT_ID)\n", " table_ref = dataset_ref.table(table_name)\n", " table = client.get_table(table_ref)\n", "\n", " f = io.StringIO(\"\")\n", " client.schema_to_json(table.schema, f)\n", " return f.getvalue()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HgMmeQSWvq6k" }, "outputs": [], "source": [ "def GetForeignKeys(dataset_name, table_name, field_name):\n", " sql = f\"\"\"\n", " SELECT STRING_AGG(CAST({field_name} AS STRING), \",\" ORDER BY {field_name}) AS result\n", " FROM `{PROJECT_ID}.{dataset_name}.{table_name}`\n", " \"\"\"\n", " #print(sql)\n", " df_result = client.query(sql).to_dataframe()\n", " #display(df_result)\n", " return df_result['result'].iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "esIZXzKjsKIF" }, "outputs": [], "source": [ "def GetDistinctValues(dataset_name, table_name, field_name):\n", " sql = f\"\"\"\n", " SELECT STRING_AGG(DISTINCT {field_name}, \",\" ) AS result\n", " FROM `{PROJECT_ID}.{dataset_name}.{table_name}`\n", " \"\"\"\n", " #print(sql)\n", " df_result = client.query(sql).to_dataframe()\n", " #display(df_result)\n", " return df_result['result'].iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Tto7sIdJ5vYi" }, "outputs": [], "source": [ "def GetStartingValue(dataset_name, table_name, field_name):\n", " sql = f\"\"\"\n", " SELECT IFNULL(MAX({field_name}),0) + 1 AS result\n", " FROM `{PROJECT_ID}.{dataset_name}.{table_name}`\n", " \"\"\"\n", " #print(sql)\n", " df_result = client.query(sql).to_dataframe()\n", " #display(df_result)\n", " return df_result['result'].iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7sK-fbCaVMCZ" }, "outputs": [], "source": [ "def GetMaximumValue(dataset_name, table_name, field_name):\n", " sql = f\"\"\"\n", " SELECT IFNULL(MAX({field_name}),0) AS result\n", " FROM `{PROJECT_ID}.{dataset_name}.{table_name}`\n", " \"\"\"\n", 
" #print(sql)\n", " df_result = client.query(sql).to_dataframe()\n", " #display(df_result)\n", " return df_result['result'].iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DFqXl0cefaOe" }, "outputs": [], "source": [ "def RunQuery(sql):\n", " import time\n", "\n", " #return True # return early for now\n", "\n", " job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n", " query_job = client.query(sql, job_config=job_config)\n", "\n", " # Check on the progress by getting the job's updated state.\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " while query_job.state != \"DONE\":\n", " time.sleep(5)\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " if query_job.error_result == None:\n", " return True\n", " else:\n", " return False" ] }, { "cell_type": "markdown", "metadata": { "id": "CUPdwNf_89Gq" }, "source": [ "## Create Tables" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V1DRyogrX0Yz" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.company`\n", "(\n", " company_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " company_name STRING NOT NULL OPTIONS(description=\"Name of the company.\"),\n", ")\n", "CLUSTER BY company_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mV5dmUl2X1S6" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.location`\n", "(\n", " location_id INTEGER NOT NULL 
OPTIONS(description=\"Primary key.\"),\n", " company_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: A valid value from the company table.\"),\n", " location_name STRING NOT NULL OPTIONS(description=\"Name of the location.\"),\n", " location_type STRING NOT NULL OPTIONS(description=\"Valid values for location_type are: 'Fixed', 'Mobile'\"),\n", ")\n", "CLUSTER BY location_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nvfwRUU2YTXC" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.customer`\n", "(\n", " customer_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " company_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: A valid value from the company table.\"),\n", " customer_name STRING NOT NULL OPTIONS(description=\"Name of the customer.\"),\n", " customer_yob INT NOT NULL OPTIONS(description=\"Customer year of birth\"),\n", " customer_email STRING NOT NULL OPTIONS(description=\"Customer's email address\"),\n", " customer_inception_date DATE NOT NULL OPTIONS(description=\"Date of first customer interaction\")\n", "\n", ")\n", "CLUSTER BY customer_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nK3j19oMEpGn" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.customer_profile`\n", "(\n", " customer_id INTEGER NOT NULL OPTIONS(description=\"Primary key. 
Foreign key: Customer table.\"),\n", " customer_llm_summary STRING NOT NULL OPTIONS(description=\"LLM generated summary of customer data.\"),\n", " customer_lifetime_value STRING NOT NULL OPTIONS(description=\"Total sales for this customer.\"),\n", " customer_cluster_id INT NOT NULL OPTIONS(description=\"Clustering algorithm id.\"),\n", " customer_review_llm_summary STRING NOT NULL OPTIONS(description=\"LLM summary of all of the customer reviews.\"),\n", " customer_survey_llm_summary STRING NOT NULL OPTIONS(description=\"LLM summary of all of the customer surveys.\")\n", "\n", ")\n", "CLUSTER BY customer_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hEdN9VbhFRnT" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.customer_survey`\n", "(\n", " customer_survey_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " customer_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Customer table\"),\n", " survey_date DATE NOT NULL OPTIONS(description=\"The date of the survey\"),\n", " question_1 STRING NOT NULL OPTIONS(description=\"A survey question.\"),\n", " response_1 STRING NOT NULL OPTIONS(description=\"The customer's response.\"),\n", " question_2 STRING NOT NULL OPTIONS(description=\"A survey question.\"),\n", " response_2 STRING NOT NULL OPTIONS(description=\"The customer's response.\"),\n", " question_3 STRING NOT NULL OPTIONS(description=\"A survey question.\"),\n", " response_3 STRING NOT NULL OPTIONS(description=\"The customer's response.\")\n", ")\n", "CLUSTER BY customer_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cKK7KEnaH_Em" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.customer_review`\n", "(\n", " customer_review_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " customer_id 
INTEGER NOT NULL OPTIONS(description=\"Foreign key: Customer table\"),\n", " location_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Location table\"),\n", " review_datetime TIMESTAMP NOT NULL OPTIONS(description=\"Date and time of the review.\"),\n", " review_text STRING NOT NULL OPTIONS(description=\"The customer's review of the coffee.\"),\n", " review_mp3 STRING OPTIONS(description=\"The GCS location of an attached MP3 file.\"),\n", " review_image STRING OPTIONS(description=\"The GCS location of an attached image file.\"),\n", " review_sentiment STRING OPTIONS(description=\"The sentiment of the review text.\"),\n", " social_media_source STRING NOT NULL OPTIONS(description=\"The social media site the review was posted on.\"),\n", " social_media_handle STRING NOT NULL OPTIONS(description=\"The customer's social media handle\"),\n", " gen_ai_recommended_action STRING OPTIONS(description=\"Valid values for gen_ai_recommended_action are: 'Send Survey', 'Send Coupon'\"),\n", " gen_ai_reponse STRING OPTIONS(description=\"The Generated response from the LLM.\"),\n", " human_response STRING OPTIONS(description=\"The human manually entered response.\"),\n", " response_sent_action STRING OPTIONS(description=\"Valid values for response_sent_action are: 'Sent LLM Response', 'Human called'\"),\n", " response_sent_date TIMESTAMP OPTIONS(description=\"Date and time the response was sent.\")\n", "\n", ")\n", "CLUSTER BY customer_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bktx53fDfszf" }, "outputs": [], "source": [ "sql = f\"\"\"ALTER TABLE `{PROJECT_ID}.{DATASET_ID}.customer_review`\n", " ADD COLUMN IF NOT EXISTS llm_detected_theme JSON OPTIONS(description=\"The LLM detected themes in the customer review.\");\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1mIv7Q-DRxI4" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT 
EXISTS `{PROJECT_ID}.{DATASET_ID}.menu`\n", "(\n", " menu_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " company_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Company table.\"),\n", " item_name STRING NOT NULL OPTIONS(description=\"The name of the coffee drink\"),\n", " item_price STRING NOT NULL OPTIONS(description=\"The price of the coffee\"),\n", " item_description STRING NOT NULL OPTIONS(description=\"The detailed description of the coffee drink\"),\n", " item_size STRING NOT NULL OPTIONS(description=\"Valid Values: Small, Medium, Large\")\n", "\n", ")\n", "CLUSTER BY menu_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Uiu6p13mAjQG" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.order`\n", "(\n", " order_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " location_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Location table.\"),\n", " customer_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Customer table.\"),\n", " order_datetime TIMESTAMP NOT NULL OPTIONS(description=\"The datetime the order was started.\"),\n", " order_completion_datetime TIMESTAMP NOT NULL OPTIONS(description=\"The datetime the order was completed.\")\n", "\n", ")\n", "CLUSTER BY order_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IoR5uQmLEOiJ" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.order_item`\n", "(\n", " order_item_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " order_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Order table\"),\n", " menu_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Menu table\"),\n", " quantity INTEGER NOT NULL OPTIONS(description=\"Number of items ordered\")\n", "\n", ")\n", "CLUSTER BY order_id;\n", "\"\"\"\n", "\n", 
"RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TJcSIw-ym4d6" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.coffee_farm`\n", "(\n", " coffee_farm_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " name STRING NOT NULL OPTIONS(description=\"The name of the coffee farm.\"),\n", " latitude FLOAT64 NOT NULL OPTIONS(description=\"The latitude of the coffee farm.\"),\n", " longitude FLOAT64 NOT NULL OPTIONS(description=\"The longitude of the coffee farm.\"),\n", " lat_long GEOGRAPHY NOT NULL OPTIONS(description=\"The latitude and longitude of the coffee farm.\"),\n", " contact_name STRING NOT NULL OPTIONS(description=\"The main contact person of the coffee farm.\"),\n", " contact_email STRING NOT NULL OPTIONS(description=\"The email address of the coffee farm.\"),\n", " contact_code STRING NOT NULL OPTIONS(description=\"Alpha 3 Code\")\n", ")\n", "CLUSTER BY coffee_farm_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3OESaGAsbxis" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.coffee_processor`\n", "(\n", " coffee_processor_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " name STRING NOT NULL OPTIONS(description=\"The name of the coffee processing facility.\"),\n", " latitude FLOAT64 NOT NULL OPTIONS(description=\"The latitude of the coffee processing facility.\"),\n", " longitude FLOAT64 NOT NULL OPTIONS(description=\"The longitude of the coffee processing facility.\"),\n", " lat_long GEOGRAPHY NOT NULL OPTIONS(description=\"The latitude and longitude of the coffee processing facility.\"),\n", " contact_name STRING NOT NULL OPTIONS(description=\"The main contact person of the coffee processing facility.\"),\n", " contact_email STRING NOT NULL OPTIONS(description=\"The email address of the coffee processing 
facility.\"),\n", " contact_code STRING NOT NULL OPTIONS(description=\"Alpha 3 Code\")\n", ")\n", "CLUSTER BY coffee_processor_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9y95NMOufFEF" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.coffee_roaster`\n", "(\n", " coffee_roaster_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " name STRING NOT NULL OPTIONS(description=\"The name of the coffee roaster facility.\"),\n", " latitude FLOAT64 NOT NULL OPTIONS(description=\"The latitude of the coffee roaster facility.\"),\n", " longitude FLOAT64 NOT NULL OPTIONS(description=\"The longitude of the coffee roaster facility.\"),\n", " lat_long GEOGRAPHY NOT NULL OPTIONS(description=\"The latitude and longitude of the coffee roaster facility.\"),\n", " contact_name STRING NOT NULL OPTIONS(description=\"The main contact person of the coffee roaster facility.\"),\n", " contact_email STRING NOT NULL OPTIONS(description=\"The email address of the coffee roaster facility.\"),\n", " contact_code STRING NOT NULL OPTIONS(description=\"Alpha 3 Code\")\n", ")\n", "CLUSTER BY coffee_roaster_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dd5pjKEzSaji" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.city`\n", "(\n", " city_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " city_name STRING NOT NULL OPTIONS(description=\"The name of the city.\"),\n", " country_code STRING NOT NULL OPTIONS(description=\"Alpha 3 Code.\"),\n", " latitude FLOAT64 NOT NULL OPTIONS(description=\"The latitude of the city.\"),\n", " longitude FLOAT64 NOT NULL OPTIONS(description=\"The longitude of the city.\"),\n", " lat_long GEOGRAPHY NOT NULL OPTIONS(description=\"The latitude and longitude of the city.\")\n", ")\n", "CLUSTER BY 
city_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)\n", "\n", "sql = f\"\"\"\n", "INSERT INTO `{PROJECT_ID}.{DATASET_ID}.city` VALUES\n", "(1, 'New York City', 'USA', 40.7128, -74.0060, ST_GeogPoint(-74.0060, 40.7128)),\n", "(2, 'London', 'GBR', 51.5072, -0.1276, ST_GeogPoint(-0.1276, 51.5072)),\n", "(3, 'Tokyo', 'JPN', 35.6764, 139.6500, ST_GeogPoint(139.6500, 35.6764)),\n", "(4, 'San Francisco', 'USA', 37.7749, -122.4194, ST_GeogPoint(-122.4194, 37.7749));\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fnQfyW0XR7jT" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.sales_forecast`\n", "(\n", " sales_forecast_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " forecast_date DATE NOT NULL OPTIONS(description=\"The date for the forecasted sales\"),\n", " sales_forecast_amount FLOAT64 NOT NULL OPTIONS(description=\"The projected sales for the city.\"),\n", " city_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: City table.\")\n", ")\n", "CLUSTER BY sales_forecast_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5DavsmDlXmLE" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.marketing_campaign`\n", "(\n", " marketing_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " marketing_campaign_name STRING NOT NULL OPTIONS(description=\"The name of the marketing campaign.\"),\n", " marketing_cost FLOAT64 NOT NULL OPTIONS(description=\"The cost for the campaign.\"),\n", " number_of_new_customers INT64 NOT NULL OPTIONS(description=\"The number of new customers acquired by the campaign.\")\n", ")\n", "CLUSTER BY marketing_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)\n", "\n", "sql = f\"\"\"\n", "INSERT INTO `{PROJECT_ID}.{DATASET_ID}.marketing_campaign` VALUES\n", "(1, 'Google Ads', 15000, 1500),\n", "(2, 'Billboards', 4000, 200);\n", "\"\"\"\n", "\n", 
"RunQuery(sql)" ] }, { "cell_type": "markdown", "metadata": { "id": "Jr2sIqNP800E" }, "source": [ "## Company Table" ] }, { "cell_type": "markdown", "metadata": { "id": "RKpEobqAXesn" }, "source": [ "Create the names for the coffee companies.\n", "\n", "* Create the names outside of the SQL statement. This provides more control over the names.\n", "* Use the names for the SQL prompt.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qj3hXqA_FZWm" }, "outputs": [], "source": [ "company_count = 10\n", "\n", "table_name = \"company\"\n", "primary_key = \"company_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TvLGkqIcWqH6" }, "outputs": [], "source": [ "prompt = f\"\"\"Generate {company_count} creative names and return in the below json format.\n", "- The name should be new and not a name that is already used by an existing coffee company.\n", "- The name should be related to coffee.\n", "- The name should be related to a food truck type of service.\n", "\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " company_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "print(f\"company_names: {company_names}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N8mIkpstyyw0" }, "outputs": [], "source": [ "# Override (hardcoded)\n", "company_names = \"\"\"[\n", " \"BeanStreet Brews\",\n", " \"MochaWheels Cafe\",\n", " \"Brew 'n Go Bistro\",\n", " \"Rolling Roast Express\",\n", " \"JavaJolt Junction\",\n", " \"Café on Wheels\",\n", " \"Espresso Eats Mobile\",\n", " \"CuppaCruiser\",\n", " \"The Roaming Roastery\",\n", " \"StreetBean 
Delights\"\n", "]\"\"\"\n", "\n", "company_names = company_names.replace(\"'\",\"\\\\'\") # Deal with single quotes in output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dmdXJAxHy7xU" }, "outputs": [], "source": [ "company_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7qb4CFnsglfU" }, "outputs": [], "source": [ "starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", "prompt=f\"\"\"\n", "You are a database engineer and need to generate data for a table for the below schema.\n", "- The schema is for a Google Cloud BigQuery Table.\n", "- The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", "- Read the description of each field for valid values.\n", "- Do not preface the response with any special characters or 'sql'.\n", "- Generate {company_count} insert statements for this table.\n", "- Valid values for company_name are: {company_names}\n", "- The starting value of the field {primary_key} is {starting_value}.\n", "\n", "Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", "Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", "Schema: {schema}\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "deSLhzKC8qHQ" }, "source": [ "## Location Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AkAbXzGMFWEF" }, "outputs": [], "source": [ "location_count = 50\n", 
"location_list = \"New York, San Francisco, Miami\"\n", "\n", "table_name = \"company\"\n", "field_name = \"company_id\"\n", "company_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"location\"\n", "primary_key = \"location_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QayPYuBQX3q9" }, "outputs": [], "source": [ "prompt = f\"\"\"Generate {location_count} creative names and return in the below json format.\n", "- The name should be new and not a name that is already used by an existing coffee company.\n", "- The name should be related to mobile coffee trucks that travel around the city.\n", "- The name should be related the following locations: {location_list}\n", "\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " location_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7hzH8uvczlN7" }, "outputs": [], "source": [ "# Override (hardcoded)\n", "\n", "location_names = \"\"\"[\n", " \"CitySips Roaming Cafe\",\n", " \"JavaJourney Express\",\n", " \"UrbanCaffeine Cruiser\",\n", " \"CafeWheels NY\",\n", " \"Golden Gate Grind Mobile\",\n", " \"Sunshine Sips on Wheels\",\n", " \"Big Apple Brew Bus\",\n", " \"Bay Brew Hauler\",\n", " \"Magic City Mocha Mobile\",\n", " \"Metropolis Mug Mover\",\n", " \"New York Nectar Nomad\",\n", " \"SFO Street Sips\",\n", " \"MiaMornings Mobile\",\n", " \"CityBeans Roam-uccino\",\n", " \"Sunrise City Sipper\",\n", " \"Gotham Grind on Wheels\",\n", " \"Bay Area Bean Bus\",\n", " \"Mia Mochaccino Mobile\",\n", " \"Cityscape Sip Stop\",\n", " \"Transit Brew 
Buggy\",\n", " \"Fog City Fueler\",\n", " \"Miami Metro Mugs\",\n", " \"NY Espresso Express\",\n", " \"Golden Grind Getter\",\n", " \"Sunny Side Sips Shuttle\",\n", " \"Empire Espresso Explorer\",\n", " \"SF Sidewalk Sipper\",\n", " \"Beachside Brew Bounder\",\n", " \"Urban Sipper's Shuttle\",\n", " \"NYC Nomadic Nectar\",\n", " \"Golden Bridge Brewmobile\",\n", " \"Sunny State Sipster\",\n", " \"Cafe Cruiser Central\",\n", " \"NY Neighborhood Nectar\",\n", " \"Frisco Fuel on Wheels\",\n", " \"MiaMug Mobility\",\n", " \"Metropolitan Mochaccino\",\n", " \"CitySips Street Surfer\",\n", " \"Golden Gate Gourmet Glide\",\n", " \"Beach Breeze Brew Bus\",\n", " \"City Roast Cruiser\",\n", " \"NYC Urban Uplifter\",\n", " \"Frisco Fresh Brews\",\n", " \"Miami Magic Mugs\",\n", " \"Coffee Cart Connection\",\n", " \"Empire City Espresso Explorer\",\n", " \"Golden Glow Grind Rover\",\n", " \"Sun-Kissed Sip & Go\",\n", " \"CityLife Latte Lorry\",\n", " \"Cityscape Sipper Shuttle\"\n", "]\"\"\"\n", "\n", "location_names = location_names.replace(\"'\",\"\\\\'\") # Deal with single quotes in output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3WpCWlL-lb9h" }, "outputs": [], "source": [ "starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", "prompt=f\"\"\"\n", "You are a database engineer and need to generate data for a table for the below schema.\n", "- The schema is for a Google Cloud BigQuery Table.\n", "- The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", "- Read the description of each field for valid values.\n", "- Do not preface the response with any special characters or 'sql'.\n", "- Generate {location_count} insert statements for this table.\n", "- Valid values for location_name are: {location_names}\n", "- Valid values for company_id are: {company_ids}\n", "- The starting value of the field {primary_key} is {starting_value}.\n", "\n", "Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, 
field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", "Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", "Schema: {schema}\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "nlhRKtPAsAWz" }, "source": [ "## Customer Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FwWRZChiFTIl" }, "outputs": [], "source": [ "customer_count = 20\n", "country = \"United States\"\n", "\n", "table_name = \"company\"\n", "field_name = \"company_id\"\n", "company_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"customer\"\n", "primary_key = \"customer_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6DcfseTbsB8u" }, "outputs": [], "source": [ "loop_count = 10\n", "loop_index = 1\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {customer_count} insert statements for this table.\n", " - The 
customer_inception_date is a date and should be within the past 5 years.\n", " - The customer_name should be names used in the country {country} and be a first and last name.\n", " - Valid values for company_id are: {company_ids}\n", " - The starting value of the field {primary_key} is {starting_value}.\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "ylfzCAIzEALT" }, "source": [ "## Customer Survey Table" ] }, { "cell_type": "markdown", "metadata": { "id": "ZEvAdTsKaLbB" }, "source": [ "* Generate a list of survey coffee questions\n", "* Save these, so we re-use in each loop\n", "* Generate survey responses\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "W9NKsQYbEALe" }, "outputs": [], "source": [ "customer_survey_count = 3\n", "\n", "prompt = f\"\"\"You work in the marketing department and are gathering feedback on coffee served to customers.\n", "Generate {customer_survey_count} random coffee survey questions and return in the below JSON format.\n", "- Do not include any special characters in the json.\n", "- Do not include ```json in the output.\n", "\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", 
"llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " generatcustomer_surveysed_customer_surveys = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "# Hard coded so we ask the same questions (for now)\n", "customer_surveys = '[\"How satisfied were you with the taste of our coffee?\", \"How likely are you to recommend our coffee to a friend?\", \"Which of the following best describes your overall experience with our coffee?\"]'\n", "\n", "# Load the json do we index\n", "customer_surveys_json = json.loads(customer_surveys)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "CceughTOFQW-" }, "outputs": [], "source": [ "rows_of_data_to_generate = 5\n", "\n", "table_name = \"customer\"\n", "field_name = \"customer_id\"\n", "customer_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"customer_survey\"\n", "primary_key = \"customer_survey_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UQL9jWr0EALe" }, "outputs": [], "source": [ "loop_count = 10\n", "loop_index = 1\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {rows_of_data_to_generate} insert statements for this table.\n", " - Valid values for customer_id are: 
{customer_ids}\n", " - The survey_date is a date and should be within the past 5 years.\n", " - Valid values for question_1 is {customer_surveys_json[0]}\n", " - Valid values for question_2 is {customer_surveys_json[1]}\n", " - Valid values for question_3 is {customer_surveys_json[2]}\n", " - The response for each question should be 2 to 20 words.\n", " - The response can have a postive, neutral or negative sentiment.\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Escape single quotes with a backslash. Example: Adam's Answer: Adam\\'s\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "WbxE7eDp-L1j" }, "source": [ "## Customer Review Table" ] }, { "cell_type": "markdown", "metadata": { "id": "Mg8ZZEViq9lu" }, "source": [ "### Create customer reviews" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8VwUtOMh-L1j" }, "outputs": [], "source": [ "rows_of_data_to_generate = 3\n", "\n", "table_name = \"customer\"\n", "field_name = \"customer_id\"\n", "customer_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"location\"\n", "field_name = \"location_id\"\n", "location_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", 
"table_name = \"customer_review\"\n", "primary_key = \"customer_review_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LL6cYlwWCYQu" }, "outputs": [], "source": [ "# location_ids='1,11,21'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bVm36Q_1-L1k" }, "outputs": [], "source": [ "import random\n", "loop_count = 1000\n", "loop_index = 1\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " if random.random() < .25:\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " You need to generate reviews for customers who have purchased your brewed coffee.\n", " Write a negative review in first person based upon the following: \"Bad Service\",\"Long Wait Time\",\"Slow Service\",\"Dirty\",\"Overpriced\",\"Overcrowded\",\"Noisy Location\",\"Lack of Allergan Information\",\"Inconsistent Quality\",\"Lack of Seating\",\"No Flavor\",\"Too weak\",\"Too strong\",\"Too bitter\",\"Limited Menu\"\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {rows_of_data_to_generate} insert statements for this table.\n", " - Valid values for customer_id are: {customer_ids}\n", " - Valid values for location_id are: {location_ids}\n", " - The review_datetime is a date and should be within the past 5 years.\n", " - The response for each question should be 20 to 100 words.\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Escape single quotes with a backslash. 
Example: Adam's Answer: Adam\\'s\n", " - Only generate data for these fields: customer_review_id, customer_id, location_id, review_datetime, review_text, social_media_source, social_media_handle\n", "\n", " Examples:\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", " else:\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " You need to generate reviews for customers who have purchased your brewed coffee.\n", " Write a positive or neutral review in first person based upon the following: \"Good Service\",\"Short Wait Time\",\"Fast Service\",\"Clean\",\"Good value\",\"Cozy Seating Areas\",\"Quite Location\",\"Variety of Milk Alternatives\",\"Consistent Quality\",\"Lots of places to sit\",\"Lots of Flavor\",\"Good Taste\",\"Good Selection\"\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {rows_of_data_to_generate} insert statements for this table.\n", " - Valid values for customer_id are: {customer_ids}\n", " - The review_datetime is a date and should be within the past 5 years.\n", " - The response for each question should be 20 to 100 words.\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Escape single quotes with a backslash. 
Example: Adam's Answer: Adam\\'s\n", " - Only generate data for these fields: customer_review_id, customer_id, location_id, review_datetime, review_text, social_media_source, social_media_handle\n", "\n", " Examples:\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "uWUogwtAqNTT" }, "source": [ "### Score the Sentiment" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fSB45vFQPnaM" }, "outputs": [], "source": [ "sql = \"\"\"SELECT customer_review_id,\n", " review_text\n", " FROM `data-analytics-preview.coffee_curated.customer_review`\n", " WHERE review_sentiment IS NULL\n", " ORDER BY customer_review_id\n", "\"\"\"\n", "\n", "# Fields to update\n", "# review_sentiment,\n", "# gen_ai_recommended_action,\n", "# gen_ai_reponse,\n", "# human_response,\n", "# response_sent_action,\n", "# response_sent_date\n", "\n", "df_process = client.query(sql).to_dataframe()\n", "\n", "for row in df_process.itertuples():\n", " customer_review_id = row.customer_review_id\n", " review_text = row.review_text\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " prompt=f\"\"\"\n", " For the given review classify the sentiment as Positive, Neutral or Negative.\n", " Review: 
{review_text}\n", " \"\"\"\n", " review_sentiment = LLM(prompt, False, max_output_tokens=10, temperature=0, top_p=0, top_k=1)\n", "\n", " sql = f\"\"\"UPDATE `data-analytics-preview.coffee_curated.customer_review`\n", " SET review_sentiment = '{review_sentiment}'\n", " WHERE customer_review_id = {customer_review_id}\n", " \"\"\"\n", "\n", " print (sql)\n", "\n", " llm_valid_execution = RunQuery(sql)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "6W_Dr1xGqRML" }, "source": [ "### Gen AI Response" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZJeGxIYwqUlc" }, "outputs": [], "source": [ "sql = \"\"\"SELECT customer_review_id,\n", " review_text\n", " FROM `data-analytics-preview.coffee_curated.customer_review`\n", " WHERE gen_ai_reponse IS NULL\n", " ORDER BY customer_review_id\"\"\"\n", "\n", "df_process = client.query(sql).to_dataframe()\n", "\n", "for row in df_process.itertuples():\n", " customer_review_id = row.customer_review_id\n", " review_text = row.review_text\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " prompt=f\"\"\"\n", " Generate responses to the below customer review who purchased coffee and return the results the below json format.\n", " The review can be positive, negative, or neutral.\n", " Provide a variety of responses, including thanking customers for positive reviews, addressing concerns in negative reviews, and engaging with neutral reviews.\n", " Please generate at least 5 different responses.\n", "\n", " JSON format: [ \"value\" ]\n", " Sample JSON Response: [ \"response 1\", \"response 2\", \"response 3\", \"response 4\", \"response 5\" ]\n", "\n", " Review: {review_text}\"\"\"\n", "\n", " json_result = LLM(prompt, True, max_output_tokens=1024, temperature=0, top_p=0, top_k=1)\n", " print(f\"json_result: {json_result}\")\n", "\n", " if json_result 
== None:\n", " llm_valid_execution = False\n", " else:\n", " sql = f\"\"\"UPDATE `data-analytics-preview.coffee_curated.customer_review`\n", " SET gen_ai_reponse = '{json_result}'\n", " WHERE customer_review_id = {customer_review_id}\n", " \"\"\"\n", "\n", " print(f\"sql: {sql}\")\n", "\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "azULzoGAqVGz" }, "source": [ "### Gen AI Recommended Action" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "noL3Es8tqjr0" }, "outputs": [], "source": [ "sql = \"\"\"SELECT customer_review_id,\n", " review_text\n", " FROM `data-analytics-preview.coffee_curated.customer_review`\n", " WHERE gen_ai_recommended_action IS NULL\n", " ORDER BY customer_review_id\"\"\"\n", "\n", "df_process = client.query(sql).to_dataframe()\n", "\n", "for row in df_process.itertuples():\n", " customer_review_id = row.customer_review_id\n", " review_text = row.review_text\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " prompt=\"\"\"\n", " Select one of the following actions based upon the below customer review who purchased coffee.\n", " - First randomly sort the actions.\n", " - Select the best action based upon the sentiment of the review.\n", " - It is okay to use the action \"Send the customer a coupon\" for both positive and negative reviews.\n", " - Return the results the below json format.\n", " - Do not include any special characters or \"```json\" in the json output\n", "\n", " Actions\n", " - \"Thank the Customer\"\n", " - \"Apologize to the Customer\"\n", " - \"Send the customer a coupon\"\n", " - \"Call the customer\"\n", " - \"Promote Additional Products\"\n", " - \"Promise to Investigate\"\n", " - \"Encourage More Reviews\"\n", " - \"Invite Further Engagement\"\n", " - \"Reshare the review on other social media\"\n", "\n", " JSON format: { 
\"action\" : \"value\", \"explaination\" : \"llm explaination\" }\n", " Sample JSON Response: { \"action\" : \"Call the customer\", \"explaination\" : \"The customer left their phone number in the review.\" }\n", " Sample JSON Response: { \"action\" : \"Encourage More Reviews\", \"explaination\" : \"Thanks for the review, please keep posting.\" }\n", "\n", " Review:\"\"\"\n", " prompt = prompt + review_text\n", "\n", " json_result = LLM(prompt, True, max_output_tokens=1024, temperature=0, top_p=0, top_k=1)\n", " print(f\"json_result: {json_result}\")\n", "\n", " if json_result == None:\n", " llm_valid_execution = False\n", " else:\n", " sql = f\"\"\"UPDATE `data-analytics-preview.coffee_curated.customer_review`\n", " SET gen_ai_recommended_action = '{json_result}'\n", " WHERE customer_review_id = {customer_review_id}\n", " \"\"\"\n", "\n", " print(f\"sql: {sql}\")\n", "\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "mOtL7RgiZox9" }, "source": [ "### Detect Customer Themes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "y7r3Kzovd-y2" }, "outputs": [], "source": [ "sql = \"\"\"SELECT customer_review_id,\n", " review_text\n", " FROM `data-analytics-preview.coffee_curated.customer_review`\n", " WHERE llm_detected_theme IS NULL\n", " ORDER BY customer_review_id\"\"\"\n", "\n", "df_process = client.query(sql).to_dataframe()\n", "\n", "for row in df_process.itertuples():\n", " customer_review_id = row.customer_review_id\n", " review_text = row.review_text\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " prompt=\"\"\"\n", " Classify the below customer review as one or more of the below themes.\n", " - Return the results the below json format.\n", " - Include an explaination for selecting each theme.\n", " - Do not include double quotes in the explaination.\n", " - Do not 
include any special characters, double quotes or \"```json\" in the json output.\n", "\n", " Themes\n", " - \"Bad Service\"\n", " - \"Long Wait Time\"\n", " - \"Slow Service\"\n", " - \"Dirty\"\n", " - \"Overpriced\"\n", " - \"Overcrowded\"\n", " - \"Noisy Location\"\n", " - \"Lack of Allergan Information\"\n", " - \"Inconsistent Quality\"\n", " - \"Lack of Seating\"\n", " - \"No Flavor\"\n", " - \"Too weak\"\n", " - \"Too strong\"\n", " - \"Too bitter\"\n", " - \"Limited Menu\"\n", " - \"Good Service\"\n", " - \"Short Wait Time\"\n", " - \"Fast Service\"\n", " - \"Clean\"\n", " - \"Good value\"\n", " - \"Cozy Seating Areas\"\n", " - \"Quite Location\"\n", " - \"Variety of Milk Alternatives\"\n", " - \"Consistent Quality\"\n", " - \"Lots of places to sit\"\n", " - \"Lots of Flavor\"\n", " - \"Good Taste\"\n", " - \"Good Selection\"\n", "\n", " JSON format: [{ \"theme\" : \"value\", \"explaination\" : \"llm explaination\" }]\n", " Sample JSON Response: [{ \"theme\" : \"Fast Service\", \"explaination\" : \"The customer got their order fast.\" }]\n", " Sample JSON Response: [{ \"theme\" : \"Overpriced\", \"explaination\" : \"The customer said it was too expensive.\" }]\n", "\n", " Review:\"\"\"\n", " prompt = prompt + review_text\n", "\n", " json_result = LLM(prompt, True, max_output_tokens=1024, temperature=0, top_p=0, top_k=1)\n", " print(f\"json_result: {json_result}\")\n", "\n", " if json_result == None:\n", " llm_valid_execution = False\n", " else:\n", " sql = f\"\"\"UPDATE `data-analytics-preview.coffee_curated.customer_review`\n", " SET llm_detected_theme = JSON'{json_result}'\n", " WHERE customer_review_id = {customer_review_id}\n", " \"\"\"\n", "\n", " print(f\"sql: {sql}\")\n", "\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "Wuv45clVnqUf" }, "source": [ "## Menu Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"id": "0wyRzw-QFHm-" }, "outputs": [], "source": [ "table_name = \"company\"\n", "field_name = \"company_id\"\n", "company_ids = GetForeignKeys(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"menu\"\n", "primary_key = \"menu_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zlEe3aBvf33F" }, "outputs": [], "source": [ "loop_count = 10\n", "loop_index = 1\n", "menu_count = 10\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", "\n", " # Get Menu Names\n", " menu_count = 5 # We multiply this by 3 to get 15 (one for small, medium and large sizes)\n", "\n", " table_name = \"menu\"\n", " field_name = \"item_name\"\n", " existing_values = GetDistinctValues(DATASET_ID, table_name, field_name)\n", "\n", " prompt = f\"\"\"Generate {menu_count} different coffee drink names and return in the below json format.\n", " - The name can be an existing coffee drink or something new.\n", " - The name should be related to coffee.\n", " - Do not use any of these names: [{existing_values}]\n", " - Do not number the results.\n", "\n", " JSON format: [ \"value\" ]\n", " Sample JSON Response: [ \"value1\", \"value2\" ]\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " menu_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "\n", " # Insert data\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid 
values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {menu_count * 3} total rows for this table.\n", " - Valid values for company_id are: {company_ids}\n", " - Valid values for item_name are: {menu_names}\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Create a Small, Medium and Large size for each item_name. The same company_id should be used as well for all 3 sizes.\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6TEqBTkAAqLF" }, "outputs": [], "source": [ "loop_count = 1\n", "loop_index = 1\n", "menu_count = 20\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", "\n", " table_name = \"menu\"\n", " field_name = \"item_name\"\n", " existing_values = GetDistinctValues(DATASET_ID, table_name, field_name)\n", "\n", " prompt = f\"\"\"Generate {menu_count} different foods that you would buy with coffee and return in the below json format.\n", " - The name can be an existing food item or something new.\n", " - The name should be related to coffee.\n", " - Do not use any of these names: [{existing_values}]\n", " - Do not number the results.\n", "\n", " JSON 
format: [ \"value\" ]\n", " Sample JSON Response: [ \"value1\", \"value2\" ]\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " #menu_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", " menu_names = \"\"\"\n", " [\"Cinnamon Roll\",\n", " \"Espresso Brownie\",\n", " \"Coffee Tiramisu\",\n", " \"Cafe Latte Donut\",\n", " \"Coffee Glazed Bacon\",\n", " \"Hazelnut Croissant\",\n", " \"Mocha Pancakes\",\n", " \"Coffee-infused Oatmeal\",\n", " \"Cafe Muffin\",\n", " \"Espresso Chocolate Truffle\",\n", " \"Caffeine Chip Cookies\",\n", " \"Mocha Almond Croissant\",\n", " \"Coffee Bean Scones\",\n", " \"Java Chip Pancakes\",\n", " \"Hazelnut Latte Donut\",\n", " \"Cappuccino Cheesecake\",\n", " \"Coffee Tiramisu\",\n", " \"Caramel Macchiato Muffin\",\n", " \"Coffee-Infused Chocolate Truffles\",\n", " \"Coffee Bean Ice Cream\"]\"\"\"\n", " # Insert data\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {menu_count} total rows for this table.\n", " - Valid values for company_id are: {company_ids}\n", " - Valid values for item_name are: {menu_names}\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Create a Small, Medium and Large size for each item_name. 
The same company_id should be used as well for all 3 sizes.\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "KeDoqMQNAFcE" }, "source": [ "## Order Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "j8Aoyg2hE_Ru" }, "outputs": [], "source": [ "order_count = 10\n", "\n", "table_name = \"customer\"\n", "field_name = \"customer_id\"\n", "max_customer_id = GetMaximumValue(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"location\"\n", "field_name = \"location_id\"\n", "max_location_id = GetMaximumValue(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"order\"\n", "primary_key = \"order_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JPmtl23VAFcO" }, "outputs": [], "source": [ "loop_count = 50\n", "loop_index = 1\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - 
The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {order_count} insert statements for this table.\n", " - The order_datetime is a date and should be within the past 1 year.\n", " - The order_completion_datetime should be within 60 to 900 seconds of the order_datetime.\n", " - Valid values for location_id between 1 and {max_location_id}.\n", " - Valid values for customer_id between 1 and {max_customer_id}.\n", " - The starting value of the field {primary_key} is {starting_value}.\n", " - Timestamps should use this format: 2020-06-02 23:57:12.120174 UTC.\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " sql = sql.replace(\"\\\\'\",\"'\")\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "0Bni86QYEusy" }, "source": [ "## Order Item Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "op7cuZl0F60U" }, "outputs": [], "source": [ "order_item_count = 3" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b1EdHmZrFbc9" }, "outputs": [], "source": [ "table_name = \"menu\"\n", "field_name = \"menu_id\"\n", "max_menu_id = GetMaximumValue(DATASET_ID, table_name, 
field_name)\n", "\n", "table_name = \"order\"\n", "field_name = \"order_id\"\n", "max_order_id = GetMaximumValue(DATASET_ID, table_name, field_name)\n", "\n", "table_name = \"order_item\"\n", "primary_key = \"order_item_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4KYEq_cjEusy" }, "outputs": [], "source": [ "loop_count = 100\n", "loop_index = 1\n", "\n", "while loop_index <= loop_count:\n", " print(f\"loop_index: {loop_index} | loop_count: {loop_count}\")\n", " starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", " max_order_item_id = GetMaximumValue(DATASET_ID, table_name, \"order_id\")\n", "\n", " if max_order_item_id > max_order_id:\n", " print(\"Breaking out of loop since we have items for each order.\")\n", " break\n", "\n", " order_id_to_generate_data = max_order_item_id + 1\n", " print(f\"order_id_to_generate_data: {order_id_to_generate_data}\")\n", "\n", " prompt=f\"\"\"\n", " You are a database engineer and need to generate data for a table for the below schema.\n", " - The schema is for a Google Cloud BigQuery Table.\n", " - The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", " - Read the description of each field for valid values.\n", " - Do not preface the response with any special characters or 'sql'.\n", " - Generate {order_item_count} insert statements for this table.\n", " - You can have 1 to 10 items for a single order_id.\n", " - The order_id should use the value of: {order_id_to_generate_data}.\n", " - Valid values for menu_id between 1 and {max_menu_id}.\n", " - Valid values for quantity as between 1 and 4.\n", " - The starting value of the field {primary_key} is {starting_value}.\n", "\n", " Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", " Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 
'Data'),(3, 'Data');\n", "\n", " Schema: {schema}\n", " \"\"\"\n", "\n", "\n", " llm_valid_execution = False\n", " while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " loop_index = loop_index + 1\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "6LvkDmsAZVuQ" }, "source": [ "## Coffee Farm Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "u0GZ4jVSZVuR" }, "outputs": [], "source": [ "coffee_farm_count = 10\n", "\n", "table_name = \"coffee_farm\"\n", "primary_key = \"coffee_farm_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0rm-TxQLmQuS" }, "outputs": [], "source": [ "table_name = \"coffee_farm\"\n", "field_name = \"name\"\n", "existing_values = GetDistinctValues(DATASET_ID, table_name, field_name)\n", "\n", "prompt = f\"\"\"Generate {coffee_farm_count} creative names and return in the below json format.\n", "- The name should be new and not a name that is already used by an existing coffee company.\n", "- The name should be related to coffee farms that grow coffee beans.\n", "- Do not use the following values: {existing_values}\n", "\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " coffee_farm_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "print(f\"coffee_farm_names: {coffee_farm_names}\")" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "id": "HFj7hytMZVuR" }, "outputs": [], "source": [ "starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", "prompt=f\"\"\"\n", "You are a database engineer and need to generate data for a table for the below schema.\n", "- The schema is for a Google Cloud BigQuery Table.\n", "- The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", "- Read the description of each field for valid values.\n", "- Do not preface the response with any special characters or 'sql'.\n", "- Generate {coffee_farm_count} insert statements for this table.\n", "- The starting value of the field {primary_key} is {starting_value}.\n", "- Valid values for name are: {coffee_farm_names}\n", "- Use the World Geodetic System (WGS) for the latitude and longitude values.\n", "- When inserting GEOGRAPHY data types use the BigQuery function ST_GEOGPOINT.\n", "- Escape single quotes with a backslash. Example: Adam's Answer: Adam\\'s\n", "\n", "Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", "Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", "Schema: {schema}\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "EG1vLI1rbY3o" }, "source": [ "## Coffee Processor Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2WDgpdk_bY3p" }, "outputs": [], "source": [ "coffee_processor_count = 10\n", "\n", "table_name = 
\"coffee_processor\"\n", "primary_key = \"coffee_processor_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HJgAgQckpbI4" }, "outputs": [], "source": [ "table_name = \"coffee_processor\"\n", "field_name = \"name\"\n", "existing_values = GetDistinctValues(DATASET_ID, table_name, field_name)\n", "\n", "prompt = f\"\"\"Generate {coffee_processor_count} creative names and return in the below json format.\n", "- The name should be new and not a name that is already used by an existing coffee company.\n", "- The name should be related to coffee processing facilities that process the raw beans.\n", "- Do not use the following values: {existing_values}\n", "\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " coffee_processor_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "print(f\"coffee_processor_names: {coffee_processor_names}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dpO1oNZLbY3p" }, "outputs": [], "source": [ "starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", "prompt=f\"\"\"\n", "You are a database engineer and need to generate data for a table for the below schema.\n", "- The schema is for a Google Cloud BigQuery Table.\n", "- The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", "- Read the description of each field for valid values.\n", "- Do not preface the response with any special characters or 'sql'.\n", "- Generate a single INSERT statement with {coffee_processor_count} rows of data.\n", "- The starting value of the field {primary_key} is {starting_value}.\n", "- Valid values for name are: 
{coffee_processor_names}\n", "- Use the World Geodetic System (WGS) for the latitude and longitude values.\n", "- When inserting GEOGRAPHY data types use the BigQuery function ST_GEOGPOINT.\n", "- Escape single quotes with a backslash. Example: Adam's Answer: Adam\\'s\n", "\n", "Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", "Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", "Schema: {schema}\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "x0iOmN-ue1xl" }, "source": [ "## Coffee Roaster Table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "D7A3AIEGe1xl" }, "outputs": [], "source": [ "coffee_roaster_count = 10\n", "\n", "table_name = \"coffee_roaster\"\n", "primary_key = \"coffee_roaster_id\"\n", "\n", "schema = GetTableSchema(DATASET_ID, table_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AxDwKdUWuEPV" }, "outputs": [], "source": [ "table_name = \"coffee_roaster\"\n", "field_name = \"name\"\n", "existing_values = GetDistinctValues(DATASET_ID, table_name, field_name)\n", "\n", "prompt = f\"\"\"Generate {coffee_roaster_count} creative names and return in the below json format.\n", "- The name should be new and not a name that is already used by an existing coffee company.\n", "- The name should be related to coffee roasters that roast the beans to perfection.\n", "- Do not use the following values: {existing_values}\n", 
"\n", "JSON format: [ \"value\" ]\n", "Sample JSON Response: [ \"value1\", \"value2\" ]\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " coffee_roaster_names = LLM(prompt, True, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " llm_valid_execution = True\n", " except Exception as error:\n", " print(\"An error occurred:\", error)\n", "\n", "print(f\"coffee_roaster_names: {coffee_roaster_names}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TYBHUYUye1xl" }, "outputs": [], "source": [ "starting_value = GetStartingValue(DATASET_ID, table_name, primary_key)\n", "\n", "prompt=f\"\"\"\n", "You are a database engineer and need to generate data for a table for the below schema.\n", "- The schema is for a Google Cloud BigQuery Table.\n", "- The table name is \"{PROJECT_ID}.{DATASET_ID}.{table_name}\".\n", "- Read the description of each field for valid values.\n", "- Do not preface the response with any special characters or 'sql'.\n", "- Generate a single INSERT statement with {coffee_roaster_count} rows of data.\n", "- The starting value of the field {primary_key} is {starting_value}.\n", "- Valid values for name are: {coffee_roaster_names}\n", "- Use the World Geodetic System (WGS) for the latitude and longitude values.\n", "- When inserting GEOGRAPHY data types use the BigQuery function ST_GEOGPOINT.\n", "- Escape single quotes with a backslash. 
Example: Adam's Answer: Adam\\'s\n", "\n", "Example 1: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Sample'),(2, 'Sample');\n", "Example 2: INSERT INTO `my-dataset.my-dataset.my-table` (field_1, field_2) VALUES (1, 'Data'),(2, 'Data'),(3, 'Data');\n", "\n", "Schema: {schema}\n", "\"\"\"\n", "\n", "llm_valid_execution = False\n", "while llm_valid_execution == False:\n", " try:\n", " sql = LLM(prompt, False, max_output_tokens=1024, temperature=1, top_p=1, top_k=40)\n", " print(\"---------------------------------\")\n", " print(\"sql: \", sql)\n", " print(\"---------------------------------\")\n", " llm_valid_execution = RunQuery(sql)\n", " except Exception as error:\n", " print(\"An error occurred:\", error)" ] }, { "cell_type": "markdown", "metadata": { "id": "MJhOnDeYPgKL" }, "source": [ "# Set Locations (by hand)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kfQgzYDVOEqT" }, "outputs": [], "source": [ "sql = f\"\"\"ALTER TABLE `{PROJECT_ID}.{DATASET_ID}.location`\n", " ADD COLUMN IF NOT EXISTS city_id INT OPTIONS(description=\"The primary city for a location. 
For coffee trucks this is the current city.\"),\n", " ADD COLUMN IF NOT EXISTS current_latitude FLOAT64 OPTIONS(description=\"The current latitude of the coffee truck or physical store.\"),\n", " ADD COLUMN IF NOT EXISTS current_longitude FLOAT64 OPTIONS(description=\"The current longitude of the coffee truck or physical store.\"),\n", " ADD COLUMN IF NOT EXISTS current_lat_long GEOGRAPHY OPTIONS(description=\"The current latitude and longitude of the coffee truck or physical store.\")\n", " ;\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N4ZBxn_iNJWQ" }, "outputs": [], "source": [ "sql = f\"\"\"\n", "CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.location_history`\n", "(\n", " location_history_id INTEGER NOT NULL OPTIONS(description=\"Primary key.\"),\n", " location_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: Location table.\"),\n", " city_id INTEGER NOT NULL OPTIONS(description=\"Foreign key: City table.\"),\n", " start_datetime TIMESTAMP NOT NULL OPTIONS(description=\"The start time of when this location was established\"),\n", " stop_datetime TIMESTAMP NOT NULL OPTIONS(description=\"The end time of when this location was moved from.\"),\n", " latitude FLOAT64 NOT NULL OPTIONS(description=\"The latitude of the coffee truck.\"),\n", " longitude FLOAT64 NOT NULL OPTIONS(description=\"The longitude of the coffee truck.\"),\n", " lat_long GEOGRAPHY NOT NULL OPTIONS(description=\"The latitude and longitude of the coffee truck.\")\n", ")\n", "CLUSTER BY location_id;\n", "\"\"\"\n", "\n", "RunQuery(sql)" ] }, { "cell_type": "markdown", "metadata": { "id": "80fLTGc8BOji" }, "source": [ "# Learnings" ] }, { "cell_type": "markdown", "metadata": { "id": "Re1SPznGBRwB" }, "source": [ "Issues:\n", "- LLMs are slow\n", "- Scaling to millions of rows is an issue\n", "- Output contains: ```\n", "- Output contains: ```sql\n", "- dates are not always valid dates \"2017-09-31\" to type DATE\n", "- for 
UUIDs, use an INT and then swap to UUID later (add a column and then do an update)\n", "- LLM returns single quotes in strings. Had to prompt or string.replace.\n", "- Probably need to use Min/Max of primary key for ints.\n", "- Sometimes the LLM generates multiple insert..intos\n", "- Inserts are sometimes many INSERTS and not many values\n", "\n", "Learnings\n", "- The LLM can generate Small, Medium and Large (3 records for menu items) with pricing that is correct.\n", "- The LLM can understand the schema\n", "- The LLM can understand the description (valid values)\n", "-" ] }, { "cell_type": "markdown", "metadata": { "id": "vyV1TXsNDkQr" }, "source": [ "**Coffee Tables**\n", "- Company (done)\n", "- Location (done)\n", "- Customer (done, needs loop)\n", "- Customer Profile (created table. needs code to populate LLM items)\n", "- Customer Survey (done, needs loop)\n", "- Customer Review\n", "- Menu\n", "- Order\n", "- Order Detail\n", "- Coffee Farm\n", "- Coffee Processor\n", "- Coffee Exporter\n", "- Coffee Roaster\n", "- Coffee Supply Chain\n", "\n", "**Streaming Data**\n", "- Coffee Sales\n", "- Customer Feedback\n", "- Location Telemetry\n", "\n", "\n" ] } ], "metadata": { "colab": { "collapsed_sections": [ "DszuLZoo9A7k", "6YeExbVqf4ZE", "CUPdwNf_89Gq", "Jr2sIqNP800E", "deSLhzKC8qHQ", "nlhRKtPAsAWz", "ylfzCAIzEALT", "Mg8ZZEViq9lu", "uWUogwtAqNTT", "6W_Dr1xGqRML", "Wuv45clVnqUf", "KeDoqMQNAFcE", "0Bni86QYEusy", "6LvkDmsAZVuQ", "EG1vLI1rbY3o", "x0iOmN-ue1xl", "80fLTGc8BOji" ], "name": "BigQuery table", "private_outputs": true, "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }