colab-enterprise/gen-ai-demo/Event-Populate-Table.ipynb (467 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "k6eIqerFOzyj" }, "source": [ "## <img src=\"https://lh3.googleusercontent.com/mUTbNK32c_DTSNrhqETT5aQJYFKok2HB1G2nk2MZHvG5bSs0v_lmDm_ArW7rgd6SDGHXo0Ak2uFFU96X6Xd0GQ=w160-h128\" width=\"45\" valign=\"top\" alt=\"BigQuery\"> Populate the BigQuery Event table\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### License" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##################################################################################\n", "# Copyright 2024 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "# \n", "# https://www.apache.org/licenses/LICENSE-2.0\n", "# \n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License.\n", "###################################################################################" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Notebook Overview" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- This notebook will download the current day events from Google Events for each city.\n", "\n", "- Notebook Logic:\n", " 1. For each city\n", " - Get a list of the events \n", " - Append this to an overall list\n", " 2. Bulk insert the data into BigQuery\n", " 3. Show the results" ] }, { "cell_type": "markdown", "metadata": { "id": "2FHvD3ffG52M" }, "source": [ "## Initialize Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "h-3IL5aFP6Dl" }, "outputs": [], "source": [ "# You will need to create an account and verify your email.\n", "# https://serpapi.com/\n", "# You get 100 free calls (per month)\n", "\n", "serpapi_key = \"<<Get your own key>>\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Y8loQuV0Xu10" }, "outputs": [], "source": [ "# We will generate events for each city for the number of days. Since we have 4 cites and 100 free calls, you need to do at most 25 days.\n", "# date:today - Today's Events\n", "# date:tomorrow - Tomorrow's Events\n", "htichips = \"date:today\"\n", "\n", "# No need to set these\n", "city_names=[\"New York City\", \"London\", \"Tokyo\", \"San Francisco\"]\n", "city_ids=[1,2,3,4]\n", "city_timezones=[\"US/Eastern\",\"Europe/London\",\"Asia/Tokyo\",\"US/Pacific\"]\n", "number_of_coffee_trucks = \"5\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2EfhBUenO8OC" }, "outputs": [], "source": [ "import sys\n", "!{sys.executable} -m pip install google-search-results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HTNQKUNaRsyO" }, "outputs": [], "source": [ "from google.cloud import bigquery\n", "\n", "client = bigquery.Client()" ] }, { "cell_type": "markdown", "metadata": { "id": "KOXpog83RlPG" }, "source": [ "## Create the GenAI Insights table" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jrWuWfUCSZUv" }, "outputs": [], "source": [ "%%bigquery\n", "CREATE TABLE IF NOT EXISTS `${project_id}.${bigquery_data_beans_curated_dataset}.event`\n", "--CREATE OR REPLACE TABLE `${project_id}.${bigquery_data_beans_curated_dataset}.event` -- only use this for starting over\n", "(\n", " event_id INT NOT NULL OPTIONS(description=\"The primary key of the event.\"),\n", " city_id INT NOT NULL OPTIONS(description=\"The foreign key of the city.\"),\n", " event_title STRING NOT NULL OPTIONS(description=\"The title of the event.\"),\n", " event_date DATE NOT NULL OPTIONS(description=\"The date of the event.\"),\n", " event_time_string STRING NOT NULL OPTIONS(description=\"The time (string value) of the event.\"),\n", " event_venue STRING NOT NULL OPTIONS(description=\"The venue of the event.\"),\n", " event_venue_link STRING NOT NULL OPTIONS(description=\"The generated insight in text\"),\n", " event_address STRING NOT NULL OPTIONS(description=\"The full address of the event.\"),\n", " event_description STRING NOT NULL OPTIONS(description=\"The description of the event.\"),\n", " event_reviews INT64 NOT NULL OPTIONS(description=\"The number of reviews of the event.\"),\n", " event_thumbnail STRING NOT NULL OPTIONS(description=\"The thumbnail image for the event.\")\n", ")\n", "CLUSTER BY event_id;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Supporting Functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GZ18slUFRpjG" }, "outputs": [], "source": [ "def RunQuery(sql):\n", " import time\n", "\n", " if (sql.startswith(\"SELECT\")):\n", " df_result = client.query(sql).to_dataframe()\n", " return df_result\n", " else:\n", " job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n", " query_job = client.query(sql, job_config=job_config)\n", "\n", " # Check on the progress by getting the job's updated state.\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " while query_job.state != \"DONE\":\n", " time.sleep(2)\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n", "\n", " if query_job.error_result == None:\n", " return True\n", " else:\n", " return False" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qNUW0MFiUOFy" }, "outputs": [], "source": [ "def GetNextPrimaryKey(fully_qualified_table_name, field_name):\n", " sql = f\"\"\"\n", " SELECT IFNULL(MAX({field_name}),0) AS result\n", " FROM `{fully_qualified_table_name}`\n", " \"\"\"\n", " # print(sql)\n", " df_result = client.query(sql).to_dataframe()\n", " # display(df_result)\n", " return df_result['result'].iloc[0] + 1" ] }, { "cell_type": "markdown", "metadata": { "id": "ZrQ_gOebR2vX" }, "source": [ "## Get the list of Events" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can use a 3rd party service to download the events\n", "- https://serpapi.com/google-events-api\n", "- https://serpapi.com/blog/scrape-google-events-results-with-python/" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kuR_DSxPO0CR" }, "outputs": [], "source": [ "from serpapi import GoogleSearch\n", "from datetime import date\n", "\n", "event_id = GetNextPrimaryKey(\"${project_id}.${bigquery_data_beans_curated_dataset}.event\",\"event_id\")\n", "print(f\"event_id: {event_id}\")\n", "\n", "event_date = date.today()\n", "event_records = []\n", "\n", "# Loop for each city\n", "for city_index in range(0, 4):\n", " print(f\"City: {city_ids[city_index]}\")\n", "\n", " google_events_params = {\n", " \"engine\": \"google_events\",\n", " \"q\": f\"Events in {city_names[city_index]}\",\n", " \"hl\": \"en\",\n", " \"gl\": \"us\",\n", " \"api_key\": f\"{serpapi_key}\",\n", " 'start': 0,\n", " \"htichips\" : f\"{htichips}\"\n", " }\n", "\n", " while True:\n", " search = GoogleSearch(google_events_params)\n", " event_search_results = search.get_dict()\n", " if 'error' in event_search_results:\n", " break\n", "\n", " for item in event_search_results[\"events_results\"]:\n", " try:\n", " event_venue = \"\"\n", " address = \"\"\n", " for address_item in item[\"address\"]:\n", " if event_venue == \"\":\n", " split_text = address_item.split(\", \")\n", " event_venue = split_text[0]\n", " address = split_text[1]\n", " else:\n", " address = address + \" \" + address_item\n", "\n", " #print(f\"item: {item}\")\n", " #print(f\"event_venue: {event_venue}\")\n", " #print(f\"address: {address}\")\n", " #print(\"------------------------------------------------------\")\n", "\n", " event = {\n", " \"event_id\" : event_id,\n", " \"city_id\" : city_ids[city_index],\n", " \"event_title\" : item['title'],\n", " \"event_date\" : event_date,\n", " \"event_time_string\" : item['date']['when'],\n", " \"event_venue\" : event_venue,\n", " \"event_venue_link\" : item['venue']['link'],\n", " \"event_address\" : address,\n", " \"event_description\" : item['description'],\n", " \"event_reviews\" : item['venue']['reviews'],\n", " \"event_thumbnail\" : item['thumbnail']\n", " }\n", " event_id = event_id + 1\n", "\n", " event_records.append(event)\n", "\n", " except Exception as error:\n", " print(f\"error: {error}\")\n", " print(f\"item: {item}\")\n", " print(f\"------------------------------\")\n", "\n", " google_events_params['start'] += 10\n", "\n", " if google_events_params['start'] > 30:\n", " break # only do 30 for now so our prompt is not too large" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xUBsOC6BwuXL" }, "outputs": [], "source": [ "event_records" ] }, { "cell_type": "markdown", "metadata": { "id": "fmvU3Lv9R6TO" }, "source": [ "## Insert the events (in bulk)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sFuLk875qfPk" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "-- remove the current days events (in case you re-run this notebook)\n", "DELETE\n", " FROM `${project_id}.${bigquery_data_beans_curated_dataset}.event` AS event\n", " WHERE event_date = CURRENT_DATE();" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "A2EA2U8feZjh" }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Load the events table (in bulk)\n", "table_id = \"${project_id}.${bigquery_data_beans_curated_dataset}.event\"\n", "\n", "dataframe = pd.DataFrame(\n", " pd.DataFrame(event_records), # Your source data\n", " columns=[\n", " \"event_id\",\n", " \"city_id\",\n", " \"event_title\",\n", " \"event_date\",\n", " \"event_time_string\",\n", " \"event_venue\",\n", " \"event_venue_link\",\n", " \"event_address\",\n", " \"event_description\",\n", " \"event_reviews\",\n", " \"event_thumbnail\"\n", " ],\n", ")\n", "\n", "job_config = bigquery.LoadJobConfig(\n", " schema=[\n", " bigquery.SchemaField(\"event_id\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"city_id\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_title\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_date\", bigquery.enums.SqlTypeNames.DATE, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_time_string\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_venue\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_venue_link\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_address\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_description\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_reviews\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n", " bigquery.SchemaField(\"event_thumbnail\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\")\n", " ],\n", " write_disposition=\"WRITE_APPEND\",\n", ")\n", "\n", "job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config)\n", "job.result() # Wait for the job to complete.\n", "\n", "table = client.get_table(table_id) # Make an API request.\n", "print(\"Loaded {} rows and {} columns to {}\".format(table.num_rows, len(table.schema), table_id))" ] }, { "cell_type": "markdown", "metadata": { "id": "PampTfgdRfqe" }, "source": [ "## See the results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RCCogPjFSPHO" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "# Display the results\n", "SELECT *\n", " FROM `${project_id}.${bigquery_data_beans_curated_dataset}.event`\n", "WHERE event_date = CURRENT_DATE()\n", "ORDER BY event_id" ] } ], "metadata": { "colab": { "collapsed_sections": [ "ZrQ_gOebR2vX" ], "name": "BigQuery table", "private_outputs": true, "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }