qwiklabs/colab-enterprise/gen-ai-demo/Event-Populate-Table.ipynb (467 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "k6eIqerFOzyj"
},
"source": [
"## <img src=\"https://lh3.googleusercontent.com/mUTbNK32c_DTSNrhqETT5aQJYFKok2HB1G2nk2MZHvG5bSs0v_lmDm_ArW7rgd6SDGHXo0Ak2uFFU96X6Xd0GQ=w160-h128\" width=\"45\" valign=\"top\" alt=\"BigQuery\"> Populate the BigQuery Event table\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### License"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"##################################################################################\n",
"# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"# \n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"# \n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"###################################################################################"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Notebook Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This notebook will download the current day events from Google Events for each city.\n",
"\n",
"- Notebook Logic:\n",
" 1. For each city\n",
" - Get a list of the events \n",
" - Append this to an overall list\n",
" 2. Bulk insert the data into BigQuery\n",
" 3. Show the results"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2FHvD3ffG52M"
},
"source": [
"## Initialize Python"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "h-3IL5aFP6Dl"
},
"outputs": [],
"source": [
"# You will need to create an account and verify your email.\n",
"# https://serpapi.com/\n",
"# You get 100 free calls (per month)\n",
"\n",
"serpapi_key = \"<<Get your own key>>\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Y8loQuV0Xu10"
},
"outputs": [],
"source": [
"# We will generate events for each city for the number of days. Since we have 4 cites and 100 free calls, you need to do at most 25 days.\n",
"# date:today - Today's Events\n",
"# date:tomorrow - Tomorrow's Events\n",
"htichips = \"date:today\"\n",
"\n",
"# No need to set these\n",
"city_names=[\"New York City\", \"London\", \"Tokyo\", \"San Francisco\"]\n",
"city_ids=[1,2,3,4]\n",
"city_timezones=[\"US/Eastern\",\"Europe/London\",\"Asia/Tokyo\",\"US/Pacific\"]\n",
"number_of_coffee_trucks = \"5\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2EfhBUenO8OC"
},
"outputs": [],
"source": [
"import sys\n",
"!{sys.executable} -m pip install google-search-results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HTNQKUNaRsyO"
},
"outputs": [],
"source": [
"from google.cloud import bigquery\n",
"\n",
"client = bigquery.Client()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KOXpog83RlPG"
},
"source": [
"## Create the GenAI Insights table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jrWuWfUCSZUv"
},
"outputs": [],
"source": [
"%%bigquery\n",
"CREATE TABLE IF NOT EXISTS `${project_id}.${bigquery_data_beans_curated_dataset}.event`\n",
"--CREATE OR REPLACE TABLE `${project_id}.${bigquery_data_beans_curated_dataset}.event` -- only use this for starting over\n",
"(\n",
" event_id INT NOT NULL OPTIONS(description=\"The primary key of the event.\"),\n",
" city_id INT NOT NULL OPTIONS(description=\"The foreign key of the city.\"),\n",
" event_title STRING NOT NULL OPTIONS(description=\"The title of the event.\"),\n",
" event_date DATE NOT NULL OPTIONS(description=\"The date of the event.\"),\n",
" event_time_string STRING NOT NULL OPTIONS(description=\"The time (string value) of the event.\"),\n",
" event_venue STRING NOT NULL OPTIONS(description=\"The venue of the event.\"),\n",
" event_venue_link STRING NOT NULL OPTIONS(description=\"The generated insight in text\"),\n",
" event_address STRING NOT NULL OPTIONS(description=\"The full address of the event.\"),\n",
" event_description STRING NOT NULL OPTIONS(description=\"The description of the event.\"),\n",
" event_reviews INT64 NOT NULL OPTIONS(description=\"The number of reviews of the event.\"),\n",
" event_thumbnail STRING NOT NULL OPTIONS(description=\"The thumbnail image for the event.\")\n",
")\n",
"CLUSTER BY event_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Supporting Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GZ18slUFRpjG"
},
"outputs": [],
"source": [
"def RunQuery(sql):\n",
" import time\n",
"\n",
" if (sql.startswith(\"SELECT\")):\n",
" df_result = client.query(sql).to_dataframe()\n",
" return df_result\n",
" else:\n",
" job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n",
" query_job = client.query(sql, job_config=job_config)\n",
"\n",
" # Check on the progress by getting the job's updated state.\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n",
"\n",
" while query_job.state != \"DONE\":\n",
" time.sleep(2)\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n",
"\n",
" if query_job.error_result == None:\n",
" return True\n",
" else:\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qNUW0MFiUOFy"
},
"outputs": [],
"source": [
"def GetNextPrimaryKey(fully_qualified_table_name, field_name):\n",
" sql = f\"\"\"\n",
" SELECT IFNULL(MAX({field_name}),0) AS result\n",
" FROM `{fully_qualified_table_name}`\n",
" \"\"\"\n",
" # print(sql)\n",
" df_result = client.query(sql).to_dataframe()\n",
" # display(df_result)\n",
" return df_result['result'].iloc[0] + 1"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZrQ_gOebR2vX"
},
"source": [
"## Get the list of Events"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can use a 3rd party service to download the events\n",
"- https://serpapi.com/google-events-api\n",
"- https://serpapi.com/blog/scrape-google-events-results-with-python/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kuR_DSxPO0CR"
},
"outputs": [],
"source": [
"from serpapi import GoogleSearch\n",
"from datetime import date\n",
"\n",
"event_id = GetNextPrimaryKey(\"${project_id}.${bigquery_data_beans_curated_dataset}.event\",\"event_id\")\n",
"print(f\"event_id: {event_id}\")\n",
"\n",
"event_date = date.today()\n",
"event_records = []\n",
"\n",
"# Loop for each city\n",
"for city_index in range(0, 4):\n",
" print(f\"City: {city_ids[city_index]}\")\n",
"\n",
" google_events_params = {\n",
" \"engine\": \"google_events\",\n",
" \"q\": f\"Events in {city_names[city_index]}\",\n",
" \"hl\": \"en\",\n",
" \"gl\": \"us\",\n",
" \"api_key\": f\"{serpapi_key}\",\n",
" 'start': 0,\n",
" \"htichips\" : f\"{htichips}\"\n",
" }\n",
"\n",
" while True:\n",
" search = GoogleSearch(google_events_params)\n",
" event_search_results = search.get_dict()\n",
" if 'error' in event_search_results:\n",
" break\n",
"\n",
" for item in event_search_results[\"events_results\"]:\n",
" try:\n",
" event_venue = \"\"\n",
" address = \"\"\n",
" for address_item in item[\"address\"]:\n",
" if event_venue == \"\":\n",
" split_text = address_item.split(\", \")\n",
" event_venue = split_text[0]\n",
" address = split_text[1]\n",
" else:\n",
" address = address + \" \" + address_item\n",
"\n",
" #print(f\"item: {item}\")\n",
" #print(f\"event_venue: {event_venue}\")\n",
" #print(f\"address: {address}\")\n",
" #print(\"------------------------------------------------------\")\n",
"\n",
" event = {\n",
" \"event_id\" : event_id,\n",
" \"city_id\" : city_ids[city_index],\n",
" \"event_title\" : item['title'],\n",
" \"event_date\" : event_date,\n",
" \"event_time_string\" : item['date']['when'],\n",
" \"event_venue\" : event_venue,\n",
" \"event_venue_link\" : item['venue']['link'],\n",
" \"event_address\" : address,\n",
" \"event_description\" : item['description'],\n",
" \"event_reviews\" : item['venue']['reviews'],\n",
" \"event_thumbnail\" : item['thumbnail']\n",
" }\n",
" event_id = event_id + 1\n",
"\n",
" event_records.append(event)\n",
"\n",
" except Exception as error:\n",
" print(f\"error: {error}\")\n",
" print(f\"item: {item}\")\n",
" print(f\"------------------------------\")\n",
"\n",
" google_events_params['start'] += 10\n",
"\n",
" if google_events_params['start'] > 30:\n",
" break # only do 30 for now so our prompt is not too large"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xUBsOC6BwuXL"
},
"outputs": [],
"source": [
"event_records"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fmvU3Lv9R6TO"
},
"source": [
"## Insert the events (in bulk)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sFuLk875qfPk"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- remove the current days events (in case you re-run this notebook)\n",
"DELETE\n",
" FROM `${project_id}.${bigquery_data_beans_curated_dataset}.event` AS event\n",
" WHERE event_date = CURRENT_DATE();"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "A2EA2U8feZjh"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Load the events table (in bulk)\n",
"table_id = \"${project_id}.${bigquery_data_beans_curated_dataset}.event\"\n",
"\n",
"dataframe = pd.DataFrame(\n",
" pd.DataFrame(event_records), # Your source data\n",
" columns=[\n",
" \"event_id\",\n",
" \"city_id\",\n",
" \"event_title\",\n",
" \"event_date\",\n",
" \"event_time_string\",\n",
" \"event_venue\",\n",
" \"event_venue_link\",\n",
" \"event_address\",\n",
" \"event_description\",\n",
" \"event_reviews\",\n",
" \"event_thumbnail\"\n",
" ],\n",
")\n",
"\n",
"job_config = bigquery.LoadJobConfig(\n",
" schema=[\n",
" bigquery.SchemaField(\"event_id\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"city_id\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_title\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_date\", bigquery.enums.SqlTypeNames.DATE, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_time_string\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_venue\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_venue_link\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_address\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_description\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_reviews\", bigquery.enums.SqlTypeNames.INT64, mode=\"REQUIRED\"),\n",
" bigquery.SchemaField(\"event_thumbnail\", bigquery.enums.SqlTypeNames.STRING, mode=\"REQUIRED\")\n",
" ],\n",
" write_disposition=\"WRITE_APPEND\",\n",
")\n",
"\n",
"job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config)\n",
"job.result() # Wait for the job to complete.\n",
"\n",
"table = client.get_table(table_id) # Make an API request.\n",
"print(\"Loaded {} rows and {} columns to {}\".format(table.num_rows, len(table.schema), table_id))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PampTfgdRfqe"
},
"source": [
"## See the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RCCogPjFSPHO"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"# Display the results\n",
"SELECT *\n",
" FROM `${project_id}.${bigquery_data_beans_curated_dataset}.event`\n",
"WHERE event_date = CURRENT_DATE()\n",
"ORDER BY event_id"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"ZrQ_gOebR2vX"
],
"name": "BigQuery table",
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}