genai-for-marketing/notebooks/1_environment_setup.ipynb (303 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2023 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Environment Setup"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the following python packages to setup the environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install -U google-cloud-datacatalog\n",
"! pip install -U google-cloud-storage\n",
"! pip install -U google-cloud-bigquery\n",
"! pip install -U numpy"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Specify your project ID in the next cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PROJECT_ID = '<YOUR PROJECT ID>' # Change to your project ID\n",
"LOCATION = 'us-central1'\n",
"DATASET_ID = 'cdp_dataset'\n",
"\n",
"# Tag template \n",
"TAG_TEMPLATE_ID = 'llmcdptemplate'\n",
"TAG_TEMPLATE_PATH = f\"projects/{PROJECT_ID}/locations/{LOCATION}/tagTemplates/{TAG_TEMPLATE_ID}\"\n",
"\n",
"# Set the project id\n",
"! gcloud config set project {PROJECT_ID}"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### BigQuery: Create dataset"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a BigQuery dataset to upload the CDP data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create BigQuery Dataset talktodata on your project\n",
"from google.cloud import bigquery\n",
"from google.cloud import datacatalog_v1\n",
"\n",
"bq_client = bigquery.Client(project=PROJECT_ID)\n",
"datacatalog_client = datacatalog_v1.DataCatalogClient()\n",
"\n",
"dataset_id = \"{}.{}\".format(bq_client.project, DATASET_ID)\n",
"dataset = bigquery.Dataset(dataset_id)\n",
"dataset.location = \"US\"\n",
"\n",
"# Create the dataset\n",
"try:\n",
" dataset = bq_client.create_dataset(dataset, timeout=30)\n",
" print(f'Dataset {DATASET_ID} create successfully.')\n",
"except Exception as e:\n",
" print(e)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### BigQuery: Create tables and populate with data\n",
"\n",
"The next cell will generate the synthetic data for the tables and load to BigQuery.\n",
"\n",
"> This process will take approximately 1 minute and 30 seconds.\n",
"\n",
"If this process fails, try to recreate the dataset with the cell above and regenerate the data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from aux_data import data_gen\n",
"\n",
"data_gen.generate_and_populate_dataset(\n",
" PROJECT_ID=PROJECT_ID,\n",
" DATASET_ID=DATASET_ID\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup Data Catalog"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell bellow will execute the following steps:\n",
"\n",
"1) Specify a query to retrieve the metadata from the tables you just uploaded;\n",
"2) Create a TagTemplate on Google Dataplex that specifies how the table will be tagged with medatada;\n",
"3) Tag all the tables you created on BigQuery."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from aux_data import bq_tag_generation\n",
"\n",
"bq_tag_generation.create_template_and_tag_bq(\n",
" PROJECT_ID,\n",
" DATASET_ID,\n",
" TAG_TEMPLATE_ID,\n",
" LOCATION\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Quick test"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Test the integration by retrieving the metadata from BigQuery tables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"QUERY = f'SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.TABLES` WHERE table_name NOT LIKE \"%metadata%\"'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_tags_from_table(table_id):\n",
" # Lookup Data Catalog's Entry referring to the table.\n",
" resource_name = (\n",
" f\"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{DATASET_ID}/tables/{table_id}\"\n",
" )\n",
" table_entry = datacatalog_client.lookup_entry(\n",
" request={\"linked_resource\": resource_name}\n",
" )\n",
"\n",
" # Make the request\n",
" page_result = datacatalog_client.list_tags(parent=table_entry.name)\n",
" # print(page_result)\n",
"\n",
" tags_str = ''\n",
"\n",
" # Handle the response\n",
" for response in page_result:\n",
" if response.template == TAG_TEMPLATE_PATH:\n",
" desc = response.fields[\"description\"].string_value\n",
" data_type = response.fields[\"data_type\"].string_value\n",
" pk = response.fields[\"is_primary_key\"].bool_value\n",
" fk = response.fields[\"is_foreign_key\"].bool_value \n",
" tags_str += (\"Table: {} \"\n",
" \"- Column: {} \" \n",
" \"- Data Type: {} \" \n",
" \"- Primary Key: {} \" \n",
" \"- Foreing Key: {} \" \n",
" \"- Description: {}\\n\".format(\n",
" table_id, response.column, data_type, pk, fk, desc))\n",
" return tags_str"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_metadata_from_dataset(\n",
" query: str\n",
"):\n",
" # print(\"Gets the metadata once\")\n",
" query_job = bq_client.query(query) # API request\n",
" rows = query_job.result()\n",
" metadata = []\n",
"\n",
" for row in rows:\n",
" table_metadata = {}\n",
" table_metadata['ddl'] = row.ddl\n",
" table_metadata['description'] = get_tags_from_table(row.table_name)\n",
" metadata.append(table_metadata)\n",
" \n",
" return metadata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tags = get_metadata_from_dataset(QUERY)\n",
"for i in tags:\n",
" print(i['description'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm-dev-py311",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}