genai-for-marketing/notebooks/1_environment_setup.ipynb (303 lines of code) (raw):

{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Copyright 2023 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", "# You may obtain a copy of the License at\n", "#\n", "# https://www.apache.org/licenses/LICENSE-2.0\n", "#\n", "# Unless required by applicable law or agreed to in writing, software\n", "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Environment Setup" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Install the following python packages to setup the environment." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install -U google-cloud-datacatalog\n", "! pip install -U google-cloud-storage\n", "! pip install -U google-cloud-bigquery\n", "! pip install -U numpy" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Specify your project ID in the next cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "PROJECT_ID = '<YOUR PROJECT ID>' # Change to your project ID\n", "LOCATION = 'us-central1'\n", "DATASET_ID = 'cdp_dataset'\n", "\n", "# Tag template \n", "TAG_TEMPLATE_ID = 'llmcdptemplate'\n", "TAG_TEMPLATE_PATH = f\"projects/{PROJECT_ID}/locations/{LOCATION}/tagTemplates/{TAG_TEMPLATE_ID}\"\n", "\n", "# Set the project id\n", "! gcloud config set project {PROJECT_ID}" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### BigQuery: Create dataset" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Create a BigQuery dataset to upload the CDP data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create BigQuery Dataset talktodata on your project\n", "from google.cloud import bigquery\n", "from google.cloud import datacatalog_v1\n", "\n", "bq_client = bigquery.Client(project=PROJECT_ID)\n", "datacatalog_client = datacatalog_v1.DataCatalogClient()\n", "\n", "dataset_id = \"{}.{}\".format(bq_client.project, DATASET_ID)\n", "dataset = bigquery.Dataset(dataset_id)\n", "dataset.location = \"US\"\n", "\n", "# Create the dataset\n", "try:\n", " dataset = bq_client.create_dataset(dataset, timeout=30)\n", " print(f'Dataset {DATASET_ID} create successfully.')\n", "except Exception as e:\n", " print(e)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### BigQuery: Create tables and populate with data\n", "\n", "The next cell will generate the synthetic data for the tables and load to BigQuery.\n", "\n", "> This process will take approximately 1 minute and 30 seconds.\n", "\n", "If this process fails, try to recreate the dataset with the cell above and regenerate the data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from aux_data import data_gen\n", "\n", "data_gen.generate_and_populate_dataset(\n", " PROJECT_ID=PROJECT_ID,\n", " DATASET_ID=DATASET_ID\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Setup Data Catalog" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "The cell bellow will execute the following steps:\n", "\n", "1) Specify a query to retrieve the metadata from the tables you just uploaded;\n", "2) Create a TagTemplate on Google Dataplex that specifies how the table will be tagged with medatada;\n", "3) Tag all the tables you created on BigQuery." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from aux_data import bq_tag_generation\n", "\n", "bq_tag_generation.create_template_and_tag_bq(\n", " PROJECT_ID,\n", " DATASET_ID,\n", " TAG_TEMPLATE_ID,\n", " LOCATION\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Quick test" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Test the integration by retrieving the metadata from BigQuery tables." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "QUERY = f'SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.TABLES` WHERE table_name NOT LIKE \"%metadata%\"'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_tags_from_table(table_id):\n", " # Lookup Data Catalog's Entry referring to the table.\n", " resource_name = (\n", " f\"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{DATASET_ID}/tables/{table_id}\"\n", " )\n", " table_entry = datacatalog_client.lookup_entry(\n", " request={\"linked_resource\": resource_name}\n", " )\n", "\n", " # Make the request\n", " page_result = datacatalog_client.list_tags(parent=table_entry.name)\n", " # print(page_result)\n", "\n", " tags_str = ''\n", "\n", " # Handle the response\n", " for response in page_result:\n", " if response.template == TAG_TEMPLATE_PATH:\n", " desc = response.fields[\"description\"].string_value\n", " data_type = response.fields[\"data_type\"].string_value\n", " pk = response.fields[\"is_primary_key\"].bool_value\n", " fk = response.fields[\"is_foreign_key\"].bool_value \n", " tags_str += (\"Table: {} \"\n", " \"- Column: {} \" \n", " \"- Data Type: {} \" \n", " \"- Primary Key: {} \" \n", " \"- Foreing Key: {} \" \n", " \"- Description: {}\\n\".format(\n", " table_id, response.column, data_type, pk, fk, desc))\n", " return tags_str" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_metadata_from_dataset(\n", " query: str\n", "):\n", " # print(\"Gets the metadata once\")\n", " query_job = bq_client.query(query) # API request\n", " rows = query_job.result()\n", " metadata = []\n", "\n", " for row in rows:\n", " table_metadata = {}\n", " table_metadata['ddl'] = row.ddl\n", " table_metadata['description'] = get_tags_from_table(row.table_name)\n", " metadata.append(table_metadata)\n", " \n", " return metadata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tags = get_metadata_from_dataset(QUERY)\n", "for i in tags:\n", " print(i['description'])" ] } ], "metadata": { "kernelspec": { "display_name": "llm-dev-py311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }