data-analytics-demos/bigquery-data-governance/sql-scripts/governed_data_raw/initialize.sql (66 lines of code) (raw):
/*##################################################################################
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Author: Adam Paternostro
Use Cases:
- Initializes the system (you can re-run this)
Description:
- Loads all tables from the public storage account
- Uses AVRO so we can bring in JSON and GEO types
References:
-
Clean up / Reset script:
- n/a
*/
------------------------------------------------------------------------------------------------------------
-- Create GenAI / Vertex AI connections
------------------------------------------------------------------------------------------------------------
CREATE MODEL IF NOT EXISTS `${project_id}.${bigquery_governed_data_raw_dataset}.gemini_model`
REMOTE WITH CONNECTION `${project_id}.${multi_region}.vertex-ai`
OPTIONS (endpoint = 'gemini-2.0-flash');
CREATE MODEL IF NOT EXISTS `${project_id}.${bigquery_governed_data_raw_dataset}.textembedding_model`
REMOTE WITH CONNECTION `${project_id}.${multi_region}.vertex-ai`
OPTIONS (endpoint = 'text-embedding-005');
------------------------------------------------------------------------------------------------------------
-- Load all data
------------------------------------------------------------------------------------------------------------
LOAD DATA OVERWRITE `${project_id}.${bigquery_governed_data_raw_dataset}.customer`
FROM FILES (format = 'AVRO', enable_logical_types = true, uris = ['gs://${governed_data_raw_bucket}/customer/customer.avro']);
LOAD DATA OVERWRITE `${project_id}.${bigquery_governed_data_raw_dataset}.customer_transaction`
FROM FILES (format = 'PARQUET', uris = ['gs://${governed_data_raw_bucket}/customer_transaction/customer_transaction.parquet']);
LOAD DATA OVERWRITE `${project_id}.${bigquery_governed_data_raw_dataset}.product`
FROM FILES (format = 'JSON', uris = ['gs://${governed_data_raw_bucket}/product/product.json']);
LOAD DATA OVERWRITE `${project_id}.${bigquery_governed_data_raw_dataset}.product_category`
(
product_category STRING,
description STRING
)
FROM FILES (format = 'CSV', skip_leading_rows = 0, uris = ['gs://${governed_data_raw_bucket}/product_category/product_category.csv']);
------------------------------------------------------------------------------------------------------------
-- Set the table descriptions for each table:
------------------------------------------------------------------------------------------------------------
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.customer`
SET OPTIONS (
description = 'Table containing customer raw information.'
);
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.customer`
ALTER COLUMN customer_id SET OPTIONS (description='Unique identifier for the customer.'),
ALTER COLUMN first_name SET OPTIONS (description='The first name of the customer.'),
ALTER COLUMN last_name SET OPTIONS (description='The last name of the customer.'),
ALTER COLUMN email SET OPTIONS (description='The email address of the customer.'),
ALTER COLUMN phone SET OPTIONS (description='The phone number of the customer.'),
ALTER COLUMN gender SET OPTIONS (description='The gender of the customer.'),
ALTER COLUMN ip_address SET OPTIONS (description='The IP address of the customer.'),
ALTER COLUMN ssn SET OPTIONS (description='The Social Security Number of the customer.'),
ALTER COLUMN address SET OPTIONS (description='The street address of the customer.'),
ALTER COLUMN city SET OPTIONS (description='The city of the customer.'),
ALTER COLUMN state SET OPTIONS (description='The state of the customer.'),
ALTER COLUMN zip SET OPTIONS (description='The zip code of the customer.');
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.customer_transaction`
SET OPTIONS (
description = 'Table containing raw customer transaction details, with multiple rows per order. This table will be used to create order header and detail tables during an ETL process.'
);
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.customer_transaction`
ALTER COLUMN transaction_id SET OPTIONS (description='Unique identifier for the transaction.'),
ALTER COLUMN customer_id SET OPTIONS (description='The ID of the customer who made the transaction.'),
ALTER COLUMN order_date SET OPTIONS (description='The date the order was placed.'),
ALTER COLUMN order_time SET OPTIONS (description='The time the order was placed.'),
ALTER COLUMN transaction_type SET OPTIONS (description='The type of transaction (e.g., purchase, return).'),
ALTER COLUMN region SET OPTIONS (description='The region where the transaction occurred.'),
ALTER COLUMN quantity SET OPTIONS (description='The quantity of the product purchased.'),
ALTER COLUMN product SET OPTIONS (description='The name or identifier of the product purchased.'),
ALTER COLUMN product_category SET OPTIONS (description='The category of the product purchased.'),
ALTER COLUMN price SET OPTIONS (description='The price of the product.');
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.product`
SET OPTIONS (
description = 'Table containing raw product information.'
);
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.product`
ALTER COLUMN description SET OPTIONS (description='Description of the product.'),
ALTER COLUMN product SET OPTIONS (description='The name or identifier of the product.');
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.product_category`
SET OPTIONS (
description = 'Table containing raw product category information.'
);
ALTER TABLE `${project_id}.${bigquery_governed_data_raw_dataset}.product_category`
ALTER COLUMN product_category SET OPTIONS (description='The name or identifier of the product category.'),
ALTER COLUMN description SET OPTIONS (description='Description of the product category.');
------------------------------------------------------------------------------------------------------------
-- Run ELT processes in the enriched dataset
------------------------------------------------------------------------------------------------------------
CALL `${project_id}.${bigquery_governed_data_enriched_dataset}.initialize`();
------------------------------------------------------------------------------------------------------------
-- Run ELT processes in the curated dataset
------------------------------------------------------------------------------------------------------------
CALL `${project_id}.${bigquery_governed_data_curated_dataset}.initialize`();