sql-scripts/taxi_dataset/sp_create_taxi_external_tables.sql (9 lines of code) (raw):

/*################################################################################## # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################################*/ /* Use Cases: - External tables are great for querying data on cloud storage (files, Hive, etc.) - External tables make loading data easy Description: - This will create external tables over "Hive" partitioned tables on storage. - Customers with Hadoop/Spark/Hive ecosystem can query their data in place using BigQuery. Show: - BQ supports Avro, Csv (any delimiated, tab, pipes, etc.), Google Sheets, Json, Orc, Parquet - No need to setup special security with data sources and storage keys, etc. Seemless security. References: - https://cloud.google.com/bigquery/docs/hive-partitioned-queries-gcs Clean up / Reset script: DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_green_trips_parquet`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_csv`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_json`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_parquet`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_vendor`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_rate_code`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_payment_type`; DROP EXTERNAL TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.ext_trip_type`; */ CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_green_trips_parquet` WITH PARTITION COLUMNS ( year INTEGER, -- column order must match the external path month INTEGER ) OPTIONS ( format = "PARQUET", hive_partition_uri_prefix = "gs://${bucket_name}/processed/taxi-data/green/trips_table/parquet/", uris = ['gs://${bucket_name}/processed/taxi-data/green/trips_table/parquet/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_csv` ( Vendor_Id INTEGER, Pickup_DateTime TIMESTAMP, Dropoff_DateTime TIMESTAMP, Passenger_Count INTEGER, Trip_Distance NUMERIC, Rate_Code_Id INTEGER, Store_And_Forward STRING, PULocationID INTEGER, DOLocationID INTEGER, Payment_Type_Id INTEGER, Fare_Amount NUMERIC, Surcharge NUMERIC, MTA_Tax NUMERIC, Tip_Amount NUMERIC, Tolls_Amount NUMERIC, Improvement_Surcharge NUMERIC, Total_Amount NUMERIC, Congestion_Surcharge NUMERIC ) WITH PARTITION COLUMNS ( -- column order must match the external path year INTEGER, month INTEGER ) OPTIONS ( format = "CSV", field_delimiter = ',', skip_leading_rows = 1, hive_partition_uri_prefix = "gs://${bucket_name}/processed/taxi-data/yellow/trips_table/csv/", uris = ['gs://${bucket_name}/processed/taxi-data/yellow/trips_table/csv/*.csv'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_json` ( Vendor_Id INTEGER, Pickup_DateTime TIMESTAMP, Dropoff_DateTime TIMESTAMP, Passenger_Count INTEGER, Trip_Distance NUMERIC, Rate_Code_Id INTEGER, Store_And_Forward STRING, PULocationID INTEGER, DOLocationID INTEGER, Payment_Type_Id INTEGER, Fare_Amount NUMERIC, Surcharge NUMERIC, MTA_Tax NUMERIC, Tip_Amount NUMERIC, Tolls_Amount NUMERIC, Improvement_Surcharge NUMERIC, Total_Amount NUMERIC, Congestion_Surcharge NUMERIC ) WITH PARTITION COLUMNS ( -- column order must match the external path year INTEGER, month INTEGER ) OPTIONS ( format = "JSON", hive_partition_uri_prefix = "gs://${bucket_name}/processed/taxi-data/yellow/trips_table/json/", uris = ['gs://${bucket_name}/processed/taxi-data/yellow/trips_table/json/*.json'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_parquet` WITH PARTITION COLUMNS ( year INTEGER, -- column order must match the external path month INTEGER ) OPTIONS ( format = "PARQUET", hive_partition_uri_prefix = "gs://${bucket_name}/processed/taxi-data/yellow/trips_table/parquet/", uris = ['gs://${bucket_name}/processed/taxi-data/yellow/trips_table/parquet/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_vendor` OPTIONS ( format = "PARQUET", uris = ['gs://${bucket_name}/processed/taxi-data/vendor_table/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_rate_code` OPTIONS ( format = "PARQUET", uris = ['gs://${bucket_name}/processed/taxi-data/rate_code_table/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_payment_type` OPTIONS ( format = "PARQUET", uris = ['gs://${bucket_name}/processed/taxi-data/payment_type_table/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_trip_type` OPTIONS ( format = "PARQUET", uris = ['gs://${bucket_name}/processed/taxi-data/trip_type_table/*.parquet'] ); CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_taxi_dataset}.ext_location` OPTIONS ( format = "PARQUET", uris = ['gs://${bucket_name}/processed/taxi-data/location/*.parquet'] ); -- Query External Tables SELECT * FROM `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_csv` WHERE year=2020 AND month=1 LIMIT 100; SELECT * FROM `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_json` WHERE year=2020 AND month=1 LIMIT 100; SELECT * FROM `${project_id}.${bigquery_taxi_dataset}.ext_yellow_trips_parquet` WHERE year=2020 AND month=1 LIMIT 100;