-- bigquery-snowflake-iceberg.sql
-- Step 1:
-- Create a bucket to hold your BigLake Managed Table
-- Open: https://console.cloud.google.com/storage/browser
-- Click the Create Bucket button
-- Enter your bucket name: iceberg-sharing-bigquery (you can choose a different name)
-- Click Next: Use Region: us-central1
-- Click Create at the bottom
-- Step 2:
-- Navigate to BigQuery
-- Open: https://console.cloud.google.com/bigquery
-- Click the Add button
-- Select "Connections to external data sources"
-- Select "Vertex AI remote models, remote functions and BigLake (Cloud Resource)"
-- Enter a name: iceberg-connection-bigquery (use this for the friendly name and description)
-- Step 3:
-- Expand your project in the left hand panel
-- Expand external connections
-- Double click on us-central1.iceberg-connection-bigquery
-- Copy the service account id: e.g. bqcx-xxxxxxxxxxxx-s3rf@gcp-sa-bigquery-condel.iam.gserviceaccount.com
-- Step 4:
-- Open the bucket you created
-- Open: https://console.cloud.google.com/storage/browser
-- Click on: iceberg-sharing-bigquery (or whatever you named it)
-- Click on Permissions
-- Click Grant Access
-- Paste in the service account name
-- For the role select Cloud Storage | Storage Object Admin (we want Admin since BigQuery needs read/write access)
-- Click Save
-- Step 5:
-- Navigate to BigQuery
-- Open: https://console.cloud.google.com/bigquery
-- Open a query window:
-- Run this to create a dataset
CREATE SCHEMA IF NOT EXISTS iceberg_dataset OPTIONS(location = 'us-central1');
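-- (Optional, added check) Confirm the dataset exists and is in us-central1
SELECT schema_name, location
FROM `region-us-central1`.INFORMATION_SCHEMA.SCHEMATA
WHERE schema_name = 'iceberg_dataset';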
-- Step 6:
-- Run this to create a BigLake managed table (change the bucket name below if you used a different one)
CREATE OR REPLACE TABLE `iceberg_dataset.driver`
(
    driver_id                 INT64,
    driver_name               STRING,
    driver_mobile_number      STRING,
    driver_license_number     STRING,
    driver_email_address      STRING,
    driver_dob                DATE,
    driver_ach_routing_number STRING,
    driver_ach_account_number STRING
)
CLUSTER BY driver_id
WITH CONNECTION `us-central1.iceberg-connection-bigquery`
OPTIONS (
    file_format = 'PARQUET',
    table_format = 'ICEBERG',
    storage_uri = 'gs://iceberg-sharing-bigquery/driver'
);
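-- (Optional, added check) Confirm the Iceberg options (file format, table format, storage URI) on the new table
SELECT table_name, option_name, option_value
FROM iceberg_dataset.INFORMATION_SCHEMA.TABLE_OPTIONS
WHERE table_name = 'driver';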
-- Step 7:
-- Load the table with some data
LOAD DATA INTO `iceberg_dataset.driver`
FROM FILES (
format = 'parquet',
uris = ['gs://data-analytics-golden-demo/biglake/v1-source/managed-table-source/driver/*.parquet']);
-- View the data
SELECT * FROM `iceberg_dataset.driver` LIMIT 1000;
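-- (Optional, added check) Capture a row count to compare with the count returned by Snowflake in Step 23
SELECT COUNT(*) FROM `iceberg_dataset.driver`;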
-- Step 8:
-- View the bucket
-- Open: https://console.cloud.google.com/storage/browser and click on your bucket
-- You should see a folder called "driver"
-- You can navigate to the "metadata" folder
-- You will probably see the file "v0.metadata.json"
-- To export the most recent metadata run this in BigQuery (a separate query window is preferred)
EXPORT TABLE METADATA FROM iceberg_dataset.driver;
-- In your storage window
-- Press the Refresh button (top right)
-- You will now see more files in the metadata folder
-- Step 9:
-- Let's connect the data to Snowflake
-- Login to Snowflake
-- Click the Create button on the top left
-- Select SQL Worksheet
-- Run this command to switch to the ACCOUNTADMIN role, since some of the commands below require it
USE ROLE accountadmin;
-- Step 10:
-- Create a warehouse to hold the data
CREATE OR REPLACE WAREHOUSE BIGQUERY_WAREHOUSE WITH WAREHOUSE_SIZE='XSMALL';
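-- (Optional) CREATE WAREHOUSE normally sets the new warehouse as the session's current one;
-- selecting it explicitly is just an added safeguard
USE WAREHOUSE BIGQUERY_WAREHOUSE;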
-- Step 11:
-- Create a database in Snowflake
CREATE OR REPLACE DATABASE BIGQUERY_DATABASE;
-- Step 12:
-- Select the database
USE DATABASE BIGQUERY_DATABASE;
-- Step 13:
-- Create the schema to hold the table
CREATE SCHEMA IF NOT EXISTS BIGQUERY_SCHEMA;
-- Step 14:
-- Select the schema
USE SCHEMA BIGQUERY_SCHEMA;
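-- (Optional, added check) Confirm the session context before creating the integration objects
SELECT CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA();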
-- Step 15:
-- Create our GCS storage integration. This will create a link between Snowflake and GCS.
-- A service account will be created by Snowflake, and we will grant it access to our GCS bucket.
-- https://docs.snowflake.com/en/sql-reference/sql/create-storage-integration
-- Change the bucket name below if you used a different one
CREATE STORAGE INTEGRATION gcs_storage_integration
TYPE = EXTERNAL_STAGE
STORAGE_PROVIDER = 'GCS'
ENABLED = TRUE
STORAGE_ALLOWED_LOCATIONS = ('gcs://iceberg-sharing-bigquery/');
-- Step 16:
-- Get the service account that we will grant read access to our GCS bucket (via the custom role in Steps 17-18)
DESC STORAGE INTEGRATION gcs_storage_integration;
-- Copy the STORAGE_GCP_SERVICE_ACCOUNT
-- e.g. xxxxxxxxx@gcpuscentral1-1dfa.iam.gserviceaccount.com
-- Step 17:
-- https://docs.snowflake.com/en/user-guide/tables-iceberg-configure-external-volume-gcs
-- Create a custom IAM role
-- Create a custom role that has the permissions required to access the bucket and get objects.
-- Open: https://console.cloud.google.com/iam-admin/roles
-- Select Create Role.
-- Enter a Title and optional Description for the custom role. [e.g. Snowflake Reader]
-- Select Add Permissions.
-- In Filter, select Service and then select storage.
-- Filter the list of permissions, and add the following from the list:
-- storage.buckets.get
-- storage.objects.get
-- storage.objects.list
-- Select Add.
-- Select Create.
-- Step 18:
-- Open the bucket you created
-- Open: https://console.cloud.google.com/storage/browser
-- Click on: iceberg-sharing-bigquery (or whatever you named it)
-- Click on Permissions
-- Click Grant Access
-- Paste in the service account name (from Snowflake)
-- For the role select Custom | Snowflake Reader
-- Click Save
-- Step 19:
-- Create an external volume on GCS
-- https://docs.snowflake.com/en/sql-reference/sql/create-external-volume
CREATE OR REPLACE EXTERNAL VOLUME gcs_volume
    STORAGE_LOCATIONS =
    (
        (
            NAME = 'gcs_volume'
            STORAGE_PROVIDER = 'GCS'
            STORAGE_BASE_URL = 'gcs://iceberg-sharing-bigquery/'
        )
    );
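-- (Optional, added check) Verify the volume definition and that Snowflake can access the bucket
-- (SYSTEM$VERIFY_EXTERNAL_VOLUME is assumed to be available on your account)
DESC EXTERNAL VOLUME gcs_volume;
SELECT SYSTEM$VERIFY_EXTERNAL_VOLUME('gcs_volume');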
-- Step 20:
-- Create a catalog integration to manage Iceberg tables in the external volume
CREATE OR REPLACE CATALOG INTEGRATION catalog_integration
CATALOG_SOURCE=OBJECT_STORE -- Indicates that the catalog is backed by an object store
TABLE_FORMAT=ICEBERG -- Specifies the table format as Iceberg
ENABLED=TRUE; -- Enables the catalog integration
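-- (Optional, added check) Confirm the catalog integration was created
DESC CATALOG INTEGRATION catalog_integration;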
-- Step 21:
-- Create the Iceberg table, pointing to the existing metadata file
-- To get the "latest" metadata file
-- Open your bucket and navigate to the driver/metadata folder
-- Open version-hint.text; it contains a version number
-- Replace the number below (the digits in v1741294463, but keep the "v")
-- NOTE: Due to some optional metadata parameters set/required by V1 of the Iceberg spec, Snowflake needs
-- to ignore some metadata. To do this, please ask Snowflake to enable this on your account.
-- This should not be an issue when Iceberg V2 is used.
-- JIRA for instructions to set the metadata flag on the Snowflake account - SNOW-1624657.
CREATE OR REPLACE ICEBERG TABLE driver
CATALOG='catalog_integration' -- The catalog where the table will reside
EXTERNAL_VOLUME='gcs_volume' -- The external volume for table data
BASE_LOCATION='' -- Optional: Subdirectory within the storage location
METADATA_FILE_PATH='driver/metadata/v1741294463.metadata.json'; -- Path to the existing metadata file
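-- (Optional, added check) Show which metadata file Snowflake is using for this table
-- (SYSTEM$GET_ICEBERG_TABLE_INFORMATION is assumed to be available on your account)
SELECT SYSTEM$GET_ICEBERG_TABLE_INFORMATION('driver');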
-- Step 22:
-- This will show the table just created
SHOW TABLES;
-- Step 23:
-- Query the Iceberg table
SELECT * FROM driver;
SELECT COUNT(*) FROM driver;
-- Now that you are linked, you can try the following
-- Insert a record into the BigQuery table:
INSERT INTO `iceberg_dataset.driver`
(driver_id, driver_name, driver_mobile_number, driver_license_number, driver_email_address,
driver_dob, driver_ach_routing_number, driver_ach_account_number)
VALUES (0, 'New Driver', 'xxx-xxx-xxxx', 'driver_license_number', 'driver_email_address',
CURRENT_DATE(), 'driver_ach_routing_number','driver_ach_account_number');
-- Now query the record in Snowflake
SELECT * FROM driver WHERE driver_id = 0;
-- You will not see the new record
-- You first need to tell BigQuery to export the latest metadata (Step 8).
-- Then update the metadata file used by Snowflake (Step 21) so it points to the latest JSON file, as sketched below
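-- A minimal sketch of that refresh cycle, assuming ALTER ICEBERG TABLE ... REFRESH is available for
-- externally managed Iceberg tables on your account; "v9999" is a placeholder for the number
-- currently in version-hint.text
EXPORT TABLE METADATA FROM iceberg_dataset.driver;                          -- run in BigQuery
ALTER ICEBERG TABLE driver REFRESH 'driver/metadata/v9999.metadata.json';   -- run in Snowflake
SELECT * FROM driver WHERE driver_id = 0;                                    -- the new record should now appear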