sql-scripts/taxi_dataset/sp_demo_bigsearch.sql (40 lines of code) (raw):
/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Use Cases:
- Show BigSearch on 5.83 TB table with 5.2 billion rows
- Show the table with and without a BigSearch index
- NOTE: You will be consuming at lot of bytes scanned during this demo ($6.25 per 1 TB scanned)
- NOTE: Drop the Large tables to save on Costs! (at end of script)
Description:
- Shows searching a large table on various columns with various partitioning filters
Show:
-
References:
- https://cloud.google.com/bigquery/docs/search-intro
Clean up / Reset script:
DROP SEARCH INDEX IF EXISTS idx_all_bigsearch_log_5b_5t_json_hourly ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
More Data (BigSearch on more Data):
- NOTE: To run some SQL on a 60TB, 50 billon row table head over to the shared project
- Open a new tab and paste the URL below.
- https://console.cloud.google.com/bigquery?project=${shared_demo_project_id}
- OPEN the stored procedure: ${shared_demo_project_id}.bigquery_features.sp_demo_bigsearch_50b_60t
*/
---------------------------------------------------------------------------------------------------------
-- NOTE: You might want to copy the tables and create the indexs BEFORE your demo.
-- It can take 15 minutes for everything to complete!
---------------------------------------------------------------------------------------------------------
CREATE TABLE `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
COPY `${shared_demo_project_id}.data_analytics_shared_data.bigsearch_log_5b_5t_json_hourly`;
CREATE TABLE `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
COPY `${shared_demo_project_id}.data_analytics_shared_data.bigsearch_log_5b_5t_json_hourly`;
---------------------------------------------------------------------------------------------------------
-- Create an index on the "Indexed" table
---------------------------------------------------------------------------------------------------------
-- LOG_ANALYZER analyzer works well for machine generated logs and has special rules around tokens commonly found in observability data, such as IP addresses or emails.
CREATE SEARCH INDEX idx_all_bigsearch_log_5b_5t_json_hourly
ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` (ALL COLUMNS)
OPTIONS (analyzer = 'LOG_ANALYZER');
-- Check our index (index_status = "Active", "total storage bytes" = 117 GB)
-- See the unindexed_row_count (rows not indexed)
-- Make sure your coverage_percent = 100 (so we know we are done indexing)
SELECT * -- table_name, index_name, ddl, coverage_percentage
FROM `${project_id}.${bigquery_taxi_dataset}.INFORMATION_SCHEMA.SEARCH_INDEXES`
WHERE index_status = 'ACTIVE';
-- See the columns that are in the index
SELECT * -- table_name, index_name, ddl, coverage_percentage
FROM `${project_id}.${bigquery_taxi_dataset}.INFORMATION_SCHEMA.SEARCH_INDEX_COLUMNS`
WHERE index_name = 'idx_all_bigsearch_log_5b_5t_json_hourly';
---------------------------------------------------------------------------------------------------------
-- Compare the INDEX versus the NOT INDEX tables for performance
---------------------------------------------------------------------------------------------------------
-- NOT_INDEXED
-- Search the Text Payload field of the log table (lots of text to sort through)
-- Result: Cnt = 351,124
-- Duration: 8 sec
-- Bytes processed: 2.99 TB
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
WHERE SEARCH(textPayload, 'service281');
-- Search the Text Payload field of the log table (lots of text to sort through)
-- Result: Cnt = 351,124
-- Duration: 6 sec
-- Bytes processed: 2.99 TB
-- Index Usage Mode: FULLY_USED
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(textPayload, 'service281');
-- NOT_INDEXED
-- Search the field "insertId" for a value (just one column)
-- Duration: 5 sec
-- Bytes processed: 5.83 TB
-- Index Usage Mode: UNUSED
SELECT *
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
WHERE SEARCH(insertId, '40ms4zeh78tg6o6jx');
-- Search the field "insertId" for a value (just one column)
-- Duration: 0 sec
-- Bytes processed: 81.44 MB
-- Index Usage Mode: FULLY_USED
SELECT *
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(insertId, '40ms4zeh78tg6o6jx');
---------------------------------------------------------------------------------------------------------
-- Compare RegEx search versus BigSearch SEARCH
---------------------------------------------------------------------------------------------------------
-- Use RegEx
-- Duration: 5 sec
-- Bytes processed: 739.99 GB
-- Count: 26,504
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE REGEXP_CONTAINS(internalClusterId, r"(?i)(\b|_)gke-workload-test-gpu-pool-c358b5f5-2fll(\b|_)");
-- Use BQ Search
-- Duration: 2 sec
-- Bytes processed: 123.58 GB
-- Count: 26,504
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(internalClusterId, '`gke-workload-test-gpu-pool-c358b5f5-2fll`');
---------------------------------------------------------------------------------------------------------
-- Search ALL columns or on SEVERAL columns
---------------------------------------------------------------------------------------------------------
-- Search across all columns (has to search lots of data)
-- Duration: 36 sec
-- Bytes processed: 5.79 TB
-- Cnt: 2,207,223
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` AS LogsIndexed
WHERE SEARCH(LogsIndexed, 'service281');
-- Search 2 columns (searches less data since we know the columns to search)
-- Duration: 4 sec
-- Bytes processed: 1.11 TB
-- Cnt: 1,856,005
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` AS LogsIndexed
WHERE SEARCH((internalClusterId, labels), 'service281');
---------------------------------------------------------------------------------------------------------
-- Clean up Large Data (to save on costs)
---------------------------------------------------------------------------------------------------------
DROP SEARCH INDEX IF EXISTS idx_all_bigsearch_log_5b_5t_json_hourly ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
---------------------------------------------------------------------------------------------------------
-- NOTE: To run some SQL on a 60TB, 50 billon row table head over to the shared project
-- Open a new tab and paste the URL below.
-- https://console.cloud.google.com/bigquery?project=${shared_demo_project_id}
-- OPEN the stored procedure: ${shared_demo_project_id}.bigquery_features.sp_demo_bigsearch_50b_60t
---------------------------------------------------------------------------------------------------------