genai-on-vertex-ai/gemini/evals_playbook/bigquery_sqls/evals_bigquery.sql (113 lines of code) (raw):

-- Copyright 2024 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- https://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- Schema for the Gemini Evals Playbook BigQuery dataset.
-- All timestamps default to CURRENT_DATETIME() via the ALTER TABLE
-- statements at the bottom of this file (BigQuery requires the tables
-- to exist before a column default can be attached).

-- Configuration Tables

-- eval_tasks: one row per Generative AI task under evaluation.
-- [TODO]: Delete fields not being used
CREATE TABLE IF NOT EXISTS eval_tasks (
    task_id         STRING        OPTIONS(description="Unique identifier for the evaluation task"),
    task_desc       STRING        OPTIONS(description="Description of the evaluation task"),
    create_datetime DATETIME      OPTIONS(description="Timestamp of when the task was created"),
    update_datetime DATETIME      OPTIONS(description="Timestamp of when the task was last updated"),
    tags            ARRAY<STRING> OPTIONS(description="Tags associated with the task for easy filtering and searching"),
    metadata        STRING        OPTIONS(description="Additional metadata related to the task")
)
OPTIONS(
    description="Table storing information about different Generative AI tasks to be evaluated",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- eval_experiments: one row per experiment configuration run against a task.
CREATE TABLE IF NOT EXISTS eval_experiments (
    experiment_id     STRING        OPTIONS(description="Unique identifier for the evaluation experiment"),
    experiment_desc   STRING        OPTIONS(description="Description of the evaluation experiment"),
    task_id           STRING        OPTIONS(description="Foreign key referencing the eval_tasks table, linking the experiment to its corresponding task"),
    eval_dataset_id   STRING        OPTIONS(description="Foreign key referencing the eval_datasets table, linking the experiment to its dataset"),
    -- Prompt ID here refers to a prompt template
    prompt_id         STRING        OPTIONS(description="Foreign key referencing the eval_prompts table, linking the experiment to its prompt"),
    model_endpoint    STRING        OPTIONS(description="The endpoint of the model being evaluated"),
    model_name        STRING        OPTIONS(description="The name of the model being evaluated"),
    generation_config STRING        OPTIONS(description="JSON string containing the model generation configuration"),
    is_streaming      BOOL          OPTIONS(description="Indicates whether the evaluation is streaming or not"),
    safety_settings   STRING        OPTIONS(description="JSON string containing the safety settings for the evaluation"),
    metric_config     STRING        OPTIONS(description="JSON string containing the configuration for the metrics used in the evaluation"),
    create_datetime   DATETIME      OPTIONS(description="Timestamp of when the experiment was created"),
    update_datetime   DATETIME      OPTIONS(description="Timestamp of when the experiment was last updated"),
    elapsed_time      NUMERIC       OPTIONS(description="Total time taken for the experiment to complete"),
    tags              ARRAY<STRING> OPTIONS(description="Tags associated with the experiment for easy filtering and searching"),
    metadata          STRING        OPTIONS(description="Additional metadata related to the experiment")
)
OPTIONS(
    description="Table storing information about evaluation experiments conducted on different tasks",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- eval_prompts: versioned prompt templates referenced by experiments.
CREATE TABLE IF NOT EXISTS eval_prompts (
    prompt_id          STRING        OPTIONS(description="Unique identifier for the prompt"),
    prompt_description STRING        OPTIONS(description="Description of the prompt"),
    system_instruction STRING        OPTIONS(description="System instructions provided to the model before the prompt"),
    prompt_template    STRING        OPTIONS(description="Template used to construct the prompt"),
    prompt_type        STRING        OPTIONS(description="Type of prompt (e.g., single-turn, chat)"),
    contents           STRING        OPTIONS(description="Array of prompt contents"),
    tools              STRING        OPTIONS(description="Array of function declarations for tools used in the prompt"),
    tool_config        STRING        OPTIONS(description="Configuration for the tools used in the prompt"),
    is_multimodal      BOOL          OPTIONS(description="Indicates whether the prompt is multimodal or not"),
    version_num        STRING        OPTIONS(description="Version number of the prompt"),
    create_datetime    DATETIME      OPTIONS(description="Timestamp of when the prompt was created"),
    update_datetime    DATETIME      OPTIONS(description="Timestamp of when the prompt was last updated"),
    tags               ARRAY<STRING> OPTIONS(description="Tags associated with the prompt for easy filtering and searching"),
    metadata           STRING        OPTIONS(description="Additional metadata related to the prompt")
)
OPTIONS(
    description="Table storing information about different prompts used in evaluations",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- eval_datasets: pointers to the evaluation datasets (not the data itself).
CREATE TABLE IF NOT EXISTS eval_datasets (
    dataset_id            STRING   OPTIONS(description="Unique identifier for the evaluation dataset"),
    dataset_desc          STRING   OPTIONS(description="Description of the evaluation dataset"),
    dataset_format        STRING   OPTIONS(description="Format of the evaluation dataset (e.g., JSON, CSV)"),
    dataset_location      STRING   OPTIONS(description="Location of the evaluation dataset (e.g., GCS bucket)"),
    reference_column_name STRING   OPTIONS(description="Name of the column in the dataset containing the reference/ground truth data"),
    create_datetime       DATETIME OPTIONS(description="Timestamp of when the dataset was created"),
    update_datetime       DATETIME OPTIONS(description="Timestamp of when the dataset was last updated")
)
OPTIONS(
    description="Table storing references to evaluation datasets used in experiments",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- eval_runs: aggregate results for one execution of an experiment.
-- [TODO: Add col for gcs]
CREATE TABLE IF NOT EXISTS eval_runs (
    run_id          STRING        OPTIONS(description="Unique identifier for the evaluation run"),
    experiment_id   STRING        OPTIONS(description="Foreign key referencing the eval_experiments table, linking the run to its corresponding experiment"),
    task_id         STRING        OPTIONS(description="Foreign key referencing the eval_tasks table, linking the run to its corresponding task"),
    -- dataset_row_id STRING OPTIONS(description="Identifier for the specific example within the dataset used in this run"),
    -- system_instruction STRING OPTIONS(description="System instructions provided to the model before the input prompt"),
    -- input_prompt STRING OPTIONS(description="The input prompt used in the evaluation run"),
    -- input_prompt_gcs_uri STRING OPTIONS(description="GCS URI of the input prompt used in the evaluation run"),
    -- output_text STRING OPTIONS(description="The text output generated by the model"),
    -- output_response STRING OPTIONS(description="The complete response generated by the model, including any structured data"),
    metrics         STRING        OPTIONS(description="JSON string containing the metrics and their scores for this run"),
    -- total_elapsed_time NUMERIC OPTIONS(description="Total time taken for this run to complete"),
    -- avg_latency_per_request NUMERIC OPTIONS(description="Average latency per request in this run"),
    -- avg_output_token_count INT OPTIONS(description="Average number of output tokens generated in this run"),
    -- total_input_token_count INT OPTIONS(description="Total number of input tokens in this run"),
    -- total_output_token_count INT OPTIONS(description="Total number of output tokens generated in this run"),
    -- total_total_token_count INT OPTIONS(description="Total number of tokens (input + output) in this run"),
    create_datetime DATETIME      OPTIONS(description="Timestamp of when the run was created"),
    update_datetime DATETIME      OPTIONS(description="Timestamp of when the run was last updated"),
    tags            ARRAY<STRING> OPTIONS(description="Tags associated with the run for easy filtering and searching"),
    metadata        STRING        OPTIONS(description="Additional metadata related to the run")
)
OPTIONS(
    description="Table storing information about individual evaluation runs within experiments",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- Results

-- eval_run_details: per-example (and per-repetition) results of a run.
CREATE TABLE IF NOT EXISTS eval_run_details (
    run_id               STRING         OPTIONS(description="Unique identifier for the evaluation run, referencing the eval_runs table"),
    experiment_id        STRING         OPTIONS(description="Foreign key referencing the eval_experiments table, linking the run details to its corresponding experiment"),
    task_id              STRING         OPTIONS(description="Foreign key referencing the eval_tasks table, linking the run details to its corresponding task"),
    dataset_row_id       STRING         OPTIONS(description="Identifier for the specific trial/repetition of a run, a run_id can be repeated multiple time to check for repeatability"),
    system_instruction   STRING         OPTIONS(description="System instructions provided to the model before the input prompt"),
    -- input_prompt STRING OPTIONS(description="The input prompt used in the evaluation run"),
    input_prompt_gcs_uri STRING         OPTIONS(description="GCS URI of the input prompt used in the evaluation run"),
    output_text          STRING         OPTIONS(description="The text output generated by the model"),
    -- output_response STRING OPTIONS(description="The complete response generated by the model, including any structured data"),
    ground_truth         STRING         OPTIONS(description="The expected/correct output for the given input"),
    metrics              STRING         OPTIONS(description="JSON string containing the metrics and their scores for this run"),
    -- input_token_count INT OPTIONS(description="Number of input tokens in this run"),
    -- output_token_count INT OPTIONS(description="Number of output tokens generated in this run"),
    -- total_token_count INT OPTIONS(description="Total number of tokens (input + output) in this run"),
    -- num_retries INT OPTIONS(description="Number of retries attempted for this run"),
    -- avg_latency NUMERIC OPTIONS(description="Average latency for this run"),
    latencies            ARRAY<NUMERIC> OPTIONS(description="Array of latencies for each request in this run"),
    create_datetime      DATETIME       OPTIONS(description="Timestamp of when the run details were created"),
    update_datetime      DATETIME       OPTIONS(description="Timestamp of when the run details were last updated"),
    tags                 ARRAY<STRING>  OPTIONS(description="Tags associated with the run details for easy filtering and searching"),
    metadata             STRING         OPTIONS(description="Additional metadata related to the run details")
)
OPTIONS(
    description="Table storing detailed information about individual evaluation runs, including ground truth and latencies",
    labels=[("tool", "vertexai-gemini-evals")]
);

-- Column defaults: stamp create/update times automatically on insert.
-- These must run after the CREATE TABLE statements above, since BigQuery
-- attaches defaults to columns of existing tables via ALTER COLUMN.
ALTER TABLE eval_tasks ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_tasks ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_experiments ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_experiments ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_prompts ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_prompts ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_datasets ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_datasets ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_runs ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_runs ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_run_details ALTER COLUMN create_datetime SET DEFAULT (CURRENT_DATETIME());
ALTER TABLE eval_run_details ALTER COLUMN update_datetime SET DEFAULT (CURRENT_DATETIME());