# genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_script/eval.py
import json
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import vertexai
from google import genai
from google.genai.types import Content, Part
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples


def load_dataset(dataset_local_path: str) -> pd.DataFrame:
    '''Load conversation histories from local files into a Pandas DataFrame.'''
    with open(dataset_local_path, 'r') as file:
        data = [json.loads(line) for line in file if line.strip()]
    dataset = pd.DataFrame(data)
    # Path.read_text() closes each chat file after reading, unlike the bare
    # open(...).read() pattern, which leaks file handles.
    dataset['history'] = dataset['chat_path'].apply(lambda chat_path: Path(chat_path).read_text())
    return dataset[['history']]
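
# A sketch of the inputs this loader expects, inferred from the code above.
# The file names are hypothetical; 'chat_path', 'role', and 'content' are the
# field names the script actually reads.
#
#   dataset.jsonl: one JSON object per line referencing a chat transcript, e.g.
#     {"chat_path": "chats/booking_assistant_01.json"}
#
#   each chat file: a JSON array of messages whose last entry is the user turn
#   to answer, with roles the Gemini API accepts ('user' or 'model'), e.g.
#     [{"role": "user", "content": "Hi, I need to change my flight."},
#      {"role": "model", "content": "Sure. What is your booking reference?"},
#      {"role": "user", "content": "It's ABC123."}]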


def generate_chat_responses(project_id: str, location: str, model: str, dataset: pd.DataFrame, response_column_name: str) -> None:
    '''Generate the final model response for each conversation in the dataset using the specified model.'''
    client = genai.Client(vertexai=True, project=project_id, location=location)
    responses = []
    user_prompts = []
    for i, record in dataset.iterrows():
        print(f'Generating chat completion #{i+1} with {model}')
        messages = json.loads(record['history'])
        # Split off the final user turn; the earlier turns seed the chat history.
        last_user_message = messages.pop()
        history = [
            Content(
                role=message['role'],
                parts=[Part(text=message['content'])],
            )
            for message in messages
        ]
        chat = client.chats.create(model=model, history=history)
        response = chat.send_message(message=[Part(text=last_user_message['content'])])
        user_prompts.append(last_user_message['content'])
        responses.append(response.candidates[0].content.parts[0].text)
    dataset['prompt'] = user_prompts  # The Autorater requires the last user message as plain text
    dataset[response_column_name] = responses
    print(f'{len(responses)} responses from model {model} are stored in dataset column "{response_column_name}"')
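
# A one-off smoke test for the generator above (hypothetical values; assumes
# application-default credentials and a populated dataset.jsonl):
#   df = load_dataset('dataset.jsonl')
#   generate_chat_responses('my-project', 'us-central1', 'gemini-2.0-flash-001', df, 'response')
#   print(df[['prompt', 'response']].head())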


def run_eval(project_id: str, location: str, experiment_name: str, baseline_model: str, candidate_model: str, dataset_local_path: str):
    '''Run a pairwise evaluation to compare the quality of model responses from the baseline and candidate models.'''
    vertexai.init(project=project_id, location=location)
    timestamp = datetime.now().strftime('%b-%d-%H-%M-%S').lower()
    dataset = load_dataset(dataset_local_path)
    generate_chat_responses(project_id, location, baseline_model, dataset, 'baseline_model_response')
    generate_chat_responses(project_id, location, candidate_model, dataset, 'response')
    task = EvalTask(
        dataset=dataset,
        metrics=[MetricPromptTemplateExamples.Pairwise.MULTI_TURN_CHAT_QUALITY],
        experiment=experiment_name
    )
    eval_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{baseline_model.replace('.', '-')}"
    )
    print(f"Baseline model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/baseline_model_win_rate']:.2f}")
    print(f"Candidate model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/candidate_model_win_rate']:.2f}")


if __name__ == '__main__':
    # Fail fast when PROJECT_ID is unset or still holds the placeholder value.
    if os.getenv('PROJECT_ID', 'your-project-id') == 'your-project-id':
        raise ValueError('Please configure your Google Cloud Project ID.')
    run_eval(
        project_id=os.getenv('PROJECT_ID'),
        location=os.getenv('LOCATION') or 'us-central1',
        experiment_name='eval-multiturn-chat',
        baseline_model='gemini-1.5-flash-001',
        candidate_model='gemini-2.0-flash-001',
        dataset_local_path='dataset.jsonl'
    )
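
# Example invocation (PROJECT_ID is required; LOCATION falls back to us-central1):
#   PROJECT_ID=my-gcp-project python eval.py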