in genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_script/eval.py
import vertexai
from datetime import datetime
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples


def run_eval(project_id: str, location: str, experiment_name: str, baseline_model: str, candidate_model: str, dataset_local_path: str):
    '''Run a pairwise evaluation to compare the quality of model responses from the baseline and candidate models.'''
    vertexai.init(project=project_id, location=location)
    timestamp = datetime.now().strftime('%b-%d-%H-%M-%S').lower()
    # load_dataset() and generate_chat_responses() are helpers defined elsewhere in this script.
    dataset = load_dataset(dataset_local_path)
    # Pairwise metrics expect the baseline answers in 'baseline_model_response'
    # and the candidate answers in 'response'.
    generate_chat_responses(project_id, location, baseline_model, dataset, 'baseline_model_response')
    generate_chat_responses(project_id, location, candidate_model, dataset, 'response')
    # Configure a pairwise multi-turn chat quality evaluation tracked under the given experiment.
    task = EvalTask(
        dataset=dataset,
        metrics=[MetricPromptTemplateExamples.Pairwise.MULTI_TURN_CHAT_QUALITY],
        experiment=experiment_name
    )
    eval_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{baseline_model.replace('.', '-')}"
    )
    print(f"Baseline model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/baseline_model_win_rate']:.2f}")
    print(f"Candidate model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/candidate_model_win_rate']:.2f}")