def run_eval()

in genai-on-vertex-ai/gemini/model_upgrades/multiturn_chat/vertex_script/eval.py


from datetime import datetime

import vertexai
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples


def run_eval(project_id: str, location: str, experiment_name: str, baseline_model: str, candidate_model: str, dataset_local_path: str):
    '''Run a pairwise evaluation to compare the quality of model responses from the baseline and candidate models.'''
    vertexai.init(project=project_id, location=location)
    # Timestamp used to make the experiment run name unique.
    timestamp = datetime.now().strftime('%b-%d-%H-%M-%S').lower()
    # load_dataset() and generate_chat_responses() are helpers defined elsewhere in eval.py.
    dataset = load_dataset(dataset_local_path)
    # Pairwise metrics expect the candidate's answers in the 'response' column and the
    # baseline's answers in the 'baseline_model_response' column.
    generate_chat_responses(project_id, location, baseline_model, dataset, 'baseline_model_response')
    generate_chat_responses(project_id, location, candidate_model, dataset, 'response')
    # Configure a pairwise EvalTask: an autorater compares, per conversation, the baseline
    # and candidate responses using the built-in multi-turn chat quality metric.
    task = EvalTask(
        dataset=dataset,
        metrics=[MetricPromptTemplateExamples.Pairwise.MULTI_TURN_CHAT_QUALITY],
        experiment=experiment_name
    )
    # Run the evaluation; results are logged to a run under the named Vertex AI experiment.
    eval_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{baseline_model.replace('.', '-')}"
    )
    print(f"Baseline model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/baseline_model_win_rate']:.2f}")
    print(f"Candidate model win rate: {eval_results.summary_metrics['pairwise_multi_turn_chat_quality/candidate_model_win_rate']:.2f}")