def run_eval()

in genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_script/eval.py


# Imports used by run_eval() (declared at module level in eval.py):
from datetime import datetime

from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples
from vertexai.generative_models import GenerativeModel


def run_eval(experiment_name: str, baseline_model: str, candidate_model: str, prompt_template_local_path: str, dataset_local_path: str):
    # Timestamp keeps the experiment run names unique across invocations.
    timestamp = f"{datetime.now().strftime('%b-%d-%H-%M-%S')}".lower()
    with open(prompt_template_local_path) as f:
        prompt_template = f.read()
    # load_dataset() is a helper defined or imported elsewhere in eval.py.
    task = EvalTask(
        dataset=load_dataset(dataset_local_path),
        metrics=[MetricPromptTemplateExamples.Pointwise.QUESTION_ANSWERING_QUALITY],
        experiment=experiment_name
    )
    # Run the same EvalTask against the baseline and the candidate model so the
    # two runs are directly comparable within a single experiment.
    baseline_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{baseline_model.replace('.', '-')}",
        prompt_template=prompt_template,
        model=GenerativeModel(baseline_model)
    )
    candidate_results = task.evaluate(
        experiment_run_name=f"{timestamp}-{candidate_model.replace('.', '-')}",
        prompt_template=prompt_template,
        model=GenerativeModel(candidate_model)
    )
    # The question_answering_quality metric is scored out of 5, so multiplying
    # the mean by 20 expresses it as a percentage.
    print(f"Baseline model score: {baseline_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
    print(f"Candidate model score: {candidate_results.summary_metrics['question_answering_quality/mean']*20:.1f}%")
    # export_results() is a helper defined or imported elsewhere in eval.py.
    export_results(baseline_model, baseline_results, candidate_model, candidate_results, f'eval_results_{timestamp}.json')
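
For orientation, below is a minimal sketch of how run_eval() might be driven from the command line. The argument names, default model IDs, and the vertexai.init() project/location values are illustrative assumptions, not part of eval.py as shown above.

    # Hypothetical driver for run_eval() -- a sketch, not the repository's actual entry point.
    import argparse
    import vertexai

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Compare two Gemini models on a document Q&A eval set.")
        parser.add_argument("--experiment-name", default="document-qna-eval")
        parser.add_argument("--baseline-model", default="gemini-1.5-flash-001")   # assumed model ID
        parser.add_argument("--candidate-model", default="gemini-2.0-flash-001")  # assumed model ID
        parser.add_argument("--prompt-template", default="prompt_template.txt")
        parser.add_argument("--dataset", default="dataset.jsonl")
        args = parser.parse_args()

        # Vertex AI must be initialized before EvalTask / GenerativeModel calls.
        vertexai.init(project="your-project-id", location="us-central1")  # assumed values

        run_eval(
            experiment_name=args.experiment_name,
            baseline_model=args.baseline_model,
            candidate_model=args.candidate_model,
            prompt_template_local_path=args.prompt_template,
            dataset_local_path=args.dataset
        )

Both evaluate() calls reuse the same EvalTask and prompt template, so the only variable between the two experiment runs is the model itself, which is what makes the printed scores directly comparable.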