def combine_eval_runs()

in genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_script/eval.py [0:0]


from vertexai.evaluation import EvalResult  # EvalResult is assumed to come from the Vertex AI evaluation SDK


def combine_eval_runs(baseline: EvalResult, candidate: EvalResult) -> list[dict]:
    '''Combine the evaluation results for the two models and calculate a pairwise score for each example.'''
    # Both runs must exist and score the same examples so that rows align one-to-one.
    if baseline is None or candidate is None or len(baseline.metrics_table.index) != len(candidate.metrics_table.index):
        raise ValueError('Invalid eval results: expected two runs with the same number of examples.')
    examples = []
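    # Walk the two metrics tables in lockstep; rows are assumed to be aligned by example.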
    for b, c in zip(baseline.metrics_table.to_dict(orient='records'), candidate.metrics_table.to_dict(orient='records')):
        score_b = b.get('question_answering_quality/score')
        score_c = c.get('question_answering_quality/score')
        examples.append(dict(
            input_text=b.get('prompt'),
            output_text_a=b.get('response').strip(),
            output_text_b=c.get('response').strip(),
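            # Pairwise preference: 1.0 favors the baseline (model A), -1.0 the candidate (model B), 0.0 is a tie.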
            score=0.0 if score_b == score_c else 1.0 if score_b > score_c else -1.0,
            tags=[],
            individual_rater_scores=[]
        ))
    return examples
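
For context, here is a minimal sketch of how combine_eval_runs() could be driven end to end. The project settings, dataset shape, and the bring-your-own-response call to EvalTask.evaluate() are assumptions based on the Vertex AI evaluation SDK, not details taken from this file:

import pandas as pd
import vertexai
from vertexai.evaluation import EvalTask

vertexai.init(project='my-project', location='us-central1')  # hypothetical project ID

# One eval run per model over the same prompts, in the same order, so the
# resulting metrics tables align row for row.
prompts = ['What is the refund policy?', 'How do I reset my password?']
baseline_df = pd.DataFrame({'prompt': prompts, 'response': ['Refunds within 30 days.', 'Use the reset link.']})
candidate_df = pd.DataFrame({'prompt': prompts, 'response': ['Full refunds are issued within 30 days of purchase.', 'Click "Forgot password" on the sign-in page.']})

# 'question_answering_quality' yields the 'question_answering_quality/score'
# column that combine_eval_runs() reads. With a 'response' column present,
# evaluate() scores the provided answers instead of calling a model.
metric = 'question_answering_quality'
baseline_result = EvalTask(dataset=baseline_df, metrics=[metric]).evaluate()
candidate_result = EvalTask(dataset=candidate_df, metrics=[metric]).evaluate()

examples = combine_eval_runs(baseline_result, candidate_result)

Because both runs score the same prompts in the same order, their metrics tables line up row for row, which is exactly what the length check in combine_eval_runs() relies on.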