in genai-on-vertex-ai/gemini/model_upgrades/document_qna/vertex_script/eval.py [0:0]
def combine_eval_runs(baseline: EvalResult, candidate: EvalResult) -> list[dict]:
    '''Combine the evaluation results for the two models and calculate the pairwise score.

    Args:
        baseline: Evaluation result for the baseline model (model "a").
        candidate: Evaluation result for the candidate model (model "b").

    Returns:
        One dict per example with the shared prompt, both stripped responses,
        and a pairwise score: 1.0 if the baseline scored higher, -1.0 if the
        candidate scored higher, 0 on a tie.

    Raises:
        ValueError: If either result is None or the two metrics tables have a
            different number of rows (results would not line up pairwise).
    '''
    # Identity checks for None (not `in [..]`, which relies on __eq__).
    if baseline is None or candidate is None:
        raise ValueError('Invalid eval results!')
    if len(baseline.metrics_table.index) != len(candidate.metrics_table.index):
        raise ValueError('Invalid eval results!')
    examples = []
    paired_rows = zip(baseline.metrics_table.to_dict(orient='records'),
                      candidate.metrics_table.to_dict(orient='records'))
    for b, c in paired_rows:
        score_b = b.get('question_answering_quality/score')
        score_c = c.get('question_answering_quality/score')
        examples.append(dict(
            input_text=b.get('prompt'),
            # `or ''` guards against a missing/None response so .strip() cannot crash.
            output_text_a=(b.get('response') or '').strip(),
            output_text_b=(c.get('response') or '').strip(),
            score=0 if score_b == score_c else 1.0 if score_b > score_c else -1.0,
            tags=[],
            individual_rater_scores=[]
        ))
    return examples