in evaluation_pipeline/evaluation.py [0:0]
def run_llm_judge(judge, query_id, query, retrieved_texts, k):
    """Score the top-k retrieved texts for one query with an LLM judge."""
    row = {'query_id': query_id, 'query': query}

    # Ask the LLM judge whether each retrieved text is on topic for the query.
    decisions = []
    for retrieved_text in retrieved_texts:
        try:
            llm_judge_response = judge.evaluation_prompt(query, retrieved_text)
            # Parse the judge's raw output into structured JSON.
            response = format_judge_response(llm_judge_response)
            decisions.append(response['binary_decision'])
        except Exception:
            # Treat unparseable or failed judge calls as "off topic".
            decisions.append(0)

    # Aggregate per-query metrics at cutoff k.
    row['decisions'] = decisions
    row[f'on_topic_number@{k}'] = sum(decisions)
    row[f'on_topic_rate@{k}'] = sum(decisions) / float(k)
    return row
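
A minimal usage sketch for the function above, assuming a `judge` object exposing `evaluation_prompt(query, text)`, a `queries` mapping of query IDs to query strings, and a hypothetical `retrieve(query, k)` helper returning the top-k passages; none of these names are defined in this file, and the pandas aggregation is purely illustrative.

import pandas as pd

k = 5
rows = []
for query_id, query in queries.items():       # e.g. {'q1': 'how do transformers work?'}
    retrieved_texts = retrieve(query, k=k)    # hypothetical retriever returning k passages
    rows.append(run_llm_judge(judge, query_id, query, retrieved_texts, k))

results = pd.DataFrame(rows)
print(results[f'on_topic_rate@{k}'].mean())   # mean on-topic rate across all queries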