def run_llm_judge()

in evaluation_pipeline/evaluation.py [0:0]


def run_llm_judge(judge, query_id, query, retrieved_texts, k):
    """Score the top-k retrieved texts for one query with an LLM judge.

    Returns a row dict with the per-passage binary decisions plus the
    on_topic_number@k and on_topic_rate@k aggregates.
    """
    row = {'query_id': query_id, 'query': query}
    decisions = []
    for retrieved_text in retrieved_texts:
        try:
            # Ask the LLM judge whether the passage is on topic for the query.
            llm_judge_response = judge.evaluation_prompt(query, retrieved_text)
            # Clean up the JSON returned by the LLM into a dict.
            response = format_judge_response(llm_judge_response)
            decisions.append(response['binary_decision'])
        except Exception:
            # Treat judge failures or unparseable output as "off topic".
            decisions.append(0)
    # Store per-passage decisions and aggregate metrics.
    row['decisions'] = decisions
    row[f'on_topic_number@{k}'] = sum(decisions)
    row[f'on_topic_rate@{k}'] = sum(decisions) / float(k)
    return row
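
A hedged sketch of the judge interaction this function relies on: judge.evaluation_prompt(query, retrieved_text) returns raw LLM text, and format_judge_response (defined elsewhere in evaluation.py) is expected to turn it into a dict with a binary_decision of 0 or 1. The response format and cleanup logic below are assumptions for illustration, not the pipeline's actual implementation.

import json

# Example raw judge output; the real response schema is set by the judge's prompt.
llm_judge_response = '```json\n{"binary_decision": 1, "reasoning": "Passage answers the query."}\n```'

def format_judge_response(llm_judge_response):
    # Hypothetical cleanup: strip markdown code fences, then parse the JSON body.
    text = llm_judge_response.strip()
    if text.startswith('```'):
        text = text.split('\n', 1)[1].rsplit('```', 1)[0]
    return json.loads(text)

response = format_judge_response(llm_judge_response)
print(response['binary_decision'])  # -> 1

With decisions of [1, 0] and k=2, the returned row would carry on_topic_number@2 = 1 and on_topic_rate@2 = 0.5.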