in src/dfcx_scrapi/tools/metrics.py
# NOTE: relies on module-level imports elsewhere in metrics.py:
# `statistics`, `numpy as np`, and `typing.Any`.
def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]:
    """Scores answer correctness for a single evaluation example.

    Always computes recall against the reference (expected) answer; when
    `self.compute_precision` is set, also computes precision against the
    generated answer and the harmonic-mean F1 of the two scores.
    """
    # Reuse pre-extracted reference statements if the caller supplied them;
    # otherwise extract statements from the expected (golden) answer.
    if "reference_statements" in inputs:
        reference_statements = inputs["reference_statements"]
    else:
        reference_statements = self._statement_extractor.extract_statements(
            question=inputs["query"],
            answer=inputs["expected_answer"],
        )

    # Recall: how many reference statements are supported by the
    # generated answer.
    recall_result = self._recall_answer_scorer.score(
        question=inputs["query"],
        candidate_answer=inputs["query_result"].answer_text,
        baseline_statements=reference_statements,
    )
    recall_score = recall_result.mean_score if recall_result else np.nan

    if not self.compute_precision:
        return {"answer_correctness_recall": recall_score}

    # Reuse pre-extracted prediction statements if available; otherwise
    # extract statements from the generated answer.
    if "prediction_statements" in inputs:
        prediction_statements = inputs["prediction_statements"]
    else:
        prediction_statements = self._statement_extractor.extract_statements(
            question=inputs["query"],
            answer=inputs["query_result"].answer_text,
        )

    # Precision: how many predicted statements are supported by the
    # expected answer.
    precision_result = self._precision_answer_scorer.score(
        question=inputs["query"],
        candidate_answer=inputs["expected_answer"],
        baseline_statements=prediction_statements,
    )
    precision_score = (
        precision_result.mean_score if precision_result else np.nan
    )

    # F1 is the harmonic mean of recall and precision; undefined (NaN) if
    # either scoring call failed.
    if recall_result and precision_result:
        f1_score = statistics.harmonic_mean([recall_score, precision_score])
        f1_score = round(f1_score, 4)
    else:
        f1_score = np.nan

    return {
        "answer_correctness_recall": recall_score,
        "answer_correctness_precision": precision_score,
        "answer_correctness_f1": f1_score,
    }
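
# --- Usage sketch (illustrative, not part of the original module) ----------
# Assumes `metric` is an already-configured instance of this class (statement
# extractor and answer scorers wired up) and `query_result` is any object
# exposing an `.answer_text` attribute; both names and the literal strings
# below are hypothetical placeholders.
#
#     example_inputs = {
#         "query": "What is the refund window?",
#         "expected_answer": "Purchases can be refunded within 30 days.",
#         "query_result": query_result,
#     }
#     scores = metric(example_inputs)
#     # -> {"answer_correctness_recall": ...,
#     #     "answer_correctness_precision": ...,
#     #     "answer_correctness_f1": ...}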