def __call__()

in src/dfcx_scrapi/tools/metrics.py [0:0]


    def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]:
        if "reference_statements" in inputs:
            reference_statements = inputs["reference_statements"]
        else:
            reference_statements = self._statement_extractor.extract_statements(
                question=inputs["query"],
                answer=inputs["expected_answer"]
            )
        recall_result = self._recall_answer_scorer.score(
            question=inputs["query"],
            candidate_answer=inputs["query_result"].answer_text,
            baseline_statements=reference_statements,
        )

        recall_score = recall_result.mean_score if recall_result else np.nan

        if not self.compute_precision:
            return {"answer_correctness_recall": recall_score}

        if "prediction_statements" in inputs:
            prediction_statements = inputs["prediction_statements"]
        else:
            prediction_statements = (
                self._statement_extractor.extract_statements(
                    question=inputs["query"],
                    answer=inputs["query_result"].answer_text
                )
            )
        precision_result = self._precision_answer_scorer.score(
            question=inputs["query"],
            candidate_answer=inputs["expected_answer"],
            baseline_statements=prediction_statements,
        )

        pecision_score = (
            precision_result.mean_score if precision_result else np.nan
        )

        if recall_result and precision_result:
            f1_score = statistics.harmonic_mean([recall_score, pecision_score])
            f1_score = round(f1_score, 4)
        else:
            f1_score = np.nan

        return {
            "answer_correctness_recall": recall_score,
            "answer_correctness_precision": pecision_score,
            "answer_correctness_f1": f1_score,
        }