def evaluate()

in 3_optimization-design-ptn/03_prompt-optimization/promptwizard/glue/promptopt/techniques/critique_n_refine/core_logic.py [0:0]


    def evaluate(self, generated_text: str, dataset_subset: List) -> List:
        """
        Compare predicted answers with actual answers from the dataset.
        Return the list of examples for which the predicted answer was wrong.

        The complete LLM output is used as the single candidate answer and is
        passed unchanged to ``self.data_processor.access_answer`` together with
        the ground truth of the example it is paired with. Because there is
        only one candidate, at most the first example of ``dataset_subset`` is
        checked; any further examples are neither evaluated nor reported.

        :param generated_text: Output of LLM, that has answers for a mini-batch of questions
                               (which were sent in a single go)
        :param dataset_subset: List of examples with question and ground truth. Each example
                               is indexed with ``DatasetSpecificProcessing.FINAL_ANSWER_LITERAL``
                               to read its ground-truth answer.
        :return: List of examples (taken from ``dataset_subset``) that were wrongly answered.
        """
        # NOTE: delimiter-based (ANSWER_DELIMITER_PATTERN) and LLM-based answer
        # extraction are both disabled here; the raw output is the only
        # candidate. Presumably `access_answer` extracts the final answer
        # itself -- confirm against the data processor in use.
        answer_matches = [generated_text]

        answers_len, dataset_len = len(answer_matches), len(dataset_subset)
        if answers_len != dataset_len:
            self.logger.info(
                f"Answers extracted from LLM output={answers_len}, Questions asked to LLM {dataset_len}"
            )
            if answers_len > dataset_len:
                # Select last `dataset_len` number of extractions as final.
                # A positive start index is used because `[-0:]` would keep
                # the whole list when `dataset_len` is 0.
                answer_matches = answer_matches[answers_len - dataset_len:]

        wrong_examples = []
        # zip() stops at the shorter sequence, so only examples that have a
        # paired candidate answer are evaluated.
        for example, predicted_answer in zip(dataset_subset, answer_matches):
            actual_answer = example[DatasetSpecificProcessing.FINAL_ANSWER_LITERAL]
            # Only the correctness flag is needed; the second returned value
            # is ignored.
            is_correct, _ = self.data_processor.access_answer(
                predicted_answer, actual_answer
            )
            if not is_correct:
                wrong_examples.append(example)

        return wrong_examples