in 3_optimization-design-ptn/03_prompt-optimization/promptwizard/glue/promptopt/techniques/critique_n_refine/core_logic.py [0:0]
def evaluate(self, generated_text: str, dataset_subset: List) -> List:
"""
Compare predicted answers with actual answers from the dataset.
    Return the list of examples for which the predicted answer was wrong.
    :param generated_text: Output of the LLM, containing answers for a mini-batch of questions
                           (which were sent in a single call)
:param dataset_subset: List of examples with question and ground truth.
:return: List of examples that were wrongly classified.
"""
    # Extract the per-question answers from the LLM output using the
    # dataset-specific answer-delimiter pattern.
    answer_matches = re.findall(
        DatasetSpecificProcessing.ANSWER_DELIMITER_PATTERN, generated_text
    )
    # Fall back to treating the whole output as a single answer when no
    # delimited answers are found. (Alternatively, the answer could be
    # extracted with an LLM call using FINAL_ANSWER_EXTRACTION_PROMPT.)
    if not answer_matches:
        answer_matches = [generated_text]
answers_len, dataset_len = len(answer_matches), len(dataset_subset)
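    # If the number of extracted answers does not match the number of
    # questions, log the mismatch and keep only the last `dataset_len`
    # extractions.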
if answers_len != dataset_len:
        self.logger.info(
            f"Answers extracted from LLM output={answers_len}, "
            f"questions asked to LLM={dataset_len}"
        )
if answers_len > dataset_len:
# Select last `dataset_len` number of extractions as final.
answer_matches = answer_matches[-dataset_len:]
wrong_examples = []
    # Compare each extracted answer against the corresponding ground truth.
    for i in range(min(answers_len, dataset_len)):
actual_answer = dataset_subset[i][
DatasetSpecificProcessing.FINAL_ANSWER_LITERAL
]
question = dataset_subset[i][DatasetSpecificProcessing.QUESTION_LITERAL]
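        # access_answer compares the predicted answer with the ground truth;
        # only the correctness flag is used here.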
is_correct, _ = self.data_processor.access_answer(
answer_matches[i], actual_answer
)
if not is_correct:
wrong_examples.append(dataset_subset[i])
return wrong_examples
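
# Illustrative usage (a minimal sketch, not invoked anywhere in this file):
# assuming `optimizer` is an instance of this class, `llm_response` is the raw
# LLM output for a mini-batch, and `mini_batch` is a list of processed
# examples, the wrongly answered examples returned by `evaluate` presumably
# feed the critique step of the critique-and-refine loop. The names
# `optimizer`, `llm_response`, and `mini_batch` are hypothetical.
#
#     wrong_examples = optimizer.evaluate(llm_response, mini_batch)
#     if wrong_examples:
#         # critique the current prompt using the failed examples
#         ...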