in evals/elsuite/mmmu/eval.py [0:0]
def eval_sample(self, sample: Sample, rng):
assert isinstance(sample, Sample)
if sample.question_type == "multiple-choice":
options, correct_answer = make_abc(
answers=sample.answers,
correct_idx=sample.label,
rng=rng,
)
prompt = sample.question + "\n" + options
system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.'
else:
correct_answer = sample.label
prompt = sample.question
system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"'
images = [
image
for image in [
sample.image_1,
sample.image_2,
sample.image_3,
sample.image_4,
sample.image_5,
sample.image_6,
sample.image_7,
]
if image is not None
]
base_64_images = []
for image in images:
buffer = BytesIO()
image.save(buffer, format="PNG")
img_str = base64.b64encode(buffer.getvalue())
base_64_images.append(img_str.decode())
try:
result = self.completion_fn(
prompt=[
{
"role": "system",
"content": [
{
"type": "text",
"text": system_prompt,
},
],
},
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt,
},
]
+ [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base_64_image}",
},
}
for base_64_image in base_64_images
],
},
],
temperature=0.0,
max_tokens=4096,
)
sampled = result.get_completions()[0]
except Exception as e:
logging.info("Sampling failed!")
logging.info(sample)
logging.info(f"Prompt: {prompt}")
logging.info(f"Error: {str(e)}")
sampled = "ERROR: " + str(e)
match = sampled.find(f"ANSWER: {correct_answer}") != -1
if not match and sampled.find("ANSWER") == -1 and sample.question_type == "multiple-choice":
# The model didn't answer anything, so randomly pick an answer
# This matches the behavior described in section 4.1 of the MMMU paper: https://arxiv.org/pdf/2311.16502.pdf
logging.info("No answer found for multiple choice so picking a random answer.")
answer_idx = rng.randint(0, len(sample.answers) - 1)
answer_letter = chr(ord("A") + answer_idx)
match = correct_answer == answer_letter
record_match(
match,
expected=correct_answer,
picked=(correct_answer if match else None),
sampled=sampled,
)