in lmms_eval/tasks/ferret/utils.py [0:0]
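# Relies on module-level names defined elsewhere in this file: rule_dict,
# get_eval, parse_score, eval_logger, FERRET_W_METRICS, and deepcopy
# (presumably imported at the top via `from copy import deepcopy`).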
def ferret_process_results(doc, result):
"""
Args:
doc: a instance of the eval dataset
results: [pred]
Returns:
a dictionary with key: metric name (in this case coco_bleu), value: metric value
"""
    # Gather the judge-prompt fields outside the try block so they are still
    # defined for the review dicts below even if the GPT request fails.
    question = doc.get("question", "")
    ans1 = doc.get("gpt_answer", "")
    ans2 = result[0] if result else ""
    context = doc.get("context", [])
    context = "\n".join(context) if isinstance(context, list) else context
    category = doc.get("category", "")
    rule = rule_dict.get(category, {})
    prompt = rule.get("prompt", "")
    role = rule.get("role", "user")
    content = (
        f"[Context]\n{context}\n\n"
        f"[Question]\n{question}\n\n"
        f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n"
        f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n"
        f"[System]\n{prompt}\n\n"
    )
    try:
        review, model_name = get_eval(content, 1024)
        scores = parse_score(review)
    except Exception as e:
        eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = "Failed Request"
        scores = [-1, -1]
metric = f"gpt_eval_ferret_{doc.get('category', 'all')}"
category_review_dict = {
"question": question,
"ans1": ans1,
"ans2": ans2,
"context": context,
"category": category,
"review": review,
"scores": scores,
"eval_model": model_name,
}
non_category_review_dict = deepcopy(category_review_dict)
non_category_review_dict["scores"] = [-999, -999]
data_dict = {}
for m in FERRET_W_METRICS:
if m == metric:
data_dict[m] = category_review_dict
else:
data_dict[m] = non_category_review_dict
data_dict["gpt_eval_ferret_all"] = category_review_dict
# return {"gpt_eval_ferret_all": review_dict}
    return data_dict
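# Illustrative sketch of how the per-sample output is shaped (the doc fields
# and the "refer_desc" category below are hypothetical; get_eval issues a live
# GPT judge request, so an offline run would hit the except branch and return
# scores of [-1, -1]):
#
#     doc = {
#         "question_id": "0",
#         "question": "What is in the outlined region?",
#         "gpt_answer": "A dog.",
#         "context": ["A photo of a park."],
#         "category": "refer_desc",
#     }
#     data_dict = ferret_process_results(doc, ["A small dog on the grass."])
#     # One entry per name in FERRET_W_METRICS: the entry matching
#     # "gpt_eval_ferret_refer_desc" (plus "gpt_eval_ferret_all") carries the
#     # real review/scores; the others keep the [-999, -999] placeholder.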