lmms_eval/tasks/chartqa/utils.py (34 lines of code) (raw):

def chartqa_doc_to_visual(doc):
    """Return the document's chart image as a one-element list in RGB mode.

    Args:
        doc: Mapping with an "image" entry exposing ``convert(mode)``
            (presumably a PIL image -- confirm against the dataset loader).

    Returns:
        A single-element list holding the result of ``convert("RGB")``.
    """
    return [doc["image"].convert("RGB")]


def chartqa_doc_to_text(doc, model_specific_prompt_kwargs):
    """Build the text prompt by wrapping the question in the model's prompts.

    Args:
        doc: Mapping with a "question" entry.
        model_specific_prompt_kwargs: Mapping with "pre_prompt" and
            "post_prompt" entries that are placed before and after the
            question.

    Returns:
        The concatenation ``pre_prompt + question + post_prompt``.
    """
    question = doc["question"]
    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    post_prompt = model_specific_prompt_kwargs["post_prompt"]
    return f"{pre_prompt}{question}{post_prompt}"


def chartqa_process_results(doc, results):
    """Score one prediction and route the score to its per-split metric.

    Args:
        doc: Mapping with "type" (the split name) and "answer" (the gold
            answer string) entries.
        results: Sequence whose first element is the predicted answer string.

    Returns:
        A dict with "relaxed_overall" plus either "relaxed_human_split"
        (when ``doc["type"] == "human_test"``) or "relaxed_augmented_split"
        (any other type); every value is 1.0 for a correct prediction and
        0.0 otherwise.
    """
    pred = results[0]
    # Named split_type rather than `type` to avoid shadowing the builtin.
    split_type = doc["type"]
    score = 1.0 if relaxed_correctness(pred, doc["answer"]) else 0.0
    return_dict = {"relaxed_overall": score}
    if split_type == "human_test":
        return_dict["relaxed_human_split"] = score
    else:
        return_dict["relaxed_augmented_split"] = score
    return return_dict


def relaxed_correctness(prediction, target, max_relative_change: float = 0.05) -> bool:
    """Calculates relaxed correctness.

    The correctness tolerates certain error ratio defined by
    max_relative_change. See https://arxiv.org/pdf/2203.10244.pdf, end of
    section 5.1:
    "Following Methani et al. (2020), we use a relaxed accuracy measure for
    the numeric answers to allow a minor inaccuracy that may result from the
    automatic data extraction process. We consider an answer to be correct if
    it is within 5% of the gold answer. For non-numeric answers, we still
    need an exact match to consider an answer to be correct."

    This function is taken from
    https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
    with one change: a gold answer of zero is compared numerically instead
    of falling back to exact string matching, so "0.0" matches "0".

    Args:
        prediction: Predicted answer string.
        target: Gold answer string.
        max_relative_change: Maximum relative change.

    Returns:
        Whether the prediction was correct given the specified tolerance.
        When either string is not numeric, this is a case-insensitive
        exact string match.
    """

    def _to_float(text: str):
        """Parse text as a float (a trailing "%" divides by 100), else None."""
        try:
            if text.endswith("%"):
                # Convert percentages to floats.
                return float(text.rstrip("%")) / 100.0
            return float(text)
        except ValueError:
            return None

    prediction_float = _to_float(prediction)
    target_float = _to_float(target)

    # Non-numeric on either side: require a case-insensitive exact match.
    if prediction_float is None or target_float is None:
        return prediction.lower() == target.lower()

    # A relative error is undefined for a zero gold answer (division by
    # zero), so only a prediction that is itself numerically zero is correct.
    if target_float == 0:
        return prediction_float == 0

    relative_change = abs(prediction_float - target_float) / abs(target_float)
    return relative_change <= max_relative_change