# lmms_eval/tasks/ocrbench/upload_ocrbench.py
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import datasets
from PIL import Image as PIL_Image
import json
from uuid import uuid4
from datasets import Dataset, Features
import pandas as pd
from tqdm import tqdm
import io
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """https://arxiv.org/abs/2305.07895"""
_DESCRIPTION = "OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models."
def image2byte(image):
    """Serialize a PIL image to raw JPEG bytes.

    The image is rendered into an in-memory buffer (no temp file) and the
    buffer's contents are returned.
    """
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return buffer.getvalue()
def get_builder_config(VERSION):
    """Return the list of BuilderConfigs for the OCRBench dataset.

    Parameters
    ----------
    VERSION : datasets.Version
        Version tag attached to the single "ocrbench" configuration.

    Returns
    -------
    list[datasets.BuilderConfig]
        A one-element list holding the "ocrbench" config.
    """
    # Plain strings: the original used f-strings with no placeholders.
    return [
        datasets.BuilderConfig(
            name="ocrbench",
            version=VERSION,
            description="ocrbench",
        )
    ]
# Local paths to the OCRBench annotation JSON and its image folder.
# NOTE(review): these look like placeholders — set real paths before running.
ocrbench_json = "pathto/OCRBench/OCRBench.json"
img_dir = "pathto/OCRBench_Images/"

# Schema of the dataset that gets pushed to the Hub: four string-ish
# columns plus an encoded image column; "answer" is a sequence because a
# question may have several acceptable answers.
dataset_features = Features(
    {
        "dataset": datasets.Value("string"),
        "question": datasets.Value("string"),
        "question_type": datasets.Value("string"),
        "answer": datasets.features.Sequence(datasets.Value("string")),
        "image": datasets.Image(),
    }
)

# Column accumulators, one list per feature, filled row by row below.
df_items = {key: [] for key in ("dataset", "question", "question_type", "answer", "image")}
# Load the annotation records and convert each one into a table row.
with open(ocrbench_json, "r") as f:
    data = json.load(f)

# Iterate records directly (original indexed with range(len(data))).
for item in tqdm(data):
    answers = item["answers"]
    # Normalize single-string answers to a list so the "answer" column
    # always holds a sequence, matching the declared features.
    if isinstance(answers, str):
        answers = [answers]
    # Force RGB so paletted / alpha images can be re-encoded as JPEG.
    img = PIL_Image.open(img_dir + item["image_path"]).convert("RGB")
    df_items["image"].append({"bytes": image2byte(img), "path": ""})
    df_items["question"].append(str(item["question"]))
    df_items["answer"].append(answers)
    df_items["question_type"].append(str(item["type"]))
    df_items["dataset"].append(str(item["dataset_name"]))

# Build the typed Dataset and push the "test" split to the Hub.
df_items = pd.DataFrame(df_items)
dataset = Dataset.from_pandas(df_items, features=dataset_features)
hub_dataset_path = "echo840/OCRBench"
dataset.push_to_hub(repo_id=hub_dataset_path, split="test")