tools/make_hf_dataset.ipynb (120 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Make a Hugging Face dataset in the proper format\n",
    "\n",
    "This notebook guides you through converting a dataset into the correct Hugging Face dataset format: stored as Parquet and viewable on the Hugging Face dataset hub.\n",
    "\n",
    "As an example, we take the dataset `Otter-AI/MMVet` and convert it to the proper format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datasets import Dataset, Features, Image, Value, load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader may want to change lives here.\n",
    "data_path = \"Otter-AI/MMVet\"  # dataset to read from the hub\n",
    "hub_dataset_path = \"lmms-lab/MMVet\"  # repo the converted dataset is pushed to\n",
    "split_name = \"test\"  # split that is loaded and later pushed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the source dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = load_dataset(data_path, split=split_name).to_pandas()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the target schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the features for the dataset\n",
    "features = Features(\n",
    "    {\n",
    "        \"question_id\": Value(dtype=\"string\"),\n",
    "        \"image\": Image(),\n",
    "        \"question\": Value(dtype=\"string\"),\n",
    "        \"answer\": Value(dtype=\"string\"),\n",
    "        \"image_source\": Value(dtype=\"string\"),\n",
    "        \"capability\": Value(dtype=\"string\"),\n",
    "        # Add other fields as necessary\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert the rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_item(row):\n",
    "    \"\"\"Convert one row of the source frame into a record keyed like `features`.\n",
    "\n",
    "    Args:\n",
    "        row: a row of `df`; it is read through the keys `id`, `images`,\n",
    "            `instruction`, `answer`, `image_source` and `capability`.\n",
    "\n",
    "    Returns:\n",
    "        A dict with one entry per key of `features`.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        \"question_id\": str(row[\"id\"]),\n",
    "        # Only the first entry of `images` is kept; its bytes are stored inline.\n",
    "        \"image\": {\"bytes\": row[\"images\"][0][\"bytes\"], \"path\": \"\"},\n",
    "        \"question\": str(row[\"instruction\"]),\n",
    "        \"answer\": str(row[\"answer\"]),\n",
    "        \"image_source\": str(row[\"image_source\"]),\n",
    "        # The entries of `capability` are joined into one comma-separated string.\n",
    "        \"capability\": \",\".join(row[\"capability\"]),\n",
    "        # Add other fields as necessary\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail early with a readable message if the source schema is not what `to_item` reads.\n",
    "required_columns = {\"id\", \"images\", \"instruction\", \"answer\", \"image_source\", \"capability\"}\n",
    "missing_columns = required_columns - set(df.columns)\n",
    "assert not missing_columns, f\"source dataset is missing columns: {sorted(missing_columns)}\"\n",
    "\n",
    "# Build all records first and create the frame once; passing `columns` keeps\n",
    "# the expected columns even when the source frame has no rows.\n",
    "df_items = pd.DataFrame(\n",
    "    [to_item(row) for _, row in df.iterrows()],\n",
    "    columns=list(features),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_items.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the dataset and push it to the hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = Dataset.from_pandas(df_items, features=features)\n",
    "assert len(dataset) == len(df), \"row count changed during conversion\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.push_to_hub(repo_id=hub_dataset_path, split=split_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lmms-eval",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}