community-efforts/image_preferences/04_binarize_preference_results.ipynb (276 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.\n",
"%pip install datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and transform the dataset\n",
"\n",
"First, we load the dataset.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/davidberenstein/Documents/programming/argilla/data-is-better-together/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status'],\n",
" num_rows: 5000\n",
"})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Load the train split of the annotation results.\n",
"ds = load_dataset(\n",
"    path=\"data-is-better-together/image-preferences-v1-results\",\n",
"    split=\"train\",\n",
")\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '3368-quality',\n",
" 'status': 'completed',\n",
" '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n",
" 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
" 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
" 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n",
" 'model_1': 'dev',\n",
" 'model_2': 'sd',\n",
" 'evolution': 'quality',\n",
" 'category': 'Manga',\n",
" 'sub_category': 'detailed',\n",
" 'preference.responses': ['both_good', 'image_1', 'image_1'],\n",
" 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n",
" 'caf19767-2989-4b3c-a653-9c30afc6361d',\n",
" 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n",
" 'preference.responses.status': ['submitted', 'submitted', 'submitted']}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first record to see which fields each example carries.\n",
"ds[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Drop records whose 'preference.responses' field is None.\n",
"ds = ds.filter(\n",
"    lambda record: record['preference.responses'] is not None\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map: 100%|██████████| 4997/4997 [00:00<00:00, 12626.85 examples/s]\n"
]
},
{
"data": {
"text/plain": [
"{'id': '3368-quality',\n",
" 'status': 'completed',\n",
" '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n",
" 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
" 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
" 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n",
" 'model_1': 'dev',\n",
" 'model_2': 'sd',\n",
" 'evolution': 'quality',\n",
" 'category': 'Manga',\n",
" 'sub_category': 'detailed',\n",
" 'preference.responses': ['both_good', 'image_1', 'image_1'],\n",
" 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n",
" 'caf19767-2989-4b3c-a653-9c30afc6361d',\n",
" 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n",
" 'preference.responses.status': ['submitted', 'submitted', 'submitted'],\n",
" 'chosen': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n",
" 'chosen_model': 'black-forest-labs/FLUX.1-dev',\n",
" 'rejected': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n",
" 'rejected_model': 'stabilityai/stable-diffusion-3.5-large',\n",
" 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import Counter\n",
"\n",
"\n",
"def get_preference_winner(batch):\n",
"    \"\"\"Binarize one record's annotator votes into a chosen/rejected image pair.\n",
"\n",
"    Despite the parameter name, ``batch`` is a single example dict: the\n",
"    function is passed to ``ds.map`` without ``batched=True``.\n",
"\n",
"    Voting rules:\n",
"      * ``both_good`` counts as one vote for ``image_1`` and one for ``image_2``.\n",
"      * An image wins only if it has strictly more votes than the other image\n",
"        AND strictly more votes than ``both_bad``.\n",
"      * Otherwise there is no winner, and ``chosen``, ``chosen_model``,\n",
"        ``rejected`` and ``rejected_model`` are all set to ``None``.\n",
"\n",
"    Args:\n",
"        batch: dict with the keys ``preference.responses``, ``images``\n",
"            (holding ``image_1``, ``image_2`` and ``prompt``), ``model_1``\n",
"            and ``model_2``.\n",
"\n",
"    Returns:\n",
"        The same dict, updated in place with ``chosen``, ``chosen_model``,\n",
"        ``rejected``, ``rejected_model`` and ``prompt``.\n",
"    \"\"\"\n",
"    # Short model tags stored in model_1 / model_2 -> full model ids.\n",
"    model_ids = {\n",
"        'dev': 'black-forest-labs/FLUX.1-dev',\n",
"        'sd': 'stabilityai/stable-diffusion-3.5-large',\n",
"    }\n",
"\n",
"    votes = Counter()\n",
"    # A missing response list is treated like an empty one (no winner).\n",
"    for response in batch['preference.responses'] or []:\n",
"        if response == 'both_good':\n",
"            votes.update(('image_1', 'image_2'))\n",
"        else:\n",
"            votes[response] += 1\n",
"\n",
"    if votes['image_1'] > votes['image_2'] and votes['image_1'] > votes['both_bad']:\n",
"        chosen_slot, rejected_slot = '1', '2'\n",
"    elif votes['image_2'] > votes['image_1'] and votes['image_2'] > votes['both_bad']:\n",
"        chosen_slot, rejected_slot = '2', '1'\n",
"    else:\n",
"        chosen_slot = rejected_slot = None\n",
"\n",
"    if chosen_slot is None:\n",
"        # No winner: leave every derived field empty. In particular, do not\n",
"        # assign model ids to a pair that has no chosen/rejected image.\n",
"        batch['chosen'] = None\n",
"        batch['chosen_model'] = None\n",
"        batch['rejected'] = None\n",
"        batch['rejected_model'] = None\n",
"    else:\n",
"        chosen_tag = batch['model_' + chosen_slot]\n",
"        rejected_tag = batch['model_' + rejected_slot]\n",
"        # Each side is mapped from its own tag; unknown tags pass through unchanged.\n",
"        batch['chosen'] = batch['images']['image_' + chosen_slot]\n",
"        batch['chosen_model'] = model_ids.get(chosen_tag, chosen_tag)\n",
"        batch['rejected'] = batch['images']['image_' + rejected_slot]\n",
"        batch['rejected_model'] = model_ids.get(rejected_tag, rejected_tag)\n",
"\n",
"    batch['prompt'] = batch['images']['prompt']\n",
"    return batch\n",
"\n",
"\n",
"# Apply the binarization to every record and show the first result.\n",
"ds_formatted = ds.map(get_preference_winner)\n",
"ds_formatted[0]\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Filter: 100%|██████████| 4997/4997 [00:00<00:00, 48227.03 examples/s]\n"
]
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status', 'chosen', 'chosen_model', 'rejected', 'rejected_model', 'prompt'],\n",
" num_rows: 3007\n",
"})"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop records whose 'chosen' field is None.\n",
"ds_formatted_filtered = ds_formatted.filter(\n",
"    lambda record: record['chosen'] is not None\n",
")\n",
"ds_formatted_filtered"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map: 100%|██████████| 1504/1504 [28:41<00:00, 1.14s/ examples]t/s]\n",
"Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 70.73ba/s]\n",
"Map: 100%|██████████| 1503/1503 [27:23<00:00, 1.09s/ examples], 1737.29s/it]\n",
"Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 90.22ba/s]\n",
"Uploading the dataset shards: 100%|██████████| 2/2 [56:40<00:00, 1700.25s/it]\n"
]
},
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized/commit/a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', commit_message='Upload dataset', commit_description='', oid='a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='data-is-better-together/image-preferences-results-binarized'), pr_revision=None, pr_num=None)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datasets import Image\n",
"\n",
"# Columns kept in the published dataset.\n",
"relevant_columns = [\n",
"    'id',\n",
"    'prompt',\n",
"    'chosen',\n",
"    'rejected',\n",
"    'chosen_model',\n",
"    'rejected_model',\n",
"    'evolution',\n",
"    'category',\n",
"    'sub_category',\n",
"]\n",
"\n",
"# Select those columns, then cast 'chosen' and 'rejected' to the Image feature.\n",
"ds_formatted_filtered_columns = (\n",
"    ds_formatted_filtered\n",
"    .select_columns(relevant_columns)\n",
"    .cast_column('chosen', Image())\n",
"    .cast_column('rejected', Image())\n",
")\n",
"ds_formatted_filtered_columns.push_to_hub(\"data-is-better-together/open-image-preferences-v1-binarized\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}