community-efforts/image_preferences/04_binarize_preference_results.ipynb (276 lines of code) (raw):

{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and transform the dataset\n", "\n", "First, we load the dataset.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/davidberenstein/Documents/programming/argilla/data-is-better-together/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ "Dataset({\n", " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status'],\n", " num_rows: 5000\n", "})" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", "ds = load_dataset(\"data-is-better-together/image-preferences-v1-results\", split=\"train\")\n", "ds" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '3368-quality',\n", " 'status': 'completed',\n", " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n", " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n", " 'model_1': 'dev',\n", " 'model_2': 'sd',\n", " 'evolution': 'quality',\n", " 'category': 'Manga',\n", " 'sub_category': 'detailed',\n", " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n", " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n", " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n", " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n", " 'preference.responses.status': ['submitted', 'submitted', 'submitted']}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds[0]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "ds = ds.filter(lambda example: example['preference.responses'] is not None)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 4997/4997 [00:00<00:00, 12626.85 examples/s]\n" ] }, { "data": { "text/plain": [ "{'id': '3368-quality',\n", " 'status': 'completed',\n", " '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',\n", " 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", " 'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},\n", " 'model_1': 'dev',\n", " 'model_2': 'sd',\n", " 'evolution': 'quality',\n", " 'category': 'Manga',\n", " 'sub_category': 'detailed',\n", " 'preference.responses': ['both_good', 'image_1', 'image_1'],\n", " 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',\n", " 'caf19767-2989-4b3c-a653-9c30afc6361d',\n", " 'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],\n", " 'preference.responses.status': ['submitted', 'submitted', 'submitted'],\n", " 'chosen': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',\n", " 'chosen_model': 'black-forest-labs/FLUX.1-dev',\n", " 'rejected': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',\n", " 'rejected_model': 'stabilityai/stable-diffusion-3.5-large',\n", " 'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import Counter\n", "\n", "def get_preference_winner(batch):\n", " responses = batch['preference.responses']\n", " cleaned_responses = []\n", " for response in responses:\n", " if response == 'both_good':\n", " cleaned_responses.append('image_1')\n", " cleaned_responses.append('image_2')\n", " else:\n", " cleaned_responses.append(response)\n", " counts = Counter(cleaned_responses)\n", " if counts['image_1'] > counts['image_2'] and counts['image_1'] > counts['both_bad']:\n", " batch['chosen'] = batch['images']['image_1']\n", " batch['chosen_model'] = batch[\"model_1\"]\n", " batch['rejected'] = batch['images']['image_2']\n", " batch['rejected_model'] = batch[\"model_2\"]\n", " elif counts['image_2'] > counts['image_1'] and counts['image_2'] > counts['both_bad']:\n", " batch['chosen'] = batch['images']['image_2']\n", " batch['chosen_model'] = batch[\"model_2\"]\n", " batch['rejected'] = batch['images']['image_1']\n", " batch['rejected_model'] = batch[\"model_1\"]\n", " else:\n", " batch['chosen'] = None\n", " batch['chosen_model'] = None\n", " batch['rejected'] = None\n", " batch['rejected_model'] = None\n", "\n", " batch[\"prompt\"] = batch[\"images\"][\"prompt\"]\n", " \n", " if batch['chosen_model'] == 'dev':\n", " batch['chosen_model'] = 'black-forest-labs/FLUX.1-dev'\n", " batch['rejected_model'] = 'stabilityai/stable-diffusion-3.5-large'\n", " else:\n", " batch['rejected_model'] = 'black-forest-labs/FLUX.1-dev'\n", " batch['chosen_model'] = 'stabilityai/stable-diffusion-3.5-large'\n", " \n", " return batch\n", "\n", "\n", "ds_formatted = ds.map(get_preference_winner)\n", "ds_formatted[0]\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Filter: 100%|██████████| 4997/4997 [00:00<00:00, 48227.03 examples/s]\n" ] }, { "data": { "text/plain": [ "Dataset({\n", " features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status', 'chosen', 'chosen_model', 'rejected', 'rejected_model', 'prompt'],\n", " num_rows: 3007\n", "})" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds_formatted_filtered = ds_formatted.filter(lambda example: example['chosen'] is not None)\n", "ds_formatted_filtered" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 1504/1504 [28:41<00:00, 1.14s/ examples]t/s]\n", "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 70.73ba/s]\n", "Map: 100%|██████████| 1503/1503 [27:23<00:00, 1.09s/ examples], 1737.29s/it]\n", "Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 90.22ba/s]\n", "Uploading the dataset shards: 100%|██████████| 2/2 [56:40<00:00, 1700.25s/it]\n" ] }, { "data": { "text/plain": [ "CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized/commit/a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', commit_message='Upload dataset', commit_description='', oid='a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='data-is-better-together/image-preferences-results-binarized'), pr_revision=None, pr_num=None)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import Image\n", "relevant_columns = ['id', 'prompt', 'chosen', 'rejected', 'chosen_model', 'rejected_model', 'evolution', 'category', 'sub_category']\n", "ds_formatted_filtered_columns = ds_formatted_filtered.select_columns(relevant_columns)\n", "ds_formatted_filtered_columns = ds_formatted_filtered_columns.cast_column('chosen', Image())\n", "ds_formatted_filtered_columns = ds_formatted_filtered_columns.cast_column('rejected', Image())\n", "ds_formatted_filtered_columns.push_to_hub(\"data-is-better-together/open-image-preferences-v1-binarized\")\n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }