In [None]:
!pip install datasets

## Load and transform the dataset

First, we load the dataset.


In [1]:
from datasets import load_dataset

# Load the "train" split of the annotated image-preference results.
ds = load_dataset(
    "data-is-better-together/image-preferences-v1-results",
    split="train",
)
ds  # last expression: display the dataset summary (features, row count)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status'],
    num_rows: 5000
})

In [2]:
# Inspect the first record to see what a single example looks like.
ds[0]

{'id': '3368-quality',
 'status': 'completed',
 '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',
 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',
  'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',
  'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},
 'model_1': 'dev',
 'model_2': 'sd',
 'evolution': 'quality',
 'category': 'Manga',
 'sub_category': 'detailed',
 'preference.responses': ['both_good', 'image_1', 'image_1'],
 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',
  'caf19767-2989-4b3c-a653-9c30afc6361d',
  'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],
 'pref

In [3]:
# Keep only the records whose 'preference.responses' field is not None.
ds = ds.filter(
    lambda row: row['preference.responses'] is not None
)

In [9]:
from collections import Counter

def get_preference_winner(batch):
    """Binarize one example's annotator votes into a chosen/rejected pair.

    Despite the parameter name, the body treats ``batch`` as a single example
    (a mapping of column name to value), not as a batch of examples.

    Voting rules:
      * ``'both_good'`` counts as one vote for ``image_1`` and one for ``image_2``.
      * An image wins only if it has strictly more votes than the other image
        AND strictly more votes than ``'both_bad'``.
      * Otherwise there is no winner, and ``chosen``, ``rejected``,
        ``chosen_model`` and ``rejected_model`` are all set to ``None``.

    Args:
        batch: Example with the keys ``'preference.responses'`` (iterable of
            vote labels), ``'images'`` (mapping with ``'image_1'``,
            ``'image_2'`` and ``'prompt'``), ``'model_1'`` and ``'model_2'``.

    Returns:
        The same mapping, mutated in place, with the added keys ``'chosen'``,
        ``'rejected'``, ``'chosen_model'``, ``'rejected_model'`` and ``'prompt'``.
    """
    flux_dev = 'black-forest-labs/FLUX.1-dev'
    sd_large = 'stabilityai/stable-diffusion-3.5-large'

    # Tally the votes; a 'both_good' vote is credited to both images.
    counts = Counter()
    for response in batch['preference.responses']:
        if response == 'both_good':
            counts['image_1'] += 1
            counts['image_2'] += 1
        else:
            counts[response] += 1

    images = batch['images']
    if counts['image_1'] > counts['image_2'] and counts['image_1'] > counts['both_bad']:
        batch['chosen'] = images['image_1']
        batch['chosen_model'] = batch["model_1"]
        batch['rejected'] = images['image_2']
        batch['rejected_model'] = batch["model_2"]
    elif counts['image_2'] > counts['image_1'] and counts['image_2'] > counts['both_bad']:
        batch['chosen'] = images['image_2']
        batch['chosen_model'] = batch["model_2"]
        batch['rejected'] = images['image_1']
        batch['rejected_model'] = batch["model_1"]
    else:
        # Tie between the images, or 'both_bad' was not outvoted: no winner.
        batch['chosen'] = None
        batch['chosen_model'] = None
        batch['rejected'] = None
        batch['rejected_model'] = None

    batch["prompt"] = images["prompt"]

    # Replace the short model tags with full model identifiers, but only when
    # there is a winner. Without this guard a no-winner example (chosen_model
    # is None) fell into the `else` branch and was labelled as if the
    # stable-diffusion image had been chosen.
    # NOTE(review): assumes each pair is 'dev' versus one other model -- confirm.
    if batch['chosen_model'] is not None:
        if batch['chosen_model'] == 'dev':
            batch['chosen_model'] = flux_dev
            batch['rejected_model'] = sd_large
        else:
            batch['rejected_model'] = flux_dev
            batch['chosen_model'] = sd_large

    return batch


# Run the binarization over the dataset (note: `map` is called without `batched=True`).
ds_formatted = ds.map(get_preference_winner)
# Show the first mapped record to check the newly added fields.
ds_formatted[0]


Map: 100%|██████████| 4997/4997 [00:00<00:00, 12626.85 examples/s]


{'id': '3368-quality',
 'status': 'completed',
 '_server_id': 'c2306976-5e44-4ad4-b2ce-8a510ec6086b',
 'images': {'image_1': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_dev/3368.jpg',
  'image_2': 'https://huggingface.co/datasets/data-is-better-together/image-preferences-filtered/resolve/main/image_quality_sd/3368.jpg',
  'prompt': 'a bustling manga street, devoid of vehicles, detailed with vibrant colors and dynamic line work, characters in the background adding life and movement, under a soft golden hour light, with rich textures and a lively atmosphere, high resolution, sharp focus'},
 'model_1': 'dev',
 'model_2': 'sd',
 'evolution': 'quality',
 'category': 'Manga',
 'sub_category': 'detailed',
 'preference.responses': ['both_good', 'image_1', 'image_1'],
 'preference.responses.users': ['50b9a890-173b-4999-bffa-fc0524ba6c63',
  'caf19767-2989-4b3c-a653-9c30afc6361d',
  'ae3e20b2-9aeb-4165-af54-69eac3f2448b'],
 'pref

In [10]:
# Discard the examples that ended up without a winner, i.e. 'chosen' is None.
ds_formatted_filtered = ds_formatted.filter(
    lambda row: row['chosen'] is not None
)
ds_formatted_filtered

Filter: 100%|██████████| 4997/4997 [00:00<00:00, 48227.03 examples/s]


Dataset({
    features: ['id', 'status', '_server_id', 'images', 'model_1', 'model_2', 'evolution', 'category', 'sub_category', 'preference.responses', 'preference.responses.users', 'preference.responses.status', 'chosen', 'chosen_model', 'rejected', 'rejected_model', 'prompt'],
    num_rows: 3007
})

In [11]:
from datasets import Image
# Columns kept in the published, binarized dataset.
relevant_columns = [
    'id',
    'prompt',
    'chosen',
    'rejected',
    'chosen_model',
    'rejected_model',
    'evolution',
    'category',
    'sub_category',
]

# Keep only those columns, then cast the two image columns to the `Image` feature.
ds_formatted_filtered_columns = (
    ds_formatted_filtered
    .select_columns(relevant_columns)
    .cast_column('chosen', Image())
    .cast_column('rejected', Image())
)

# Publish the result.
# NOTE(review): the saved output of this cell reports
# repo_id='data-is-better-together/image-preferences-results-binarized', which
# differs from the id passed here -- the output looks stale; confirm the target.
ds_formatted_filtered_columns.push_to_hub("data-is-better-together/open-image-preferences-v1-binarized")


Map: 100%|██████████| 1504/1504 [28:41<00:00,  1.14s/ examples]t/s]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 70.73ba/s]
Map: 100%|██████████| 1503/1503 [27:23<00:00,  1.09s/ examples], 1737.29s/it]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 90.22ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [56:40<00:00, 1700.25s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized/commit/a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', commit_message='Upload dataset', commit_description='', oid='a1b48e23f0d2bbb0339d4d0a8a6f0dc6b59cc5e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/data-is-better-together/image-preferences-results-binarized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='data-is-better-together/image-preferences-results-binarized'), pr_revision=None, pr_num=None)