community-efforts/image_preferences/03_upload_to_argilla.ipynb (1,326 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Upload the dataset to Argilla\n",
"\n",
"Before we can upload the dataset to Argilla, we need to apply some transformations and postprocessing.\n",
"\n",
"## Load and transform the dataset\n",
"\n",
"First, we load the dataset.\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1b887feda7741138af1676b4a60d393",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/39 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6f7eba6dd38e4a218a13daa73aa2c4c2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "552baba457024d3f86ba38463cc75674",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/39 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3cd59c197bf84e10b58fb6e296126383",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d229be2519a14734b57504a4494534f1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading dataset shards: 0%| | 0/21 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quality_prompt</th>\n",
" <th>category</th>\n",
" <th>subcategory</th>\n",
" <th>style_prompt</th>\n",
" <th>simplified_prompt</th>\n",
" <th>__index_level_0__</th>\n",
" <th>grouped_model_name</th>\n",
" <th>prompt</th>\n",
" <th>distilabel_metadata</th>\n",
" <th>image_quality_dev</th>\n",
" <th>image_simplified_dev</th>\n",
" <th>image_quality_sd</th>\n",
" <th>image_simplified_sd</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>a harp without strings, in an anime style, wit...</td>\n",
" <td>Anime</td>\n",
" <td>anime style</td>\n",
" <td>a harp without strings, in an anime style, wit...</td>\n",
" <td>A harp without strings, in anime style, with i...</td>\n",
" <td>1</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a harp without any strings</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A majestic horse perched on an astronaut's sho...</td>\n",
" <td>Cinematic</td>\n",
" <td>bokeh</td>\n",
" <td>A horse perched on an astronaut's shoulders, c...</td>\n",
" <td>A horse on an astronaut's shoulders, with cine...</td>\n",
" <td>2</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>A horse sitting on an astronaut's shoulders.</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a 3D model of a minimalist, abstract financial...</td>\n",
" <td>3D Model</td>\n",
" <td>volumetric</td>\n",
" <td>a 3D model of a minimalist, abstract financial...</td>\n",
" <td>3D model of an abstract financial landscape wi...</td>\n",
" <td>4</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a real photography that is minamalistic and th...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Tempera painting of a fragmented, distorted sc...</td>\n",
" <td>Painting</td>\n",
" <td>Tempera</td>\n",
" <td>Tempera painting of a fragmented scene from th...</td>\n",
" <td>Tempera painting with distorted, blurred fragm...</td>\n",
" <td>15</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>digital art in the style of epic Surreal, abst...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a crown, captured in sharp black and white wit...</td>\n",
" <td>Photographic</td>\n",
" <td>Fomapan</td>\n",
" <td>a crown, captured in sharp black and white, wi...</td>\n",
" <td>A sharp, black and white crown with high contr...</td>\n",
" <td>16</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a crown</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8784</th>\n",
" <td>Cristiano Ronaldo plays chess with Shrek, clay...</td>\n",
" <td>Animation</td>\n",
" <td>Claymation</td>\n",
" <td>Cristiano Ronaldo Plays Chess with Shrek, clay...</td>\n",
" <td>Cristiano Ronaldo plays chess with Shrek in a ...</td>\n",
" <td>14594</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>Cristiano Ronaldo Plays Chess with Shrek, intr...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8785</th>\n",
" <td>a highly detailed red panda, ultra-sharp focus...</td>\n",
" <td>Photographic</td>\n",
" <td>highly detailed</td>\n",
" <td>a highly detailed red panda, ultra-sharp focus...</td>\n",
" <td>A detailed red panda with vibrant colors in st...</td>\n",
" <td>14595</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a cute red panda high qualtity</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8786</th>\n",
" <td>Pop Art: Bold, vibrant ship sails serene, icy ...</td>\n",
" <td>Painting</td>\n",
" <td>Pop Art</td>\n",
" <td>Pop Art: Bold, vibrant ship sails serene, icy ...</td>\n",
" <td>Pop Art: Bold ship sails icy waters, surrounde...</td>\n",
" <td>14596</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>An epic, serene oil painting: Majestic ship sa...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8787</th>\n",
" <td>Stylized vector animation of primitive people ...</td>\n",
" <td>Animation</td>\n",
" <td>Vector</td>\n",
" <td>Stylized vector animation of primitive people ...</td>\n",
" <td>Stylized vector animation of primitive people ...</td>\n",
" <td>14597</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>Nice photo about primitive people in the forest</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8788</th>\n",
" <td>iridescent sinewy male in sleek black scifi ar...</td>\n",
" <td>Animation</td>\n",
" <td>Stop motion</td>\n",
" <td>iridescent sinewy male in sleek black scifi ar...</td>\n",
" <td>Iridescent male in sleek black sci-fi armor, s...</td>\n",
" <td>14599</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>iridescent sinewy smooth muscular male sleek g...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8789 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" quality_prompt category \\\n",
"0 a harp without strings, in an anime style, wit... Anime \n",
"1 A majestic horse perched on an astronaut's sho... Cinematic \n",
"2 a 3D model of a minimalist, abstract financial... 3D Model \n",
"3 Tempera painting of a fragmented, distorted sc... Painting \n",
"4 a crown, captured in sharp black and white wit... Photographic \n",
"... ... ... \n",
"8784 Cristiano Ronaldo plays chess with Shrek, clay... Animation \n",
"8785 a highly detailed red panda, ultra-sharp focus... Photographic \n",
"8786 Pop Art: Bold, vibrant ship sails serene, icy ... Painting \n",
"8787 Stylized vector animation of primitive people ... Animation \n",
"8788 iridescent sinewy male in sleek black scifi ar... Animation \n",
"\n",
" subcategory style_prompt \\\n",
"0 anime style a harp without strings, in an anime style, wit... \n",
"1 bokeh A horse perched on an astronaut's shoulders, c... \n",
"2 volumetric a 3D model of a minimalist, abstract financial... \n",
"3 Tempera Tempera painting of a fragmented scene from th... \n",
"4 Fomapan a crown, captured in sharp black and white, wi... \n",
"... ... ... \n",
"8784 Claymation Cristiano Ronaldo Plays Chess with Shrek, clay... \n",
"8785 highly detailed a highly detailed red panda, ultra-sharp focus... \n",
"8786 Pop Art Pop Art: Bold, vibrant ship sails serene, icy ... \n",
"8787 Vector Stylized vector animation of primitive people ... \n",
"8788 Stop motion iridescent sinewy male in sleek black scifi ar... \n",
"\n",
" simplified_prompt __index_level_0__ \\\n",
"0 A harp without strings, in anime style, with i... 1 \n",
"1 A horse on an astronaut's shoulders, with cine... 2 \n",
"2 3D model of an abstract financial landscape wi... 4 \n",
"3 Tempera painting with distorted, blurred fragm... 15 \n",
"4 A sharp, black and white crown with high contr... 16 \n",
"... ... ... \n",
"8784 Cristiano Ronaldo plays chess with Shrek in a ... 14594 \n",
"8785 A detailed red panda with vibrant colors in st... 14595 \n",
"8786 Pop Art: Bold ship sails icy waters, surrounde... 14596 \n",
"8787 Stylized vector animation of primitive people ... 14597 \n",
"8788 Iridescent male in sleek black sci-fi armor, s... 14599 \n",
"\n",
" grouped_model_name \\\n",
"0 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"1 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"2 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"3 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"4 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"... ... \n",
"8784 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8785 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8786 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8787 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8788 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"\n",
" prompt \\\n",
"0 a harp without any strings \n",
"1 A horse sitting on an astronaut's shoulders. \n",
"2 a real photography that is minamalistic and th... \n",
"3 digital art in the style of epic Surreal, abst... \n",
"4 a crown \n",
"... ... \n",
"8784 Cristiano Ronaldo Plays Chess with Shrek, intr... \n",
"8785 a cute red panda high qualtity \n",
"8786 An epic, serene oil painting: Majestic ship sa... \n",
"8787 Nice photo about primitive people in the forest \n",
"8788 iridescent sinewy smooth muscular male sleek g... \n",
"\n",
" distilabel_metadata \\\n",
"0 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"1 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"2 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"3 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"4 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"... ... \n",
"8784 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8785 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8786 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8787 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8788 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"\n",
" image_quality_dev \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_simplified_dev \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_quality_sd \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_simplified_sd \n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
"[8789 rows x 13 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"from datasets import load_dataset\n",
"from datasets.arrow_dataset import Dataset\n",
"from datasets.dataset_dict import DatasetDict, IterableDatasetDict\n",
"from datasets.iterable_dataset import IterableDataset\n",
"import pandas as pd\n",
"import os\n",
"dataset: DatasetDict | Dataset | IterableDatasetDict | IterableDataset = load_dataset(\"data-is-better-together/image-preferences\")\n",
"df = dataset.to_pandas()\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we transform the dataset to match the format required by Argilla."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>subcategory</th>\n",
" <th>__index_level_0__</th>\n",
" <th>grouped_model_name</th>\n",
" <th>prompt</th>\n",
" <th>distilabel_metadata</th>\n",
" <th>image_quality_dev</th>\n",
" <th>image_simplified_dev</th>\n",
" <th>image_quality_sd</th>\n",
" <th>image_simplified_sd</th>\n",
" <th>generation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Anime</td>\n",
" <td>anime style</td>\n",
" <td>1</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a harp without any strings</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[a harp without strings, in an anime style, wi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Cinematic</td>\n",
" <td>bokeh</td>\n",
" <td>2</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>A horse sitting on an astronaut's shoulders.</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[A majestic horse perched on an astronaut's sh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3D Model</td>\n",
" <td>volumetric</td>\n",
" <td>4</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a real photography that is minamalistic and th...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[a 3D model of a minimalist, abstract financia...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Painting</td>\n",
" <td>Tempera</td>\n",
" <td>15</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>digital art in the style of epic Surreal, abst...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[Tempera painting of a fragmented, distorted s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Photographic</td>\n",
" <td>Fomapan</td>\n",
" <td>16</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a crown</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[a crown, captured in sharp black and white wi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8784</th>\n",
" <td>Animation</td>\n",
" <td>Claymation</td>\n",
" <td>14594</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>Cristiano Ronaldo Plays Chess with Shrek, intr...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[Cristiano Ronaldo plays chess with Shrek, cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8785</th>\n",
" <td>Photographic</td>\n",
" <td>highly detailed</td>\n",
" <td>14595</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>a cute red panda high qualtity</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[a highly detailed red panda, ultra-sharp focu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8786</th>\n",
" <td>Painting</td>\n",
" <td>Pop Art</td>\n",
" <td>14596</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>An epic, serene oil painting: Majestic ship sa...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[Pop Art: Bold, vibrant ship sails serene, icy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8787</th>\n",
" <td>Animation</td>\n",
" <td>Vector</td>\n",
" <td>14597</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>Nice photo about primitive people in the forest</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[Stylized vector animation of primitive people...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8788</th>\n",
" <td>Animation</td>\n",
" <td>Stop motion</td>\n",
" <td>14599</td>\n",
" <td>[https://f94i5ss7a040r0v5.us-east-1.aws.endpoi...</td>\n",
" <td>iridescent sinewy smooth muscular male sleek g...</td>\n",
" <td>{'raw_input_image_gen_quality_dev': {'prompt':...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>{'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x...</td>\n",
" <td>[iridescent sinewy male in sleek black scifi a...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8789 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" category subcategory __index_level_0__ \\\n",
"0 Anime anime style 1 \n",
"1 Cinematic bokeh 2 \n",
"2 3D Model volumetric 4 \n",
"3 Painting Tempera 15 \n",
"4 Photographic Fomapan 16 \n",
"... ... ... ... \n",
"8784 Animation Claymation 14594 \n",
"8785 Photographic highly detailed 14595 \n",
"8786 Painting Pop Art 14596 \n",
"8787 Animation Vector 14597 \n",
"8788 Animation Stop motion 14599 \n",
"\n",
" grouped_model_name \\\n",
"0 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"1 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"2 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"3 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"4 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"... ... \n",
"8784 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8785 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8786 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8787 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"8788 [https://f94i5ss7a040r0v5.us-east-1.aws.endpoi... \n",
"\n",
" prompt \\\n",
"0 a harp without any strings \n",
"1 A horse sitting on an astronaut's shoulders. \n",
"2 a real photography that is minamalistic and th... \n",
"3 digital art in the style of epic Surreal, abst... \n",
"4 a crown \n",
"... ... \n",
"8784 Cristiano Ronaldo Plays Chess with Shrek, intr... \n",
"8785 a cute red panda high qualtity \n",
"8786 An epic, serene oil painting: Majestic ship sa... \n",
"8787 Nice photo about primitive people in the forest \n",
"8788 iridescent sinewy smooth muscular male sleek g... \n",
"\n",
" distilabel_metadata \\\n",
"0 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"1 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"2 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"3 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"4 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"... ... \n",
"8784 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8785 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8786 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8787 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"8788 {'raw_input_image_gen_quality_dev': {'prompt':... \n",
"\n",
" image_quality_dev \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_simplified_dev \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_quality_sd \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" image_simplified_sd \\\n",
"0 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"1 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"2 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"3 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"4 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"... ... \n",
"8784 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8785 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8786 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8787 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"8788 {'bytes': b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x... \n",
"\n",
" generation \n",
"0 [a harp without strings, in an anime style, wi... \n",
"1 [A majestic horse perched on an astronaut's sh... \n",
"2 [a 3D model of a minimalist, abstract financia... \n",
"3 [Tempera painting of a fragmented, distorted s... \n",
"4 [a crown, captured in sharp black and white wi... \n",
"... ... \n",
"8784 [Cristiano Ronaldo plays chess with Shrek, cla... \n",
"8785 [a highly detailed red panda, ultra-sharp focu... \n",
"8786 [Pop Art: Bold, vibrant ship sails serene, icy... \n",
"8787 [Stylized vector animation of primitive people... \n",
"8788 [iridescent sinewy male in sleek black scifi a... \n",
"\n",
"[8789 rows x 11 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prompt_columns = df.columns.where(df.columns.str.endswith(\"_prompt\")).dropna()\n",
"df[\"generation\"] = df[prompt_columns].apply(lambda row: row.values, axis=1)\n",
"df = df.drop(columns=prompt_columns)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We also extract the categories and subcategories from the dataset. These will be used as metadata properties in Argilla."
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['Anime',\n",
" 'Cinematic',\n",
" '3D Model',\n",
" 'Painting',\n",
" 'Photographic',\n",
" 'Illustration',\n",
" 'Manga',\n",
" 'Fantasy art',\n",
" 'Digital art',\n",
" 'Pixel art',\n",
" 'Neonpunk',\n",
" 'Animation'],\n",
" ['anime style',\n",
" 'bokeh',\n",
" 'volumetric',\n",
" 'Tempera',\n",
" 'Fomapan',\n",
" 'grainy',\n",
" 'dramatic lighting',\n",
" 'moody',\n",
" 'Fantasy',\n",
" 'Digital',\n",
" 'Ink Wash',\n",
" 'high-energy',\n",
" 'Comics',\n",
" 'majestic',\n",
" 'iconic',\n",
" 'painterly',\n",
" '8-bit graphics',\n",
" 'key visual',\n",
" 'epic',\n",
" 'detailed',\n",
" 'Sketch',\n",
" 'highly detailed',\n",
" 'Ektar',\n",
" 'dark purple shadows',\n",
" 'pixel art style',\n",
" 'dreamy',\n",
" 'octane render',\n",
" 'ultramodern',\n",
" 'Renaissance',\n",
" 'ethereal',\n",
" 'Landscape',\n",
" 'blocky',\n",
" 'low-res',\n",
" 'Pop Art',\n",
" 'Anime',\n",
" 'magical',\n",
" 'vibrant',\n",
" 'Hand-drawn',\n",
" 'film grain',\n",
" 'Pixel Art',\n",
" 'Oil',\n",
" 'Animation',\n",
" 'Mural',\n",
" 'Watercolor',\n",
" 'Impressionism',\n",
" 'studio anime',\n",
" 'Expressionism',\n",
" 'Vector',\n",
" 'CineStill 50D',\n",
" 'ColorPlus',\n",
" '4k',\n",
" 'magenta highlights',\n",
" 'Gold',\n",
" 'Realism',\n",
" 'illustrative',\n",
" 'matte painting',\n",
" 'Advertising',\n",
" 'harmonious',\n",
" 'sleek',\n",
" 'professional',\n",
" 'T-Max',\n",
" 'neon',\n",
" 'Japanese comic style',\n",
" 'high contrast',\n",
" 'Concept Art',\n",
" 'magnificent',\n",
" 'intricate',\n",
" 'Whiteboard',\n",
" 'digital artwork',\n",
" 'Fresco',\n",
" 'Technical',\n",
" 'emotional',\n",
" 'Stop motion',\n",
" 'Ektachrome',\n",
" 'Gouache',\n",
" 'celestial',\n",
" 'CineStill',\n",
" 'high budget',\n",
" 'StreetPan',\n",
" 'vaporwave',\n",
" 'Design',\n",
" 'gorgeous',\n",
" 'vibes',\n",
" 'C200',\n",
" 'Acrylic',\n",
" 'Spray Paint',\n",
" 'crisp',\n",
" 'cinemascope',\n",
" 'Portrait',\n",
" 'stunningly beautiful',\n",
" 'HP5',\n",
" 'cinematic',\n",
" 'Portra',\n",
" 'cyberpunk',\n",
" 'Claymation',\n",
" 'Provia',\n",
" 'ultra detailed',\n",
" 'Fashion',\n",
" 'fantasy art',\n",
" 'Editorial',\n",
" 'cover art',\n",
" 'Romanticism',\n",
" 'Baroque',\n",
" 'Manga',\n",
" 'Scientific',\n",
" 'Cutout',\n",
" 'Delta',\n",
" 'Minimalism',\n",
" 'Encaustic',\n",
" 'Cubism',\n",
" 'Storyboard',\n",
" 'film',\n",
" 'Surrealism',\n",
" 'Superia',\n",
" 'vignette',\n",
" 'Macro',\n",
" 'CineStill 800T',\n",
" 'Mixed Media',\n",
" 'Book',\n",
" 'Velvia',\n",
" 'Tri-X'])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categories = list(df_filtered.category.unique())\n",
"sub_categories = list(df_filtered.subcategory.unique())\n",
"categories, sub_categories\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Upload the dataset to Argilla\n",
"\n",
"We will use the Argilla Python client to upload the dataset to Argilla. We assume you are familiar with Argilla and how to deploy it. If not, please refer to the [Argilla quickstart documentation](https://docs.argilla.io/dev/getting_started/quickstart/)."
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"import argilla as rg\n",
"import os\n",
"\n",
"client = rg.Argilla(\n",
" api_url=\"https://data-is-better-together-image-preferences.hf.space\",\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's first generate some embeddings for the prompts."
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a262bc3f3991491bb2ce075bde653bcf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.safetensors: 0%| | 0.00/30.2M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5981aa9847304a01beb69d54bcaf9e03",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"README.md: 0%| | 0.00/271k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dca39b27df2348f3b050a538be46cf94",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/142 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fd720008adef4d059a3eb9ae9c03cd4a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/684k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from model2vec import StaticModel\n",
"\n",
"model_name = \"minishlab/potion-base-8M\"\n",
"model = StaticModel.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/9t/msy700h16jz3q35qvg4z1ln40000gn/T/ipykernel_81166/298091170.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_filtered[\"prompt_embedding\"] = model.encode(generations, max_length=512).tolist()\n"
]
}
],
"source": [
"generations = [gen[0] for gen in df[\"generation\"]]\n",
"df[\"prompt_embedding\"] = model.encode(generations, max_length=512).tolist()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will now define the settings for the dataset. We will use a custom template for the representation of the images and a label question for the preference. Additionally, we will define the metadata properties for the dataset."
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/davidberenstein/Library/Caches/pypoetry/virtualenvs/api-nlp-wikification-weaviate-ZdxY32E0-py3.10/lib/python3.10/site-packages/argilla/datasets/_resource.py:264: UserWarning: Workspace not provided. Using default workspace: argilla id: 3486a24c-0ec7-4f8a-b0c2-b721d9b02484\n",
" warnings.warn(f\"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}\")\n"
]
},
{
"data": {
"text/plain": [
"Dataset(id=UUID('f3ad715c-5a03-4193-8ac0-c968d8a40aab') inserted_at=datetime.datetime(2024, 11, 25, 21, 29, 25, 650044) updated_at=datetime.datetime(2024, 11, 25, 21, 29, 27, 273793) name='image_preferences' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('3486a24c-0ec7-4f8a-b0c2-b721d9b02484') last_activity_at=datetime.datetime(2024, 11, 25, 21, 29, 27, 273793))"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"settings = rg.Settings(\n",
" fields=[\n",
" rg.CustomField(\"images\", template=\"template.html\"),\n",
" ],\n",
" questions=[\n",
" rg.LabelQuestion(\n",
" name=\"preference\",\n",
" title=\"Image preference\",\n",
" description=\"Which one is the overall better image (prompt adherence, semantics, and aesthetics)?\",\n",
" labels={\"image_1\": \"Image 1 is better\", \"image_2\": \"Image 2 is better\", \"both_good\": \"Both images are good\", \"both_bad\": \"Both images are bad\", \"toxic_content\": \"Toxic content ⚠️\"},\n",
" ),\n",
" ],\n",
" metadata=[\n",
" rg.TermsMetadataProperty(\n",
" name=\"model_1\",\n",
" options=[\n",
" \"dev\",\n",
" \"sd\",\n",
" ]\n",
" ),\n",
" rg.TermsMetadataProperty(\n",
" name=\"model_2\",\n",
" options=[\n",
" \"dev\",\n",
" \"sd\",\n",
" ]\n",
" ),\n",
" rg.TermsMetadataProperty(\n",
" name=\"evolution\",\n",
" options=[\n",
" \"simplified\",\n",
" \"quality\" \n",
" ]\n",
" ),\n",
" ],\n",
" vectors=[rg.VectorField(name=\"prompt\", dimensions=256)],\n",
" allow_extra_metadata=True,\n",
")\n",
"dataset = rg.Dataset(\n",
" name=\"image_preferences\",\n",
" settings=settings,\n",
")\n",
"dataset.create()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'simplified'"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evolution"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"from importlib import metadata\n",
"import requests\n",
"from io import BytesIO\n",
"import random\n",
"import base64\n",
"from PIL import Image\n",
"\n",
"dataset_name = \"data-is-better-together/open-image-preferences-v1\"\n",
"output_dir = \"images\"\n",
"\n",
"\n",
"def make_image_from_url(idx, image_column):\n",
" base_url = f\"https://huggingface.co/datasets/{dataset_name}/resolve/main\"\n",
" full_url = \"/\".join([base_url, image_column, f\"{idx}.jpg\"])\n",
" return full_url\n",
"\n",
"def make_image_from_bytes(bytes_data: bytes):\n",
" base64_data = base64.b64encode(bytes_data).decode('utf-8')\n",
" return base64_data\n",
"\n",
"records = []\n",
"for i, row in df.iterrows():\n",
" for source, generation in zip(prompt_columns, row[\"generation\"]):\n",
" evolution = source.split(\"_\")[0]\n",
" image_columns = df.columns.where(\n",
" df.columns.str.startswith(f\"image_{evolution}\")\n",
" ).dropna()\n",
" if not image_columns.empty:\n",
" image_columns = image_columns.tolist()\n",
" random.shuffle(image_columns) \n",
" image_1 = row[\"__index_level_0__\"]\n",
" image_2 = row[\"__index_level_0__\"]\n",
" image_1 = make_image_from_url(image_1, image_columns[0])\n",
" image_2 = make_image_from_url(image_2, image_columns[1])\n",
" # print(image_1, image_2)\n",
" # image_1 = make_image_from_bytes(row[image_columns[0]][\"bytes\"])\n",
" # image_2 = make_image_from_bytes(row[image_columns[1]][\"bytes\"])\n",
" model_1 = image_columns[0].split(\"_\")[-1]\n",
" model_2 = image_columns[1].split(\"_\")[-1]\n",
"\n",
" record = rg.Record(\n",
" id=f\"{row['__index_level_0__']}-{evolution}\",\n",
" fields={\n",
" \"images\": {\n",
" \"image_1\": image_1,\n",
" \"image_2\": image_2,\n",
" \"prompt\": generation,\n",
" }\n",
" },\n",
" metadata={\n",
" \"model_1\": model_1,\n",
" \"model_2\": model_2,\n",
" \"evolution\": evolution,\n",
" \"category\": row.category,\n",
" \"sub_category\": row.subcategory,\n",
" },\n",
" vectors={\n",
" \"prompt\": row[\"prompt_embedding\"],\n",
" }\n",
" )\n",
" records.append(record)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"first_5000 = records[:5000]\n",
"random.shuffle(first_5000)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Sending records...: 20batch [00:45, 2.29s/batch] \n"
]
},
{
"data": {
"text/plain": [
"DatasetRecords(Dataset(id=UUID('f3ad715c-5a03-4193-8ac0-c968d8a40aab') inserted_at=datetime.datetime(2024, 11, 25, 21, 29, 25, 650044) updated_at=datetime.datetime(2024, 11, 25, 21, 29, 27, 273793) name='image_preferences' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('3486a24c-0ec7-4f8a-b0c2-b721d9b02484') last_activity_at=datetime.datetime(2024, 11, 25, 21, 29, 27, 273793)))"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.datasets(name=\"image_preferences\").records.log(first_5000)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload the results to Hugging Face"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client.datasets(name=\"image_preferences\").to_hub(\"data-is-better-together/open-image-preferences-v1-results\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}