src/hyperpod_nemo_adapter/collections/data/vision_dataset.py
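The collator below converts raw multimodal samples (each carrying one image and a list of user/assistant text pairs) into chat-style dialogs, attaches the image token only to the first user turn, and delegates tokenization to tokenize_dialogs: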
def __call__(self, samples):
    """Collate raw samples into (dialogs, images) pairs and tokenize them for training."""
    dialogs, images = [], []
    for sample in samples:
        image_list, sample_list = sample["images"], sample["texts"]
        if len(image_list) != 1:
            raise ValueError("Only one image per sample is supported")
        image = image_list[0].convert("RGB")  # the sample's single image, normalized to RGB
        dialog = []
        for sample_dict in sample_list:
            if not dialog:
                # Attach the image token only to the first user turn
                dialog += [
                    {
                        "role": "user",
                        "content": [{"type": "image"}, {"type": "text", "text": sample_dict["user"].strip()}],
                    },
                    {"role": "assistant", "content": [{"type": "text", "text": sample_dict["assistant"].strip()}]},
                ]
            else:
                # Subsequent turns are text-only
                dialog += [
                    {"role": "user", "content": [{"type": "text", "text": sample_dict["user"].strip()}]},
                    {"role": "assistant", "content": [{"type": "text", "text": sample_dict["assistant"].strip()}]},
                ]
        dialogs.append(dialog)
        images.append([image])
    return tokenize_dialogs(dialogs, images, self.processor)
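
A minimal usage sketch follows. The enclosing class name (VisionDataCollator), the constructor signature, and the model id are illustrative assumptions not confirmed by this excerpt; the sketch only assumes that the class stores a Hugging Face processor as self.processor, that tokenize_dialogs applies the processor's chat template to produce model-ready tensors, and that samples follow the {"images": [...], "texts": [...]} layout consumed by __call__ above.

from PIL import Image
from transformers import AutoProcessor

# Illustrative model id; any processor compatible with tokenize_dialogs would do
processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
collator = VisionDataCollator(processor)  # hypothetical class name and constructor

samples = [
    {
        "images": [Image.new("RGB", (224, 224))],  # exactly one image per sample
        "texts": [
            {"user": "What is in this image?", "assistant": "A solid-color square."},
            {"user": "What color is it?", "assistant": "Black."},
        ],
    }
]
batch = collator(samples)  # tokenized dialogs plus image features, ready for training

Attaching the image token only to the first user turn is a common convention for single-image, multi-turn fine-tuning of Llama 3.2 vision models: later turns reference the same image without re-inserting it into the prompt.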