def __call__()

in src/hyperpod_nemo_adapter/collections/data/vision_dataset.py [0:0]


    def __call__(self, samples):
        """Turn a batch of raw samples into tokenized single-image dialogs.

        Each sample is read through its ``"images"`` and ``"texts"`` keys.
        Every entry of ``"texts"`` is read through its ``"user"`` and
        ``"assistant"`` keys and contributes one user message followed by one
        assistant message. The ``{"type": "image"}`` placeholder is attached
        only to the first user message of a dialog.

        Args:
            samples: Iterable of mappings with an ``"images"`` sequence holding
                exactly one image and a ``"texts"`` iterable of mappings with
                string ``"user"`` and ``"assistant"`` values. The image must
                provide ``.convert("RGB")`` (presumably a PIL image -- confirm
                against the dataset loader).

        Returns:
            The result of ``tokenize_dialogs(dialogs, images, self.processor)``,
            where ``dialogs`` holds one message list per sample and ``images``
            holds one single-element image list per sample.

        Raises:
            ValueError: If a sample carries more than one image, or no image.
        """
        dialogs, images = [], []
        for sample in samples:
            image_list, sample_list = sample["images"], sample["texts"]
            num_images = len(image_list)
            if num_images > 1:
                raise ValueError("Only support one image per sample")
            if num_images == 0:
                # Without this guard an empty list surfaces as a bare
                # IndexError from image_list[0], hiding the real problem.
                raise ValueError("Each sample must contain exactly one image, got none")
            image = image_list[0].convert("RGB")

            dialog = []
            for turn_index, sample_dict in enumerate(sample_list):
                user_content = [{"type": "text", "text": sample_dict["user"].strip()}]
                if turn_index == 0:
                    # The image placeholder goes in front of the first user
                    # sentence only; later turns are text-only.
                    user_content.insert(0, {"type": "image"})
                dialog.append({"role": "user", "content": user_content})
                dialog.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": sample_dict["assistant"].strip()}],
                    }
                )

            dialogs.append(dialog)
            images.append([image])
        return tokenize_dialogs(dialogs, images, self.processor)