def __getitem__()

in vision/smolvlm2/smolvlm/datasets/builder.py


    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        We ignore 'idx' because each call to __getitem__ gets the “next” packed sample,
        from self.packed_cursor onward.
        """
        if self.packed_cursor >= len(self):
            raise IndexError("No more sub-samples left to pack in PackedConcatDataset.")

        # Accumulate sub-samples
        chunk_input_ids = []
        chunk_labels = []
        chunk_subseq_ids = []
        pixel_key = None
        pixel_values_list = []

        sub_seq_counter = 0
        current_token_count = 0

        while True:
            if self.packed_cursor >= len(self):
                break

            sub_item = super().__getitem__(self.packed_cursor)
            self.packed_cursor += 1

            sub_len = sub_item["input_ids"].size(0)
            if (current_token_count > 0) and (current_token_count + sub_len) > self.cutoff_len:
                # Adding this sub-sample would overflow cutoff_len: put it back for
                # the next packed sample (an over-long *first* sub-sample is still taken whole).
                self.packed_cursor -= 1
                break

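            # Tag every token of this sub-sample with a 1-based subsequence id;
            # these ids become the packed sample's 'attention_mask' below.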
            sub_seq_counter += 1
            seq_id_tensor = torch.full(
                (sub_len,),
                fill_value=sub_seq_counter,
                dtype=torch.long,
                device=sub_item["input_ids"].device
            )

            chunk_input_ids.append(sub_item["input_ids"])
            chunk_labels.append(sub_item["labels"])
            chunk_subseq_ids.append(seq_id_tensor)

            # If images are present
            if "pixel_values" in sub_item:
                pixel_key = "pixel_values"
                pixel_values_list.append(sub_item["pixel_values"])

            current_token_count += sub_len
            print("[Sequence Packing] current num tokens:", current_token_count)
            if current_token_count >= self.cutoff_len:
                break

        if len(chunk_input_ids) == 0:
            # Defensive fallback; should not normally be reached, since the first
            # fetched sub-sample is always appended above.
            return {
                "input_ids": torch.tensor([], dtype=torch.long),
                "labels": torch.tensor([], dtype=torch.long),
                "attention_mask": torch.tensor([], dtype=torch.long),
            }

        # Merge text
        merged_input_ids = torch.cat(chunk_input_ids, dim=0)
        merged_labels = torch.cat(chunk_labels, dim=0)
        merged_subseq_ids = torch.cat(chunk_subseq_ids, dim=0)

        # Merge images along frame dimension if present
        merged_pixel_values = None
        if pixel_key and pixel_values_list:
            merged_pixel_values = torch.cat(pixel_values_list, dim=0)
            # shape => (f1+f2+..., 3, H, W)

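        # Per-subsequence loss weighting: every token in a sub-sample receives a weight
        # derived via len2weight from that sub-sample's count of supervised
        # (non-IGNORE_INDEX) tokens.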
        loss_weight = torch.ones_like(merged_subseq_ids, dtype=torch.float32)
        unique_ids = merged_subseq_ids.unique()
        unique_ids = unique_ids[unique_ids > 0]  # ignore pad=0
        for sid in unique_ids.tolist():
            mask = (merged_subseq_ids == sid)
            num_eff = (merged_labels[mask] != IGNORE_INDEX).sum().item()
            w = len2weight(num_eff, self.data_args.loss_reduction)
            loss_weight[mask] = w
        
        # Build the final packed sample; 'attention_mask' holds the per-token
        # subsequence ids rather than a 0/1 mask.
        out_dict = {
            "input_ids": merged_input_ids,
            "labels": merged_labels,
            "attention_mask": merged_subseq_ids,
            "loss_weight": loss_weight,
        }
        if merged_pixel_values is not None:
            out_dict[pixel_key] = merged_pixel_values

        return out_dict
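
The helpers `len2weight` and `IGNORE_INDEX` are imported from elsewhere in the package and are not shown here. For reference, a hypothetical sketch of such a length-to-weight helper under one common convention (the mode names and formulas are assumptions, not this repo's actual implementation):

    # Hypothetical sketch only; the real len2weight may use different modes/formulas.
    def len2weight_sketch(num_effective_tokens: int, loss_reduction: str) -> float:
        if num_effective_tokens == 0:
            return 0.0
        if loss_reduction == "token":    # weight every supervised token equally
            return 1.0
        if loss_reduction == "sample":   # weight every sub-sample equally
            return 1.0 / num_effective_tokens
        if loss_reduction == "square":   # compromise between the two
            return 1.0 / (num_effective_tokens ** 0.5)
        raise NotImplementedError(loss_reduction)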
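
Since `attention_mask` holds subsequence ids, downstream code has to turn them into an actual attention pattern. A minimal sketch, not taken from this repo, of how those ids could be expanded into a block-diagonal mask so packed sub-samples do not attend to one another (the helper name and the 0-means-padding convention are assumptions):

    import torch

    def subseq_ids_to_block_mask(subseq_ids: torch.Tensor) -> torch.Tensor:
        """subseq_ids: (seq_len,) holding 1, 2, ... per packed sub-sample, 0 for padding.
        Returns a (seq_len, seq_len) bool mask that is True only where the query and
        key tokens belong to the same sub-sample."""
        same = subseq_ids.unsqueeze(0) == subseq_ids.unsqueeze(1)
        not_pad = subseq_ids != 0
        return same & not_pad.unsqueeze(0) & not_pad.unsqueeze(1)

    ids = torch.tensor([1, 1, 1, 2, 2, 0])
    print(subseq_ids_to_block_mask(ids).int())
    # -> a 3x3 block of ones, then a 2x2 block of ones, zeros elsewhere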