in vision/smolvlm2/smolvlm/datasets/builder.py [0:0]
def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
    """
    'idx' is ignored: each call to __getitem__ returns the next packed sample,
    built from the sub-samples at self.packed_cursor onward.
    """
    if self.packed_cursor >= len(self):
        raise IndexError("No more sub-samples left to pack in PackedConcatDataset.")
    # Accumulate sub-samples until the packed sequence reaches cutoff_len.
    chunk_input_ids = []
    chunk_labels = []
    chunk_subseq_ids = []
    pixel_key = None
    pixel_values_list = []
    sub_seq_counter = 0
    current_token_count = 0
    while True:
        if self.packed_cursor >= len(self):
            break
        sub_item = super().__getitem__(self.packed_cursor)
        self.packed_cursor += 1
        sub_len = sub_item["input_ids"].size(0)
        if (current_token_count > 0) and (current_token_count + sub_len) > self.cutoff_len:
            # This sub-sample does not fit; put it back for the next packed sample.
            self.packed_cursor -= 1
            break
        sub_seq_counter += 1
        # Tag every token of this sub-sample with its 1-based sub-sequence id.
        seq_id_tensor = torch.full(
            (sub_len,),
            fill_value=sub_seq_counter,
            dtype=torch.long,
            device=sub_item["input_ids"].device,
        )
        chunk_input_ids.append(sub_item["input_ids"])
        chunk_labels.append(sub_item["labels"])
        chunk_subseq_ids.append(seq_id_tensor)
        # If images are present, remember the key and collect the frames.
        if "pixel_values" in sub_item:
            pixel_key = "pixel_values"
            pixel_values_list.append(sub_item["pixel_values"])
        current_token_count += sub_len
        # Debug output; noisy at scale, consider gating behind a verbosity flag.
        print("[Sequence Packing] current num tokens:", current_token_count)
        if current_token_count >= self.cutoff_len:
            break
    # Merge text. The empty case is defensive: the first fetched sub-sample is
    # always accepted above, so this branch should be unreachable in practice.
    if len(chunk_input_ids) == 0:
        return {
            "input_ids": torch.tensor([], dtype=torch.long),
            "labels": torch.tensor([], dtype=torch.long),
            "attention_mask": torch.tensor([], dtype=torch.long),
        }
    merged_input_ids = torch.cat(chunk_input_ids, dim=0)
    merged_labels = torch.cat(chunk_labels, dim=0)
    merged_subseq_ids = torch.cat(chunk_subseq_ids, dim=0)
    # Merge images along the frame dimension if present
    merged_pixel_values = None
    if pixel_key and pixel_values_list:
        merged_pixel_values = torch.cat(pixel_values_list, dim=0)
        # shape => (f1 + f2 + ..., 3, H, W)
    # Per-token loss weights: each sub-sequence is weighted by its number of
    # effective (non-ignored) label tokens (see the len2weight sketch below).
    loss_weight = torch.ones_like(merged_subseq_ids, dtype=torch.float32)
    unique_ids = merged_subseq_ids.unique()
    unique_ids = unique_ids[unique_ids > 0]  # ignore pad=0
    for sid in unique_ids.tolist():
        mask = (merged_subseq_ids == sid)
        num_eff = (merged_labels[mask] != IGNORE_INDEX).sum().item()
        w = len2weight(num_eff, self.data_args.loss_reduction)
        loss_weight[mask] = w
    # Build the final packed sample. 'attention_mask' carries the sub-sequence
    # ids rather than a 0/1 mask, so downstream code can keep packed samples
    # from attending to one another (see the mask sketch below).
    out_dict = {
        "input_ids": merged_input_ids,
        "labels": merged_labels,
        "attention_mask": merged_subseq_ids,
        "loss_weight": loss_weight,
    }
    if merged_pixel_values is not None:
        out_dict[pixel_key] = merged_pixel_values
    return out_dict
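
len2weight is imported from elsewhere in the repo and is not shown in this excerpt. A minimal sketch of what it could look like, assuming loss_reduction takes the common values "token", "sample", and "square" (the names and signature here are assumptions, not the repo's confirmed API):

def len2weight(num_effective: int, loss_reduction: str) -> float:
    # Sketch only: maps a sub-sequence's effective token count to a loss weight.
    if num_effective == 0:
        return 0.0
    if loss_reduction == "token":
        return 1.0                           # plain per-token averaging
    if loss_reduction == "sample":
        return 1.0 / num_effective           # each sub-sample contributes equally
    if loss_reduction == "square":
        return 1.0 / (num_effective ** 0.5)  # compromise between the two
    raise NotImplementedError(loss_reduction)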
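
How the sub-sequence ids stored under "attention_mask" are consumed is also outside this excerpt. One plausible collator-side step is to expand them into a block-diagonal boolean mask so tokens attend only within their own sub-sample; the helper name below is hypothetical:

import torch

def subseq_ids_to_block_mask(subseq_ids: torch.Tensor) -> torch.Tensor:
    # subseq_ids: (seq_len,) with 0 for padding and 1..K for packed sub-samples.
    same_seq = subseq_ids.unsqueeze(0) == subseq_ids.unsqueeze(1)  # (seq_len, seq_len)
    not_pad = subseq_ids > 0
    return same_seq & not_pad.unsqueeze(0) & not_pad.unsqueeze(1)

# Example: two packed sub-samples plus one pad position.
ids = torch.tensor([1, 1, 2, 2, 2, 0])
mask = subseq_ids_to_block_mask(ids)  # True only inside the 2x2 and 3x3 blocks

Note also that because __getitem__ ignores idx, the dataset is meant to be read sequentially (e.g., with a sequential sampler) until it raises IndexError.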