src/nanotron/data/dataloader.py [165:185]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    if dist.get_rank(parallel_context.pp_pg) == input_pp_rank
                    else TensorPointer(group_rank=input_pp_rank),
                    "label_ids": torch.randint(
                        0,
                        vocab_size,
                        (micro_batch_size, sequence_length),
                        dtype=torch.long,
                        device="cuda",
                        generator=generator,
                    )[:, local_slice]
                    if dist.get_rank(parallel_context.pp_pg) == output_pp_rank
                    else TensorPointer(group_rank=output_pp_rank),
                    "label_mask": torch.ones(
                        micro_batch_size,
                        sequence_length,
                        dtype=torch.bool,
                        device="cuda",
                    )[:, local_slice]
                    if dist.get_rank(parallel_context.pp_pg) == output_pp_rank
                    else TensorPointer(group_rank=output_pp_rank),
                }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
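The excerpt above builds a dummy batch dict in which each tensor is materialized only on the pipeline rank that owns it; every other rank receives a TensorPointer naming the owning rank, so the pipeline engine knows where to fetch the data. Below is a minimal CPU-only sketch of that placeholder pattern, assuming a stand-in TensorPointer dataclass and a plain integer rank in place of dist.get_rank(parallel_context.pp_pg); make_dummy_batch is a hypothetical name, not nanotron API, and the [:, local_slice] sharding is left out here for brevity.

from dataclasses import dataclass

import torch


@dataclass
class TensorPointer:
    # Marks "this tensor lives on another pipeline rank"; group_rank
    # tells the pipeline engine which rank actually holds the data.
    group_rank: int


def make_dummy_batch(
    current_pp_rank: int,
    input_pp_rank: int,
    output_pp_rank: int,
    micro_batch_size: int = 2,
    sequence_length: int = 8,
    vocab_size: int = 32,
) -> dict:
    def on(rank, build):
        # Materialize the tensor only on the owning rank; every other
        # rank carries a lightweight pointer to that rank instead.
        return build() if current_pp_rank == rank else TensorPointer(group_rank=rank)

    return {
        "input_ids": on(
            input_pp_rank,
            lambda: torch.randint(0, vocab_size, (micro_batch_size, sequence_length)),
        ),
        "label_ids": on(
            output_pp_rank,
            lambda: torch.randint(0, vocab_size, (micro_batch_size, sequence_length)),
        ),
        "label_mask": on(
            output_pp_rank,
            lambda: torch.ones(micro_batch_size, sequence_length, dtype=torch.bool),
        ),
    }


# On the rank holding the inputs (0) the labels are pointers, and the
# reverse holds on the rank holding the outputs (3).
batch = make_dummy_batch(current_pp_rank=0, input_pp_rank=0, output_pp_rank=3)
assert isinstance(batch["input_ids"], torch.Tensor)
assert isinstance(batch["label_ids"], TensorPointer)

Returning a pointer instead of allocating the tensor keeps non-owning ranks from spending memory on data they never read.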



src/nanotron/data/dataloader.py [205:225]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    if dist.get_rank(parallel_context.pp_pg) == input_pp_rank
                    else TensorPointer(group_rank=input_pp_rank),
                    "label_ids": torch.randint(
                        0,
                        vocab_size,
                        (micro_batch_size, sequence_length),
                        dtype=torch.long,
                        device="cuda",
                        generator=generator,
                    )[:, local_slice]
                    if dist.get_rank(parallel_context.pp_pg) == output_pp_rank
                    else TensorPointer(group_rank=output_pp_rank),
                    "label_mask": torch.ones(
                        micro_batch_size,
                        sequence_length,
                        dtype=torch.bool,
                        device="cuda",
                    )[:, local_slice]
                    if dist.get_rank(parallel_context.pp_pg) == output_pp_rank
                    else TensorPointer(group_rank=output_pp_rank),
                }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
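The two ranges above ([165:185] and [205:225]) are byte-identical, and within each one the same materialize-or-point conditional is written out for the input tensor, label_ids, and label_mask. A hedged sketch of factoring that conditional into one helper follows, including the [:, local_slice] indexing, which appears to select this rank's shard of the sequence dimension; _owned_slice and the stand-in TensorPointer here are illustrative assumptions, not nanotron API.

from dataclasses import dataclass
from typing import Callable

import torch


@dataclass
class TensorPointer:
    group_rank: int  # pipeline rank that actually holds the tensor


def _owned_slice(
    build: Callable[[], torch.Tensor],
    owner_rank: int,
    current_rank: int,
    local_slice: slice,
):
    # Materialize and shard the sequence dimension only on the owning
    # rank; every other rank returns a pointer to that rank.
    if current_rank == owner_rank:
        return build()[:, local_slice]
    return TensorPointer(group_rank=owner_rank)


# Worked example: shard 1 of a 4-way split over a length-16 sequence
# keeps positions 4..8 of the labels it owns.
seq_len, n_shards, shard = 16, 4, 1
local_len = seq_len // n_shards
local_slice = slice(shard * local_len, (shard + 1) * local_len)

labels = _owned_slice(
    build=lambda: torch.randint(0, 32, (2, seq_len)),
    owner_rank=1,
    current_rank=1,
    local_slice=local_slice,
)
assert labels.shape == (2, local_len)

With such a helper, each duplicated dict entry collapses to a single _owned_slice(...) call, which would also remove the duplication shown above.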
