in dpr_scale/utils/utils.py [0:0]
def __iter__(self):
    indices = list(range(len(self.dataset)))  # type: ignore
    if not self.drop_last:
        # Pad with repeated samples so the index list is evenly
        # divisible across all replicas.
        padding_size = self.total_size - len(indices)
        if padding_size <= len(indices):
            indices += indices[:padding_size]
        else:
            # Padding exceeds the dataset size: tile the indices enough
            # times, then trim to the exact padding length.
            indices += (indices * math.ceil(padding_size / len(indices)))[
                :padding_size
            ]
    else:
        # Drop the tail of the data to make it evenly divisible.
        indices = indices[: self.total_size]
    assert len(indices) == self.total_size

    # Carve out this node's contiguous chunk of the global index list.
    chunk_size = self.num_samples * self.num_replicas_per_node
    node_rank = self.rank // self.num_replicas_per_node
    local_rank = self.rank % self.num_replicas_per_node
    start_idx = node_rank * chunk_size
    indices = indices[start_idx : start_idx + chunk_size]

    if self.shuffle:
        # Deterministically shuffle within the node's chunk. The seed
        # depends on the epoch (a new order every epoch) and on node_rank,
        # and is identical for all local replicas on the node, so they all
        # compute the same permutation of the shared chunk.
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch + node_rank)
        shuffle_idx = torch.randperm(
            len(indices), generator=g
        ).tolist()  # type: ignore
        indices = [indices[idx] for idx in shuffle_idx]

    # Stride over the node's chunk so each local replica draws a
    # disjoint subset of num_samples indices.
    indices = indices[local_rank :: self.num_replicas_per_node]
    assert len(indices) == self.num_samples
    return iter(indices)
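
To see how the padding, per-node chunking, and local-rank striding interact, here is a minimal standalone sketch of the same partitioning logic. The helper name node_chunked_indices and its parameters are assumptions made for illustration: the real sampler carries these values as attributes (self.total_size, self.num_samples, self.rank, ...) set up elsewhere in the class, which this excerpt does not show.

import math

import torch

def node_chunked_indices(dataset_len, num_nodes, replicas_per_node, rank,
                         seed=0, epoch=0, shuffle=True, drop_last=False):
    # Hypothetical helper: reproduces the partitioning of __iter__ above,
    # with the sampler's attributes passed in as arguments.
    num_replicas = num_nodes * replicas_per_node
    num_samples = (dataset_len // num_replicas if drop_last
                   else math.ceil(dataset_len / num_replicas))
    total_size = num_samples * num_replicas

    indices = list(range(dataset_len))
    if not drop_last:
        # Pad by repeating indices until evenly divisible.
        padding_size = total_size - len(indices)
        if padding_size <= len(indices):
            indices += indices[:padding_size]
        else:
            indices += (indices * math.ceil(padding_size / len(indices)))[
                :padding_size
            ]
    else:
        indices = indices[:total_size]

    # Contiguous chunk for this node, then a strided slice per local rank.
    chunk_size = num_samples * replicas_per_node
    node_rank = rank // replicas_per_node
    local_rank = rank % replicas_per_node
    indices = indices[node_rank * chunk_size : (node_rank + 1) * chunk_size]

    if shuffle:
        g = torch.Generator()
        g.manual_seed(seed + epoch + node_rank)
        perm = torch.randperm(len(indices), generator=g).tolist()
        indices = [indices[i] for i in perm]

    return indices[local_rank::replicas_per_node]

# 10 examples on 2 nodes x 2 replicas; shuffle disabled to show the layout.
for rank in range(4):
    print(rank, node_chunked_indices(10, 2, 2, rank, shuffle=False))
# 0 [0, 2, 4]   <- node 0, local rank 0
# 1 [1, 3, 5]   <- node 0, local rank 1
# 2 [6, 8, 0]   <- node 1, local rank 0 (0 is a padding repeat)
# 3 [7, 9, 1]   <- node 1, local rank 1 (1 is a padding repeat)

Each rank receives a disjoint slice of num_samples indices, and ranks on the same node draw only from that node's contiguous chunk. A plausible motivation for chunking by node before striding by local rank is data locality: every process on a node reads from the same contiguous region of the dataset, rather than scattering reads across the whole index range.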