in oss-torch-connector/osstorchconnector/oss_iterable_dataset.py [0:0]
def shuffle(self, generator=None):
    """Split the dataset into randomly sized chunks and shuffle their order.

    The index range ``[0, self._dataset_size)`` is cut into consecutive
    ``(start, size)`` chunks. Each size is drawn from a normal distribution
    centred on ``self._chunk_size`` with standard deviation 10, truncated to
    an int, then clamped to at least 1 and at most the number of items left.
    The chunks are put in random order and stored in ``self._chunks``.

    Both the chunk sizes and the chunk order are derived from ``generator``
    only, so two calls with identically seeded generators (and the same
    ``_dataset_size`` / ``_chunk_size``) produce the same ``self._chunks``,
    independent of the state of the global ``random`` module.

    Args:
        generator: Optional ``torch.Generator`` that drives all randomness.
            When ``None``, a new generator is created from a random seed and
            that seed is logged at debug level.

    Returns:
        None. The result is written to ``self._chunks``.
    """
    if generator is None:
        seed = int(torch.empty((), dtype=torch.int64).random_().item())
        generator = torch.Generator()
        generator.manual_seed(seed)
        log.debug("OssIterableDataset shuffle seed: %d", seed)

    # Chunk sizes must follow the caller's generator as well; drawing them
    # from the global ``random`` module would make the partition differ
    # between calls even when the same seeded generator is supplied.
    size_seed = int(torch.empty((), dtype=torch.int64).random_(generator=generator).item())
    size_rng = random.Random(size_seed)

    chunks = []
    index = 0
    while index < self._dataset_size:
        remaining = self._dataset_size - index
        # max(1, ...) guarantees forward progress; min(..., remaining) keeps
        # the last chunk inside the dataset.
        chunk_size = min(max(1, int(size_rng.gauss(self._chunk_size, 10))), remaining)
        chunks.append((index, chunk_size))
        index += chunk_size

    # SubsetRandomSampler yields the given (start, size) tuples in a random
    # permutation driven by ``generator``.
    random_sampler = torch.utils.data.SubsetRandomSampler(chunks, generator=generator)
    self._chunks = list(random_sampler)
    log.info("OssIterableDataset shuffle chunk indices, dataset size: %d, chunk num: %d",
             self._dataset_size, len(self._chunks))