in petastorm/reader.py [0:0]
def _partition_row_groups(self, dataset, row_groups, shard_count, cur_shard,
filtered_row_group_indexes):
"""Filters the list of row group indexes based on the requested training partitions. Returns
a modified list of rowgroup indexes."""
if not shard_count \
or not isinstance(cur_shard, int) \
or not isinstance(shard_count, int):
raise ValueError('partition and num_partitions must be ints and both specified to use partitioning')
if shard_count is not None and len(row_groups) < shard_count:
raise NoDataAvailableError('Number of row-groups in the dataset must be greater or equal to the number of '
'requested shards. Otherwise, some of the shards will end up being empty.')
# We hash on the relative path of each parquet file to guarantee consistency between different reader
# constructions even after moving the dataset
filtered_row_group_indexes = [index for index in filtered_row_group_indexes if index % shard_count == cur_shard]
return filtered_row_group_indexes