def _partition_row_groups()

in petastorm/reader.py


    def _partition_row_groups(self, dataset, row_groups, shard_count, cur_shard,
                              filtered_row_group_indexes):
        """Filters the list of row group indexes based on the requested training partitions. Returns
        a modified list of rowgroup indexes."""

        if not shard_count \
                or not isinstance(cur_shard, int) \
                or not isinstance(shard_count, int):
            raise ValueError('cur_shard and shard_count must be ints and both specified to use sharding')

        if shard_count is not None and len(row_groups) < shard_count:
            raise NoDataAvailableError('Number of row-groups in the dataset must be greater or equal to the number of '
                                       'requested shards. Otherwise, some of the shards will end up being empty.')

        # We hash on the relative path of each parquet file to guarantee consistency between different reader
        # constructions even after moving the dataset
        filtered_row_group_indexes = [index for index in filtered_row_group_indexes
                                      if hash(os.path.relpath(row_groups[index].path, dataset.paths))
                                      % shard_count == cur_shard]
        return filtered_row_group_indexes
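
A minimal, self-contained sketch of the same sharding rule (the function name, paths, and parameters below are hypothetical illustrations, not petastorm internals): each parquet file's path relative to the dataset root is hashed, and a row group is kept only when that hash modulo shard_count equals cur_shard. Every row group therefore lands in exactly one shard, and the assignment is unchanged if the dataset directory is relocated.

    import os

    def shard_row_groups(row_group_paths, dataset_root, cur_shard, shard_count):
        """Return the subset of row-group file paths assigned to shard `cur_shard`."""
        if not isinstance(cur_shard, int) or not isinstance(shard_count, int) or shard_count <= 0:
            raise ValueError('cur_shard and shard_count must both be ints to use sharding')
        # Hash the path *relative* to the dataset root so the assignment survives
        # moving the whole dataset to a different location.
        return [path for path in row_group_paths
                if hash(os.path.relpath(path, dataset_root)) % shard_count == cur_shard]

    # Hypothetical example: split four parquet files across two shards.
    paths = ['/data/ds/part-0000.parquet', '/data/ds/part-0001.parquet',
             '/data/ds/part-0002.parquet', '/data/ds/part-0003.parquet']
    shard_0 = shard_row_groups(paths, '/data/ds', cur_shard=0, shard_count=2)
    shard_1 = shard_row_groups(paths, '/data/ds', cur_shard=1, shard_count=2)
    # Each path falls in exactly one shard, so the two shards together cover the dataset.
    assert set(shard_0) | set(shard_1) == set(paths)

In petastorm itself, this partitioning is driven by the cur_shard and shard_count arguments accepted by the reader factory (e.g. make_reader), so each training worker constructs its reader with its own cur_shard value.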