def load()

in sockeye/data_io_pt.py [0:0]
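
Loads parallel source and target factor streams into one padded NumPy array per bucket and returns them as a ParallelDataSet of torch tensors. Target buffers reserve one extra position so the primary factor can append EOS (and shifted factors can prepend BOS).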


    def load(self,
             source_iterables: Sequence[Iterable],
             target_iterables: Sequence[Iterable],
             num_samples_per_bucket: List[int]) -> 'ParallelDataSet':

        assert len(num_samples_per_bucket) == len(self.buckets)
        num_source_factors = len(source_iterables)
        num_target_factors = len(target_iterables)

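        # Pre-allocate one fully padded array per bucket with shape (num_samples, seq_len, num_factors).
        # Target buffers reserve one extra position for the appended EOS (or prepended BOS for shifted factors).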
        data_source = [np.full((num_samples, source_len, num_source_factors), self.pad_id, dtype=self.dtype)
                       for (source_len, _), num_samples in zip(self.buckets, num_samples_per_bucket)]
        data_target = [np.full((num_samples, target_len + 1, num_target_factors), self.pad_id, dtype=self.dtype)
                       for (_, target_len), num_samples in zip(self.buckets, num_samples_per_bucket)]

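        # Next free row in each bucket's output array.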
        bucket_sample_index = [0 for _ in self.buckets]

        # track amount of padding introduced through bucketing
        num_tokens_source = 0
        num_tokens_target = 0
        num_pad_source = 0
        num_pad_target = 0

        # Bucket sentences as padded np arrays
        for sources, targets in parallel_iter(source_iterables, target_iterables, skip_blanks=self.skip_blanks):
            sources = [[] if stream is None else stream for stream in sources]
            targets = [[] if stream is None else stream for stream in targets]
            source_len = len(sources[0])
            target_len = len(targets[0])
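            # Find the smallest bucket that fits both sides; (None, None) if the pair exceeds the largest bucket.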
            buck_index, buck = get_parallel_bucket(self.buckets, source_len, target_len)
            if buck is None:
                if self.skip_blanks:
                    continue  # skip this sentence pair
                else:
                    buck_index = len(self.buckets) - 1  # no bucket fits: fall back to the largest bucket
                    buck = self.buckets[buck_index]

            num_tokens_source += buck[0]
            num_tokens_target += buck[1]
            num_pad_source += buck[0] - source_len
            num_pad_target += buck[1] - target_len

            sample_index = bucket_sample_index[buck_index]
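            # Copy each factor stream into its own trailing dimension; positions past the sentence length keep pad_id.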
            for i, s in enumerate(sources):
                data_source[buck_index][sample_index, 0:source_len, i] = s
            for i, t in enumerate(targets):
                if i == 0 or not self.shift_target_factors:
                    # sequence: <BOS> ... <EOS>
                    t.append(self.eos_id)
                    data_target[buck_index][sample_index, 0:target_len + 1, i] = t
                else:
                    # sequence: <BOS> <BOS> ...
                    t.insert(0, C.BOS_ID)
                    data_target[buck_index][sample_index, 0:target_len + 1, i] = t

            bucket_sample_index[buck_index] += 1

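        # Wrap the filled numpy buffers as torch tensors (torch.from_numpy shares memory, no copy).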
        data_source_tensors = [torch.from_numpy(data) for data in data_source]
        data_target_tensors = [torch.from_numpy(data) for data in data_target]

        if num_tokens_source > 0 and num_tokens_target > 0:
            logger.info("Created bucketed parallel data set. Introduced padding: source=%.1f%% target=%.1f%%)",
                        num_pad_source / num_tokens_source * 100,
                        num_pad_target / num_tokens_target * 100)

        return ParallelDataSet(data_source_tensors, data_target_tensors)
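
For intuition, the sketch below replays the same bucketing-and-padding scheme on toy data. Everything in it is a hypothetical illustration (pick_bucket stands in for get_parallel_bucket, and the toy ids, buckets, and variable names are not Sockeye's API):

    import numpy as np
    import torch

    # Toy setup: one source factor, one target factor, two buckets.
    PAD_ID, EOS_ID = 0, 3
    buckets = [(4, 4), (8, 8)]                     # (max_source_len, max_target_len)
    pairs = [([5, 6], [7, 8, 9]),                  # fits bucket (4, 4)
             ([5, 6, 7, 8, 9], [7, 8])]            # source too long for (4, 4) -> bucket (8, 8)

    def pick_bucket(src_len, tgt_len):
        """Smallest bucket that fits both sides, or None if the pair is too long."""
        for i, (s, t) in enumerate(buckets):
            if src_len <= s and tgt_len <= t:
                return i
        return None

    # First pass: per-bucket sample counts (load() receives these as num_samples_per_bucket).
    counts = [0] * len(buckets)
    for src, tgt in pairs:
        counts[pick_bucket(len(src), len(tgt))] += 1

    # Pre-allocate padded (num_samples, seq_len, num_factors) arrays; target gets +1 for EOS.
    data_source = [np.full((n, s, 1), PAD_ID, dtype=np.int64) for (s, _), n in zip(buckets, counts)]
    data_target = [np.full((n, t + 1, 1), PAD_ID, dtype=np.int64) for (_, t), n in zip(buckets, counts)]

    fill = [0] * len(buckets)
    for src, tgt in pairs:
        b = pick_bucket(len(src), len(tgt))
        data_source[b][fill[b], :len(src), 0] = src
        data_target[b][fill[b], :len(tgt) + 1, 0] = tgt + [EOS_ID]  # primary factor gets EOS
        fill[b] += 1

    source_tensors = [torch.from_numpy(a) for a in data_source]    # shares memory, no copy
    print(source_tensors[0][0, :, 0])                              # tensor([5, 6, 0, 0])

Pre-allocating one array per bucket keeps every batch a fixed shape and avoids per-sentence allocations; the price is the padding that the log message above reports.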