in sockeye/data_io_pt.py
def load(self,
         source_iterables: Sequence[Iterable],
         target_iterables: Sequence[Iterable],
         num_samples_per_bucket: List[int]) -> 'ParallelDataSet':
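    """
    Reads parallel sentences from the source/target factor iterables and
    packs them into one pad-filled array per bucket, returned as a
    ParallelDataSet of torch tensors.
    """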
    assert len(num_samples_per_bucket) == len(self.buckets)
    num_source_factors = len(source_iterables)
    num_target_factors = len(target_iterables)
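
    # Pre-allocate one pad-filled array per bucket. Source arrays have shape
    # (num_samples, source_len, num_source_factors); target arrays get one
    # extra time step to make room for the appended <EOS> / prepended <BOS>.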
    data_source = [np.full((num_samples, source_len, num_source_factors), self.pad_id, dtype=self.dtype)
                   for (source_len, _), num_samples in zip(self.buckets, num_samples_per_bucket)]
    data_target = [np.full((num_samples, target_len + 1, num_target_factors), self.pad_id, dtype=self.dtype)
                   for (_, target_len), num_samples in zip(self.buckets, num_samples_per_bucket)]
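
    # Next free row in each bucket's array.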
    bucket_sample_index = [0 for _ in self.buckets]

    # track amount of padding introduced through bucketing
    num_tokens_source = 0
    num_tokens_target = 0
    num_pad_source = 0
    num_pad_target = 0

    # Bucket sentences as padded np arrays
    for sources, targets in parallel_iter(source_iterables, target_iterables, skip_blanks=self.skip_blanks):
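        # Blank factor streams may arrive as None (when skip_blanks is False);
        # normalize them to empty lists so the length checks below are safe.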
        sources = [[] if stream is None else stream for stream in sources]
        targets = [[] if stream is None else stream for stream in targets]
        source_len = len(sources[0])
        target_len = len(targets[0])
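        # Find the smallest bucket that fits both sides; None means the pair
        # does not fit into any bucket.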
        buck_index, buck = get_parallel_bucket(self.buckets, source_len, target_len)
        if buck is None:
            if self.skip_blanks:
                continue  # skip this sentence pair
            else:
                # fall back to the largest bucket
                buck_index = len(self.buckets) - 1
                buck = self.buckets[buck_index]
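
        # The full bucket width counts as tokens; the surplus over the true
        # sentence length is padding.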
        num_tokens_source += buck[0]
        num_tokens_target += buck[1]
        num_pad_source += buck[0] - source_len
        num_pad_target += buck[1] - target_len
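
        # Copy each factor stream into its column; positions beyond the
        # sentence length keep pad_id.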
        sample_index = bucket_sample_index[buck_index]
        for i, s in enumerate(sources):
            data_source[buck_index][sample_index, 0:source_len, i] = s
        for i, t in enumerate(targets):
            if i == 0 or not self.shift_target_factors:
                # sequence: <BOS> ... <EOS>
                t.append(self.eos_id)
            else:
                # sequence: <BOS> <BOS> ...
                t.insert(0, C.BOS_ID)
            data_target[buck_index][sample_index, 0:target_len + 1, i] = t
        bucket_sample_index[buck_index] += 1
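
    # Wrap the filled numpy arrays as torch tensors (from_numpy shares the
    # underlying memory, so no copy is made).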
    data_source_tensors = [torch.from_numpy(data) for data in data_source]
    data_target_tensors = [torch.from_numpy(data) for data in data_target]

    if num_tokens_source > 0 and num_tokens_target > 0:
        logger.info("Created bucketed parallel data set. Introduced padding: source=%.1f%% target=%.1f%%",
                    num_pad_source / num_tokens_source * 100,
                    num_pad_target / num_tokens_target * 100)

    return ParallelDataSet(data_source_tensors, data_target_tensors)
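
# --- Illustration (editor's sketch, not part of sockeye/data_io_pt.py) ---
# A minimal, self-contained example of the tensor layout load() produces,
# assuming a single bucket, two source factors, and one target factor.
# PAD_ID, EOS_ID, and the toy token ids below are hypothetical stand-ins;
# Sockeye derives the real values from its vocabularies.
import numpy as np
import torch

PAD_ID, EOS_ID = 0, 3                  # hypothetical special-token ids
bucket = (6, 6)                        # (max source length, max target length)
sources = [[5, 8, 9], [1, 1, 1]]       # one sentence, two source factor streams
targets = [[2, 4, 6]]                  # one sentence, one target factor stream

src = np.full((1, bucket[0], len(sources)), PAD_ID, dtype=np.int32)
tgt = np.full((1, bucket[1] + 1, len(targets)), PAD_ID, dtype=np.int32)
for i, s in enumerate(sources):
    src[0, :len(s), i] = s             # positions 3..5 keep PAD_ID
for i, t in enumerate(targets):
    t = t + [EOS_ID]                   # primary factor: append <EOS>
    tgt[0, :len(t), i] = t

print(torch.from_numpy(src).shape)     # torch.Size([1, 6, 2])
print(torch.from_numpy(tgt)[0, :, 0])  # tensor([2, 4, 6, 3, 0, 0, 0], ...)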