in notebooks/packed_bert/utils/packing/dataset_creator.py [0:0]
def create(self):
    """Build the packed dataset arrays from the precomputed packing strategies.

    ``self.strategy`` is a pair ``(strategy_set, strategy_repeat_count)`` where
    ``strategy_set[i]`` is a list of sequence lengths forming one pack and
    ``strategy_repeat_count[i]`` is how many packs use that length combination.
    Sequences are matched to strategy slots by length, concatenated into the
    preallocated ``self.packed_*`` arrays (side effect), and — depending on
    ``self.problem_type`` — labels / example ids / QA positions are packed too.

    Returns:
        PackedClassificationDataset for single/multi-label classification,
        PackedQuestionAnsweringDataset for question answering, otherwise None.
    """
    strategy_set, strategy_repeat_count = self.strategy
    # When CLS tokens are shifted to the end of the pack, drop each sequence's
    # leading CLS token while packing; they are re-appended at the pack tail.
    skip_cls = int(self.shift_cls_tokens)

    # Sort the sequences by length so each strategy entry (a target length)
    # can be matched to an unused sequence of exactly that length.
    dataset_seq_lens = np.array([len(seq) for seq in self.unpacked_input_ids])
    len_sorted_seq_idxs = np.argsort(dataset_seq_lens)
    len_sorted_seq_lens = dataset_seq_lens[len_sorted_seq_idxs]
    # Row 0: sequence lengths (entries are overwritten with -1 once consumed);
    # row 1: original dataset index of each sequence.
    sorted_seqs = np.stack((len_sorted_seq_lens, len_sorted_seq_idxs))

    # Pack the data using the developed strategies.
    pack_index = 0
    st = time.time()
    for i, repeat_count in enumerate(strategy_repeat_count):
        strategy = strategy_set[i]
        # Offset applied to the start/end positions to account for the
        # positional change of the logits when unmasking the pack to extract a
        # set of logits for each sequence in the pack.
        # BUGFIX: previously only defined when adjust_offset_positions was
        # True, causing a NameError (or reuse of a stale value from an earlier
        # strategy) in the question-answering training branch below.
        if self.adjust_offset_positions:
            positions_offset = [sum(strategy[:n]) for n in range(len(strategy))]
        else:
            positions_offset = [0] * len(strategy)
        for _ in range(repeat_count):
            # For each target length, take one not-yet-used sequence of that
            # length and mark it consumed (-1) so it cannot be picked again.
            ref_inds = []
            for x in strategy:
                ref_ind = np.argwhere(sorted_seqs[0] == x)[-1]
                sorted_seqs[0, ref_ind] = -1
                ref_inds.append(ref_ind)
            inds = sorted_seqs[1, ref_inds].ravel()
            # Exclude the CLS tokens to put them at the end later.
            input_id_pack = list(itertools.chain(*[self.unpacked_input_ids[x][skip_cls:] for x in inds]))
            # Each sequence in the pack gets a distinct mask value (1-based)
            # so per-sequence attention can be reconstructed downstream.
            attention_mask_pack = list(
                itertools.chain(
                    *[
                        itertools.repeat(n + 1, len(self.unpacked_attention_mask[v]) - skip_cls)
                        for n, v in enumerate(inds)
                    ]
                )
            )
            token_type_ids_pack = list(
                itertools.chain(*[self.unpacked_token_type_ids[x][skip_cls:] for x in inds])
            )
            # Position ids restart for every sequence in the pack.
            position_ids_pack = list(
                itertools.chain(
                    *[range(skip_cls, len(self.unpacked_attention_mask[v])) for v in inds]
                )
            )
            # Create the equivalent tokenised packed dataset - we operate with
            # python arrays due to inhomogenous dataset size.
            self.packed_input_ids[pack_index, : len(input_id_pack)] = input_id_pack
            self.packed_attention_mask[pack_index, : len(attention_mask_pack)] = attention_mask_pack
            self.packed_token_type_ids[pack_index, : len(token_type_ids_pack)] = token_type_ids_pack
            self.packed_position_ids[pack_index, : len(position_ids_pack)] = position_ids_pack
            if self.problem_type == "single_label_classification":
                if self.training or self.validation:
                    labels_pack = [self.unpacked_labels[x] for x in inds]
                    self.packed_labels[pack_index, : len(labels_pack)] = labels_pack
                if self.inference:
                    example_ids_pack = inds
                    self.packed_example_ids[pack_index, : len(example_ids_pack)] = example_ids_pack
            if self.problem_type == "multi_label_classification":
                if self.training or self.validation:
                    # Labels are per-sequence vectors; stack into (num_seqs, num_labels).
                    labels_pack = np.stack([self.unpacked_labels[x] for x in inds])
                    self.packed_labels[pack_index, : labels_pack.shape[0], :] = labels_pack
                if self.inference:
                    example_ids_pack = inds
                    self.packed_example_ids[pack_index, : len(example_ids_pack)] = example_ids_pack
            if self.problem_type == "question_answering":
                if self.training:
                    # Shift answer spans by each sequence's offset within the
                    # pack; clamp at 0 for unanswerable examples.
                    start_positions_pack = [
                        max(self.unpacked_start_positions[v] + positions_offset[n], 0) for n, v in enumerate(inds)
                    ]
                    end_positions_pack = [
                        max(self.unpacked_end_positions[v] + positions_offset[n], 0) for n, v in enumerate(inds)
                    ]
                    self.packed_start_positions[pack_index, : len(start_positions_pack)] = start_positions_pack
                    self.packed_end_positions[pack_index, : len(end_positions_pack)] = end_positions_pack
                if self.validation or self.inference:
                    example_ids_pack = [self.unpacked_example_ids[x] for x in inds]
                    offset_mapping_pack = list(itertools.chain(*[self.unpacked_offset_mapping[x] for x in inds]))
                    self.packed_example_ids[pack_index, : len(example_ids_pack)] = example_ids_pack
                    self.packed_offset_mapping[pack_index, : len(offset_mapping_pack)] = offset_mapping_pack
            # Now add the CLS tokens and their masks at the end of the pack if
            # classification task. NOTE(review): assumes every sequence starts
            # with the same CLS token, so the first token of sequence 0 is used
            # as the template — confirm against the tokeniser.
            if skip_cls:
                self.packed_input_ids[pack_index, -self.max_seq_per_pack :] = [
                    self.unpacked_input_ids[0][0] for _ in range(self.max_seq_per_pack)
                ]
                self.packed_attention_mask[pack_index, -self.max_seq_per_pack :] = list(
                    range(1, self.max_seq_per_pack + 1)
                )
            pack_index += 1
    print(f"Packed dataset creation time: {round(time.time()-st, 4)}s")
    if self.problem_type == "single_label_classification" or self.problem_type == "multi_label_classification":
        return PackedClassificationDataset(
            input_ids=self.packed_input_ids,
            attention_mask=self.packed_attention_mask,
            token_type_ids=self.packed_token_type_ids,
            position_ids=self.packed_position_ids,
            labels=self.packed_labels,
            example_ids=self.packed_example_ids,
        )
    if self.problem_type == "question_answering":
        return PackedQuestionAnsweringDataset(
            input_ids=self.packed_input_ids,
            attention_mask=self.packed_attention_mask,
            token_type_ids=self.packed_token_type_ids,
            position_ids=self.packed_position_ids,
            start_positions=self.packed_start_positions,
            end_positions=self.packed_end_positions,
            offset_mapping=self.packed_offset_mapping,
            example_ids=self.packed_example_ids,
        )