def create()

in notebooks/packed_bert/utils/packing/dataset_creator.py [0:0]


    def create(self):
        """Build the packed dataset from the unpacked sequences.

        Consumes the previously computed packing strategy (``self.strategy``,
        a pair of ``(strategy_set, strategy_repeat_count)``) and fills the
        pre-allocated ``self.packed_*`` arrays: for every strategy (a list of
        sequence lengths) it selects matching-length sequences, concatenates
        their token streams into one pack, tags each sequence with a distinct
        attention-mask index, and restarts position ids per sequence.

        Returns:
            ``PackedClassificationDataset`` for single/multi-label
            classification, ``PackedQuestionAnsweringDataset`` for question
            answering, otherwise ``None`` (arrays are still populated).
        """
        strategy_set, strategy_repeat_count = self.strategy
        # When CLS tokens are shifted to the pack tail, drop each sequence's
        # leading CLS token here and re-append them at the end of the pack.
        skip_cls = int(self.shift_cls_tokens)

        # Sort the sequences by length so a sequence of a given length can be
        # found by searching the sorted length row.
        dataset_seq_lens = np.array([len(seq) for seq in self.unpacked_input_ids])
        len_sorted_seq_idxs = np.argsort(dataset_seq_lens)
        # Row 0: sorted lengths (consumed entries are overwritten with -1);
        # row 1: the original dataset index of each sorted entry.
        sorted_seqs = np.stack((dataset_seq_lens[len_sorted_seq_idxs], len_sorted_seq_idxs))

        # Pack the data using the developed strategies.
        pack_index = 0

        st = time.time()
        for i, repeat_count in enumerate(strategy_repeat_count):
            strategy = strategy_set[i]

            # Offset applied to the start/end positions to account for the
            # positional change of the logits when unmasking the pack to
            # extract a set of logits for each sequence in the pack. Defaults
            # to zero offsets so the question-answering training branch below
            # cannot hit an unbound name when offset adjustment is disabled.
            if self.adjust_offset_positions:
                positions_offset = [sum(strategy[:n]) for n in range(len(strategy))]
            else:
                positions_offset = [0] * len(strategy)

            for _ in range(repeat_count):
                # Pick one still-unused sequence per length in the strategy,
                # marking each consumed slot with -1 so it is not reused.
                ref_inds = []
                for seq_len in strategy:
                    ref_ind = np.argwhere(sorted_seqs[0] == seq_len)[-1].item()
                    sorted_seqs[0, ref_ind] = -1
                    ref_inds.append(ref_ind)

                # Original dataset indices of the sequences chosen for this pack.
                inds = sorted_seqs[1, ref_inds]

                # Exclude the CLS tokens to put them at the end later.
                input_id_pack = list(
                    itertools.chain.from_iterable(
                        self.unpacked_input_ids[x][skip_cls:] for x in inds
                    )
                )
                # Sequence n of the pack is marked with mask value n + 1, so
                # individual sequences can be recovered from the shared mask.
                attention_mask_pack = list(
                    itertools.chain.from_iterable(
                        itertools.repeat(n + 1, len(self.unpacked_attention_mask[v]) - skip_cls)
                        for n, v in enumerate(inds)
                    )
                )
                token_type_ids_pack = list(
                    itertools.chain.from_iterable(
                        self.unpacked_token_type_ids[x][skip_cls:] for x in inds
                    )
                )
                # Position ids restart for every sequence in the pack.
                position_ids_pack = list(
                    itertools.chain.from_iterable(
                        range(skip_cls, len(self.unpacked_attention_mask[v])) for v in inds
                    )
                )

                # Write the pack into the pre-allocated arrays; the remainder of
                # each row keeps its pre-filled padding value. We operate with
                # python lists due to inhomogeneous sequence sizes.
                self.packed_input_ids[pack_index, : len(input_id_pack)] = input_id_pack
                self.packed_attention_mask[pack_index, : len(attention_mask_pack)] = attention_mask_pack
                self.packed_token_type_ids[pack_index, : len(token_type_ids_pack)] = token_type_ids_pack
                self.packed_position_ids[pack_index, : len(position_ids_pack)] = position_ids_pack

                if self.problem_type == "single_label_classification":
                    if self.training or self.validation:
                        labels_pack = [self.unpacked_labels[x] for x in inds]
                        self.packed_labels[pack_index, : len(labels_pack)] = labels_pack
                    if self.inference:
                        self.packed_example_ids[pack_index, : len(inds)] = inds
                elif self.problem_type == "multi_label_classification":
                    if self.training or self.validation:
                        labels_pack = np.stack([self.unpacked_labels[x] for x in inds])
                        self.packed_labels[pack_index, : labels_pack.shape[0], :] = labels_pack
                    if self.inference:
                        self.packed_example_ids[pack_index, : len(inds)] = inds
                elif self.problem_type == "question_answering":
                    if self.training:
                        # Clamp at 0 so positions stay valid after offsetting.
                        start_positions_pack = [
                            max(self.unpacked_start_positions[v] + positions_offset[n], 0)
                            for n, v in enumerate(inds)
                        ]
                        end_positions_pack = [
                            max(self.unpacked_end_positions[v] + positions_offset[n], 0)
                            for n, v in enumerate(inds)
                        ]
                        self.packed_start_positions[pack_index, : len(start_positions_pack)] = start_positions_pack
                        self.packed_end_positions[pack_index, : len(end_positions_pack)] = end_positions_pack

                    if self.validation or self.inference:
                        example_ids_pack = [self.unpacked_example_ids[x] for x in inds]
                        offset_mapping_pack = list(
                            itertools.chain.from_iterable(
                                self.unpacked_offset_mapping[x] for x in inds
                            )
                        )

                        self.packed_example_ids[pack_index, : len(example_ids_pack)] = example_ids_pack
                        self.packed_offset_mapping[pack_index, : len(offset_mapping_pack)] = offset_mapping_pack

                # Now add the CLS tokens and their mask indices at the fixed
                # tail of the pack if this is a classification-style task.
                if skip_cls:
                    self.packed_input_ids[pack_index, -self.max_seq_per_pack :] = [
                        self.unpacked_input_ids[0][0]
                    ] * self.max_seq_per_pack
                    self.packed_attention_mask[pack_index, -self.max_seq_per_pack :] = list(
                        range(1, self.max_seq_per_pack + 1)
                    )

                pack_index += 1

        print(f"Packed dataset creation time: {round(time.time()-st, 4)}s")

        if self.problem_type in ("single_label_classification", "multi_label_classification"):
            return PackedClassificationDataset(
                input_ids=self.packed_input_ids,
                attention_mask=self.packed_attention_mask,
                token_type_ids=self.packed_token_type_ids,
                position_ids=self.packed_position_ids,
                labels=self.packed_labels,
                example_ids=self.packed_example_ids,
            )

        if self.problem_type == "question_answering":
            return PackedQuestionAnsweringDataset(
                input_ids=self.packed_input_ids,
                attention_mask=self.packed_attention_mask,
                token_type_ids=self.packed_token_type_ids,
                position_ids=self.packed_position_ids,
                start_positions=self.packed_start_positions,
                end_positions=self.packed_end_positions,
                offset_mapping=self.packed_offset_mapping,
                example_ids=self.packed_example_ids,
            )