def create_training_instance()

in pretrain/PyTorch/sources.py


    def create_training_instance(self, index):
        document = self.documents[index]

        # Need to add [CLS] + 2*[SEP] tokens
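        # The final layout is [CLS] tokens_a [SEP] tokens_b [SEP].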
        max_num_tokens = self.max_seq_length - 3

        # We want to maximize the input sequence length, but we also want
        # some inputs that resemble our downstream task inputs, which will
        # be comparatively shorter than the data we pre-train on.
        target_seq_length = max_num_tokens
        if random.random() < self.small_seq_prob:
            target_seq_length = random.randint(5, max_num_tokens)

        # The split between the two sequences for the NSP task should fall
        # at a natural segment boundary rather than at some arbitrary point;
        # otherwise the NSP task might become way too easy.
        instances = []
        current_chunk = []
        current_length = 0
        i = 0
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document)-1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1
                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)

                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])

                    tokens_b = []

                    # Random next: pair tokens_a with segments drawn from
                    # a different, randomly chosen document.
                    is_random_next = False
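                    # If the chunk holds only one segment there is nothing
                    # left over for an "actual next", so a random next is
                    # forced.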
                    if len(current_chunk) == 1 or random.random() < 0.5:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # Pick a random document, retrying up to 10 times
                        # to avoid the current one
                        for _ in range(10):
                            random_doc_index = random.randint(
                                0, len(self.documents) - 1)
                            if random_doc_index != index:
                                break
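                        # If all 10 draws hit the current document, the
                        # "random" document may still be this one; that
                        # rare case is simply accepted.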

                        random_doc = self.documents[random_doc_index]
                        random_start = random.randint(0, len(random_doc)-1)
                        for j in range(random_start, len(random_doc)):
                            tokens_b.extend(random_doc[j])
                            if len(tokens_b) >= target_b_length:
                                break

                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
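                        # (The outer while loop will then revisit those
                        # segments, starting from the first unused one.)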

                    # Actual next: the remaining segments of the current
                    # chunk form the real continuation
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    instances.append(TokenInstance(
                        tokens_a, tokens_b, int(is_random_next)))

                current_chunk = []
                current_length = 0
            i += 1

        return instances
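
TokenInstance and truncate_input_sequence are defined elsewhere in pretrain/PyTorch/sources.py and are not shown here. A minimal sketch of the contracts this method appears to assume — TokenInstance as a plain container for the pair plus its NSP label, and truncate_input_sequence as standard BERT-style pair truncation — might look like the following; the names come from the code above, but the bodies are assumptions, not the repository's code.

    import random

    class TokenInstance:
        # Assumed shape: pairs the two token lists with the NSP label
        # (0 = actual next, 1 = random next), matching how instances are
        # constructed in create_training_instance.
        def __init__(self, tokens_a, tokens_b, is_next):
            self.tokens_a = tokens_a
            self.tokens_b = tokens_b
            self.is_next = is_next

    def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
        # Assumed to follow the standard BERT pair truncation: while the
        # pair is too long, trim one token from the longer list, choosing
        # the front or the back at random so that no position is
        # systematically favored.
        while len(tokens_a) + len(tokens_b) > max_num_tokens:
            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
            if random.random() < 0.5:
                del longer[0]
            else:
                longer.pop()

For this to work, self.documents must be a list of documents, each of which is a list of already-tokenized segments (token lists), since the method concatenates segments with tokens_a.extend(current_chunk[j]).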