in pretrain/PyTorch/sources.py [0:0]
def create_training_instance(self, index):
    document = self.documents[index]
    # Need to leave room for the [CLS] + 2*[SEP] tokens.
    max_num_tokens = self.max_seq_length - 3

    # We want to maximize the input sequence length, but we also want some
    # inputs that resemble our downstream task inputs, which will be
    # comparatively shorter than the data we intend to pre-train on.
    target_seq_length = max_num_tokens
    if random.random() < self.small_seq_prob:
        target_seq_length = random.randint(5, max_num_tokens)
    # Split the sequence into the A and B segments at a natural sentence
    # boundary rather than at some arbitrary point; otherwise the NSP
    # task might become way too easy.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
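            # The chunk is full (or the document ended): turn the
            # accumulated segments into one training pair.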
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into
                # the `A` (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = random.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])
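                # Segment B is either the true continuation of the document
                # (actual next) or a span sampled from a random document.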
                tokens_b = []

                # Random Next
                is_random_next = False
                if len(current_chunk) == 1 or random.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # Pick a random document other than the current one
                    # (give up after 10 attempts).
                    for _ in range(10):
                        random_doc_index = random.randint(
                            0, len(self.documents) - 1)
                        if random_doc_index != index:
                            break

                    random_doc = self.documents[random_doc_index]
                    random_start = random.randint(0, len(random_doc) - 1)
                    for j in range(random_start, len(random_doc)):
                        tokens_b.extend(random_doc[j])
                        if len(tokens_b) >= target_b_length:
                            break

                    # We didn't actually use these segments, so we "put them
                    # back" so they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual Next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
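                # Trim tokens_a / tokens_b so their combined length fits
                # within max_num_tokens.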
                truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                instances.append(TokenInstance(
                    tokens_a, tokens_b, int(is_random_next)))

            current_chunk = []
            current_length = 0
        i += 1
    return instances
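
For context, a minimal sketch of how this method might be driven when building the full pre-training set; the `creator` object and the driver loop below are illustrative assumptions, not the repository's exact API:

# Hypothetical driver loop: `creator` stands in for an instance of the class
# that defines create_training_instance (names are illustrative).
all_instances = []
for doc_index in range(len(creator.documents)):
    # Each call yields a list of TokenInstance (tokens_a, tokens_b, is_next) pairs.
    all_instances.extend(creator.create_training_instance(doc_index))
random.shuffle(all_instances)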