in model_code/process_data_to_source_target.py [0:0]
def masking_tokens(document, *, mask_ratio=0.15, mask_prob=0.8):
    """Build one masked-token (source, target) pair from *document*.

    The document is split on whitespace. A random subset of token positions
    (``int(len(tokens) * mask_ratio)`` of them) is selected; every selected
    token is recorded as an answer, and each one is independently replaced
    by the literal ``"[MASK]"`` with probability ``mask_prob``. Selected
    tokens that are not replaced stay in the source text but are still
    listed in the target.

    Args:
        document: Text to process; tokens are whitespace-separated.
        mask_ratio: Fraction of tokens to select, in ``[0, 1]``.
        mask_prob: Probability that a selected token is replaced by
            ``"[MASK]"``, in ``[0, 1]``.

    Returns:
        A ``(source, target)`` tuple of single-element lists. ``source[0]``
        is ``"<masking> "`` followed by the space-joined (partly masked)
        tokens; ``target[0]`` is the space-joined selected tokens in
        document order.

    Raises:
        ValueError: If ``mask_ratio`` or ``mask_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= mask_ratio <= 1.0:
        raise ValueError("mask_ratio must be in [0, 1], got %r" % (mask_ratio,))
    if not 0.0 <= mask_prob <= 1.0:
        raise ValueError("mask_prob must be in [0, 1], got %r" % (mask_prob,))

    tokens = document.split()
    num_selected = int(len(tokens) * mask_ratio)
    # Sample positions without replacement; sorting keeps the answers in the
    # same order as they appear in the document.
    selected = sorted(random.sample(range(len(tokens)), num_selected))

    answers = []
    for idx in selected:
        # Every selected token is an answer, whether or not it gets masked.
        answers.append(tokens[idx])
        # Scalar draw from [0, 1): replace the token with probability mask_prob.
        if np.random.uniform(0.0, 1.0) < mask_prob:
            tokens[idx] = "[MASK]"

    return ["<masking> " + " ".join(tokens)], [" ".join(answers)]