def masking_tokens()

in model_code/process_data_to_source_target.py [0:0]


def masking_tokens(document: str, *, mask_ratio: float = 0.15, mask_prob: float = 0.8):
    """Build one masked source/target pair from a whitespace-split document.

    The document is split on whitespace and ``int(n_words * mask_ratio)``
    distinct word positions are drawn at random. Each drawn word is replaced
    by the literal token ``"[MASK]"`` with probability ``mask_prob``;
    otherwise it is left in place. Every drawn word, whether or not it was
    actually replaced, is part of the target.

    Args:
        document: Text to process. Words are whatever ``str.split()`` yields,
            so the original spacing is not preserved in the output.
        mask_ratio: Fraction of the words to draw, in ``[0, 1]``. The count is
            truncated towards zero, so short documents may draw no words.
        mask_prob: Probability, in ``[0, 1]``, that a drawn word is replaced
            by ``"[MASK]"``.

    Returns:
        A ``(source, target)`` tuple of two one-element lists of strings.
        ``source[0]`` is ``"<masking> "`` followed by the space-joined
        (partially masked) words; ``target[0]`` is the space-joined drawn
        words in document order.

    Raises:
        ValueError: If ``mask_ratio`` or ``mask_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= mask_ratio <= 1.0:
        raise ValueError("mask_ratio must be in [0, 1], got %r" % (mask_ratio,))
    if not 0.0 <= mask_prob <= 1.0:
        raise ValueError("mask_prob must be in [0, 1], got %r" % (mask_prob,))

    tokens = document.split()
    num_selected = int(len(tokens) * mask_ratio)
    # Sorted so the target lists the drawn words in document order.
    selected = sorted(random.sample(range(len(tokens)), num_selected))

    answers = []
    for idx in selected:
        # Record the original word before it is (possibly) overwritten.
        answers.append(tokens[idx])
        # Scalar draw: no reliance on the truthiness of a one-element array.
        if np.random.uniform(0.0, 1.0) < mask_prob:
            tokens[idx] = "[MASK]"

    return ["<masking> " + " ".join(tokens)], [" ".join(answers)]