in src/mlm/scorers.py [0:0]
def corpus_to_dataset(self, corpus: Corpus) -> SimpleDataset:
    """Expand a corpus into a SimpleDataset of masked-sentence entries.

    Each sentence is tokenized (after ``self._apply_tokenizer_opts``),
    wrapped with special tokens, and masked via ``self._ids_to_masked``.
    Every masked variant becomes one tuple:
    ``(sent_idx, masked_ids, original_length, masked_position(s),
    original_id(s) at those positions, 1)``.

    With whole-word masking (``self._wwm``) each position in a mask set is
    emitted as its own entry; otherwise one entry carries the whole set.

    Sentences longer than ``self._max_length`` are skipped entirely and an
    error is logged (callers score them as 0 — see the log message).

    Parameters
    ----------
    corpus : Corpus
        Mapping whose values are raw sentences; iteration order defines
        ``sent_idx``.

    Returns
    -------
    SimpleDataset
        Dataset over the expanded per-mask tuples.
    """
    sents_expanded = []
    for sent_idx, sent in enumerate(corpus.values()):
        sent = self._apply_tokenizer_opts(sent)
        # [CLS] + tokens, with [SEP] appended only when special tokens are
        # enabled (single construction instead of two near-duplicate branches).
        tokens_original = [self._vocab.cls_token] + self._tokenizer(sent)
        if self._add_special:
            tokens_original.append(self._vocab.sep_token)
        ids_original = np.array(self._tokenizer.convert_tokens_to_ids(tokens_original))
        # Enforce max length; guard clause keeps the happy path un-nested.
        if len(ids_original) > self._max_length:
            # Lazy %-args per logging convention: the message is only
            # formatted if the record is actually emitted.
            logging.error(
                "Line #%d is too long; will output score of 0 and omit in token counts (but not yet in word counts!)",
                sent_idx + 1)
            continue
        ids_masked = self._ids_to_masked(ids_original)
        if self._wwm:
            # TODO: Wasteful, but for now "deserialize" the mask set into
            # individual positions. The masks are already applied in `ids`.
            for ids, mask_set in ids_masked:
                for mask_el, id_original in zip(mask_set, ids_original[mask_set]):
                    sents_expanded.append((
                        sent_idx,
                        ids,
                        len(ids_original),
                        mask_el,
                        [id_original],
                        1))
        else:
            # One entry per mask set, carrying all positions and their
            # original ids together.
            sents_expanded += [(
                sent_idx,
                ids,
                len(ids_original),
                mask_set,
                ids_original[mask_set],
                1)
                for ids, mask_set in ids_masked]
    return SimpleDataset(sents_expanded)