def corpus_to_dataset()

in src/mlm/scorers.py [0:0]


    def corpus_to_dataset(self, corpus: Corpus) -> SimpleDataset:

        sents_expanded = []

        for sent_idx, sent in enumerate(corpus.values()):
            sent = self._apply_tokenizer_opts(sent)
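            # Prepend the [CLS] token; the trailing [SEP] is appended only when special tokens are enabled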
            if self._add_special:
                tokens_original = [self._vocab.cls_token] + self._tokenizer(sent) + [self._vocab.sep_token]
            else:
                tokens_original = [self._vocab.cls_token] + self._tokenizer(sent)
            ids_original = np.array(self._tokenizer.convert_tokens_to_ids(tokens_original))

            # Enforce max length
            if len(ids_original) > self._max_length:
                logging.error("Line #{} is too long; will output score of 0 and omit in token counts "
                              "(but not yet in word counts!)".format(sent_idx + 1))
            else:
                ids_masked = self._ids_to_masked(ids_original)
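                # Each expanded row: (sent_idx, masked ids, true length, masked position(s),
                # original id(s) at those positions, constant weight 1)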

                if self._wwm:
                    # TODO: Wasteful, but for now "deserialize" the mask set into individual positions
                    # The masks are already applied in ids
                    for ids, mask_set in ids_masked:
                        for mask_el, id_original in zip(mask_set, ids_original[mask_set]):
                            sents_expanded.append((
                                sent_idx,
                                ids,
                                len(ids_original),
                                mask_el,
                                [id_original],
                                1))
                else:
                    sents_expanded += [(
                            sent_idx,
                            ids,
                            len(ids_original),
                            mask_set,
                            ids_original[mask_set],
                            1)
                        for ids, mask_set in ids_masked]

        return SimpleDataset(sents_expanded)
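
For orientation, here is a minimal, self-contained sketch of the per-position expansion this method performs. It is not the repo's implementation: _ids_to_masked is not shown above, so the version below, the toy token ids, and the MASK_ID constant are illustrative assumptions (the ids happen to match bert-base-uncased's [CLS]=101, [SEP]=102, [MASK]=103).

    import numpy as np

    MASK_ID = 103  # hypothetical [MASK] id, an assumption for this sketch

    def ids_to_masked(ids_original: np.ndarray):
        # One masked copy per interior position; positions 0 and -1 are
        # left alone, mirroring the [CLS]/[SEP] wrappers above.
        for pos in range(1, len(ids_original) - 1):
            ids = ids_original.copy()
            ids[pos] = MASK_ID
            yield ids, np.array([pos])

    # Toy sentence already converted to ids: [CLS] w1 w2 w3 [SEP]
    ids_original = np.array([101, 7592, 2088, 999, 102])

    # Same row layout as sents_expanded above
    rows = [(0, ids, len(ids_original), mask_set, ids_original[mask_set], 1)
            for ids, mask_set in ids_to_masked(ids_original)]
    for row in rows:
        print(row)

Each toy row masks exactly one position. Under whole-word masking (self._wwm), mask_set would instead hold every subword position of one word, with all of those masks already applied to ids, which is why the method flattens each set into one row per position.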