def do()

in cc_net/tools/expand_corpus.py [0:0]


    def do(self, document: dict) -> Optional[str]:
        """Extract not-yet-seen sentences from a document and score them.

        The text stored under ``self.field`` is cut into non-empty lines, and
        each line is cut into sentences by ``self.splitter``. Sentences whose
        hash is already in ``self.hashes`` are dropped; the hashes of the
        remaining ones are recorded there (even if the sentence is later
        rejected by the score filter). Each surviving sentence is normalized,
        tokenized with ``self.sp`` and scored with ``self.lm``.

        Args:
            document: mapping from which the ``self.field`` entry is read.

        Returns:
            One ``"<score>\\t<sentence>"`` line per sentence whose score lies
            strictly between 0 and ``self.threshold``, joined with newlines,
            or ``None`` when the field is missing/empty or nothing is kept.
        """
        text: Optional[str] = document.get(self.field)
        if not text:
            return None

        # Step 1: split every non-empty line into sentences.
        candidates = []
        for line in text.split("\n"):
            if line:
                candidates.extend(self.splitter.split(text=line))

        # Step 2: keep only sentences never seen before, remembering them.
        fresh = []
        for candidate in candidates:
            if not candidate:
                continue
            digest = dedup.str_hash(candidate)
            if digest not in self.hashes:
                self.hashes.add(digest)
                fresh.append(candidate)

        # Step 3: score each new sentence and keep those within the threshold.
        kept = []
        for candidate in fresh:
            tokens = self.sp.encode_as_pieces(text_normalizer.normalize(candidate))
            log_score = self.lm.score(" ".join(tokens))
            # -1 marks sentences with no tokens; the filter below rejects it.
            pp = perplexity.pp(log_score, len(tokens)) if len(tokens) else -1
            if self.threshold > pp > 0:
                kept.append(f"{pp}\t{candidate}")

        return "\n".join(kept) or None