in cc_net/tools/expand_corpus.py [0:0]
def do(self, document: dict) -> Optional[str]:
    """Extract not-yet-seen sentences from a document and score them.

    The text stored under ``self.field`` is cut on newlines, every non-empty
    line is handed to ``self.splitter``, and sentences whose hash is already
    present in ``self.hashes`` are discarded. Hashes of newly seen sentences
    are added to ``self.hashes`` as a side effect, whether or not the
    sentence later passes the perplexity filter.

    Each surviving sentence is normalized, tokenized with ``self.sp`` and
    scored with ``self.lm``; a sentence that tokenizes to zero pieces gets
    the sentinel perplexity ``-1`` and is therefore always filtered out.

    Returns:
        One tab-separated ``perplexity`` / ``sentence`` line per sentence
        whose perplexity is strictly between 0 and ``self.threshold``,
        joined with newlines. ``None`` if the field is missing or empty, or
        if no sentence qualifies.
    """
    text: Optional[str] = document.get(self.field)
    if not text:
        return None

    # Step 1: split every non-empty line into candidate sentences.
    candidates = []
    for line in text.split("\n"):
        if line:
            candidates.extend(self.splitter.split(text=line))

    # Step 2: drop empty sentences and those already seen, recording the
    # hash of each new sentence so later duplicates are skipped as well.
    fresh_sentences = []
    for candidate in candidates:
        if not candidate:
            continue
        digest = dedup.str_hash(candidate)
        if digest not in self.hashes:
            self.hashes.add(digest)
            fresh_sentences.append(candidate)

    # Step 3: compute a perplexity for every remaining sentence.
    scored = []
    for sentence in fresh_sentences:
        pieces = self.sp.encode_as_pieces(text_normalizer.normalize(sentence))
        log_score = self.lm.score(" ".join(pieces))
        piece_count = len(pieces)
        # -1 marks "no tokens": it can never satisfy the positive-range
        # filter below.
        pp = perplexity.pp(log_score, piece_count) if piece_count else -1
        scored.append((pp, sentence))

    # Step 4: keep sentences whose perplexity is in the open interval
    # (0, threshold) and format them as "<pp>\t<sentence>" lines.
    lines = [
        f"{pp}\t{sentence}"
        for pp, sentence in scored
        if self.threshold > pp > 0
    ]
    return "\n".join(lines) or None