in cc_net/split_by_lang.py [0:0]
def do(self, doc: dict) -> Optional[dict]:
text = doc.get(self.field, None)
if not text:
return None
if self.language and doc.get("language") != self.language:
self.n_ignored += 1
return doc
self.n_doc += 1
labels, scores = self.predict(text)
scores.round(self.rounding, out=scores)
for l in labels:
self.cnt[l] = self.cnt.get(l, 0) + 1
if self.top == 1:
existing_label = doc.get(self.out_field, None)
if existing_label and labels[0] != existing_label:
self.n_disagreement += 1
if all(s < self.threshold for s in scores):
return None
self.n_accepted += 1
if self.top == 1:
doc[self.out_field] = labels[0]
doc[self.out_field + "_score"] = scores[0]
else:
doc[self.out_field] = {l: s for l, s in zip(labels, scores)}
return doc