in src/datatrove/pipeline/filters/language_filter.py [0:0]
def filter(self, doc: Document) -> bool:
    """Label ``doc`` with language-ID metadata and compute the filter decision.

    The best prediction returned by ``self.model.predict`` is written to
    ``doc.metadata`` as ``language`` and ``language_score``. With the
    ``glotlid`` backend the predicted label is split on ``_`` into the
    language and a ``language_script`` entry. When
    ``keep_top_pairs_threshold`` is not ``-1``, every returned score above
    it is additionally stored under ``top_language_{label}_score``.

    Args:
        doc: document to label; its ``metadata`` mapping is updated in place.

    Returns:
        ``self.label_only`` when it is truthy. Otherwise, if
        ``self.languages`` is non-empty, whether any score returned by the
        model exceeds ``language_threshold``; if ``self.languages`` is
        ``None``, whether the best score exceeds it; ``False`` when
        ``self.languages`` is empty but not ``None``.
    """
    top_pair, scores_by_label = self.model.predict(doc)
    top_label, top_score = top_pair
    metadata = doc.metadata

    if self.backend == "glotlid":
        # The label is expected to be "<language>_<script>"; any other
        # number of "_"-separated parts raises ValueError on unpacking.
        top_label, script = top_label.split("_")
        metadata["language_script"] = script
    metadata["language"] = top_label
    metadata["language_score"] = top_score

    # -1 is the sentinel that disables storing the per-label scores.
    if self.keep_top_pairs_threshold != -1:
        metadata.update(
            {
                f"top_language_{label}_score": score
                for label, score in scores_by_label.items()
                if score > self.keep_top_pairs_threshold
            }
        )

    # Labelling-only mode accepts every document.
    if self.label_only:
        return self.label_only
    # A non-empty language list: one sufficiently confident score is enough.
    if self.languages:
        return any(score > self.language_threshold for score in scores_by_label.values())
    # No language list (None): judge by the best prediction alone; an empty
    # list that is not None never passes.
    return self.languages is None and top_score > self.language_threshold