def filter()

in src/datatrove/pipeline/filters/language_filter.py [0:0]


    def filter(self, doc: Document) -> bool:
        """Record the language prediction on ``doc`` and compute the filter result.

        The top prediction returned by ``self.model.predict`` is written to
        ``doc.metadata`` as ``language`` and ``language_score``. With the
        ``"glotlid"`` backend the predicted label is split on ``"_"``: the first
        part becomes ``language`` and the second is stored as
        ``language_script``. Unless ``keep_top_pairs_threshold`` is ``-1``, every
        returned score above it is also stored as ``top_language_<label>_score``.

        Args:
            doc: document to label; its ``metadata`` is updated in place

        Returns:
            is_filter: ``label_only`` when it is truthy; otherwise whether the
            top score exceeds ``language_threshold`` (``languages`` is ``None``)
            or whether any returned score exceeds it (``languages`` is
            non-empty). An empty, non-``None`` ``languages`` gives ``False``.
        """
        (label, top_score), scores_by_label = self.model.predict(doc)
        metadata = doc.metadata

        if self.backend == "glotlid":
            # The label carries two "_"-separated parts: language, then script.
            label, script = label.split("_")
            metadata["language_script"] = script
        metadata["language"] = label
        metadata["language_score"] = top_score

        if self.keep_top_pairs_threshold != -1:
            # -1 disables the extra per-label scores.
            metadata.update(
                {
                    f"top_language_{name}_score": score
                    for name, score in scores_by_label.items()
                    if score > self.keep_top_pairs_threshold
                }
            )

        # label_only short-circuits the threshold checks entirely.
        label_only = self.label_only
        if label_only:
            return label_only
        if self.languages is None:
            return top_score > self.language_threshold
        if self.languages:
            return any(score > self.language_threshold for score in scores_by_label.values())
        return False