def _get_badwords()

in src/datatrove/pipeline/filters/c4_filters.py [0:0]


    def _get_badwords(self, lang: str):
        if lang not in self._badwords_regex:
            if lang not in _BADWORDS_LANGS:
                if self.fail_on_missing_language:
                    raise ValueError(
                        f'There is not badwords list available for "{lang}". '
                        f"Set fail_on_missing_language=False to continue anyway."
                    )
                else:
                    return None
            local_path = cached_asset_path_or_download(
                _BADWORDS_URL + lang if lang != "en" else _EN_BADWORDS_URL,
                namespace="filters",
                subfolder="c4_badwords",
            )
            badwords: set[str] = set()
            # load from file
            with open(local_path, "rt") as f:
                badwords.update(line.strip() for line in f)
            for lang, allowlist in _BADWORDS_ALLOWLIST.items():
                badwords -= allowlist

            words = [re.escape(w) for w in badwords]
            self._badwords_regex[lang] = (
                # For Japanese, Thai, and Chinese, do not require word separations.
                re.compile("|".join(words))
                if lang in ("ja", "th", "zh")
                # For other languages, match only when flanked by non-word chars.
                else re.compile(r"(?:\W|^)({})(?:\W|$)".format("|".join(words)))
            )
        return self._badwords_regex[lang]