in src/datatrove/pipeline/filters/c4_filters.py [0:0]
def _get_badwords(self, lang: str):
    """Return the compiled bad-words regex for ``lang``, building it on first use.

    The compiled pattern is memoized in ``self._badwords_regex`` keyed by
    ``lang``, so the word list is only read and compiled once per language.

    Args:
        lang: language code used to look up the bad-words list.

    Returns:
        The compiled regular expression for ``lang``, or ``None`` when
        ``lang`` is not in ``_BADWORDS_LANGS`` and
        ``self.fail_on_missing_language`` is falsy.

    Raises:
        ValueError: if ``lang`` is not in ``_BADWORDS_LANGS`` and
            ``self.fail_on_missing_language`` is truthy.
    """
    if lang in self._badwords_regex:
        return self._badwords_regex[lang]

    if lang not in _BADWORDS_LANGS:
        if self.fail_on_missing_language:
            raise ValueError(
                f'There is not badwords list available for "{lang}". '
                f"Set fail_on_missing_language=False to continue anyway."
            )
        return None

    local_path = cached_asset_path_or_download(
        _BADWORDS_URL + lang if lang != "en" else _EN_BADWORDS_URL,
        namespace="filters",
        subfolder="c4_badwords",
    )

    # Word lists contain non-ASCII entries; decode explicitly as UTF-8 rather
    # than relying on the platform's default encoding.
    with open(local_path, "rt", encoding="utf-8") as f:
        badwords: set[str] = {line.strip() for line in f}
    # A blank line would become an empty alternative in the regex, which
    # matches at every position and would flag every document.
    badwords.discard("")

    # Iterate over the values only: binding the allowlist key to ``lang`` here
    # would overwrite the requested language, so the pattern would be cached
    # under (and its word-boundary mode chosen from) the wrong language.
    for allowlist in _BADWORDS_ALLOWLIST.values():
        badwords -= allowlist

    alternation = "|".join(re.escape(w) for w in badwords)
    if lang in ("ja", "th", "zh"):
        # For Japanese, Thai, and Chinese, do not require word separations.
        pattern = re.compile(alternation)
    else:
        # For other languages, match only when flanked by non-word chars.
        pattern = re.compile(r"(?:\W|^)({})(?:\W|$)".format(alternation))

    self._badwords_regex[lang] = pattern
    return pattern