in pipeline/clean/tools/clean_mono.py [0:0]
def clean_mono(src, lang):
    """Decide whether a monolingual sentence should be filtered out.

    The checks run in a fixed order and the first one that fails wins.

    Args:
        src: The sentence text to check.
        lang: Language code. ``"zh"``, ``"ja"`` and ``"ko"`` are tokenized per
            character, everything else on whitespace. The ratio checks run
            only when ``lang`` is a key of ``CHARS``.

    Returns:
        ``None`` when the sentence passes every check, otherwise the name of
        the failed check: ``"EMPTY"``, ``"TOO_SHORT"``, ``"TOO_LONG"``,
        ``"RATIO_ALPHA"`` or ``"RATIO_CHARS"``.
    """
    # TODO: move mono cleaning to OpusCleaner
    #  when it supports this https://github.com/hplt-project/OpusCleaner/issues/141

    # Treat individual characters as tokens for CJK; split on whitespace otherwise.
    src_toks = src if lang in {"zh", "ja", "ko"} else src.split()
    src_len = len(src_toks)
    if not src_len:
        return "EMPTY"
    if src_len < MIN_LENGTH:
        return "TOO_SHORT"
    if src_len > MAX_LENGTH:
        return "TOO_LONG"

    if lang in CHARS:
        # Compile once instead of handing the raw pattern to `re` for every token.
        alpha = re.compile(CHARS[lang], re.IGNORECASE)

        # Fraction of tokens that *start* with a match of the language pattern.
        num_alpha = sum(1 for tok in src_toks if alpha.match(tok))
        if num_alpha / src_len < RATIO_ALPHA_WORDS:
            return "RATIO_ALPHA"

        # Number of pattern matches relative to the number of non-space characters.
        num_chars = len(src.replace(" ", ""))
        if not num_chars:
            # Only spaces (possible in per-character mode): nothing can match,
            # and dividing by zero would crash the filter instead of rejecting.
            return "RATIO_CHARS"
        char_alpha = len(alpha.findall(src))
        if char_alpha / num_chars < RATIO_ALPHA_CHARS:
            return "RATIO_CHARS"

    return None