def clean_mono()

in pipeline/clean/tools/clean_mono.py [0:0]


def clean_mono(src, lang):
    """Check a monolingual line against the length and alphabet filters.

    The checks run in a fixed order and the first one that fails decides
    the result. The thresholds (``MIN_LENGTH``, ``MAX_LENGTH``,
    ``RATIO_ALPHA_WORDS``, ``RATIO_ALPHA_CHARS``) and the per-language
    ``CHARS`` patterns are module-level names defined outside this function.

    Args:
        src: The sentence to check.
        lang: Language code; selects the tokenisation and the ``CHARS``
            pattern. The alphabet checks are skipped when ``lang`` has no
            entry in ``CHARS``.

    Returns:
        ``None`` when every check passes, otherwise the name of the first
        failed check: ``"EMPTY"``, ``"TOO_SHORT"``, ``"TOO_LONG"``,
        ``"RATIO_ALPHA"`` or ``"RATIO_CHARS"``.
    """
    # TODO: move mono cleaning to OpusCleaner
    #  once it supports this https://github.com/hplt-project/OpusCleaner/issues/141

    if lang in {"zh", "ja", "ko"}:
        # CJK: every character of the string counts as one token.
        tokens = src
    else:
        tokens = src.split()
    token_count = len(tokens)

    if token_count == 0:
        return "EMPTY"
    if token_count < MIN_LENGTH:
        return "TOO_SHORT"
    if token_count > MAX_LENGTH:
        return "TOO_LONG"

    if lang not in CHARS:
        # No alphabet pattern for this language: nothing more to verify.
        return None

    alphabet = re.compile(CHARS[lang], re.IGNORECASE)

    # Share of tokens that start with a match of the language's pattern.
    matching_tokens = sum(1 for token in tokens if alphabet.match(token))
    if matching_tokens / float(token_count) < RATIO_ALPHA_WORDS:
        return "RATIO_ALPHA"

    # Number of pattern matches relative to the number of non-space characters.
    non_space_chars = len(src.replace(" ", ""))
    matching_chars = len(alphabet.findall(src))
    if matching_chars / float(non_space_chars) < RATIO_ALPHA_CHARS:
        return "RATIO_CHARS"

    return None