def clean_parallel()

in pipeline/clean/tools/clean_parallel.py [0:0]


def _check_alpha_ratios(text, toks, pattern, words_label, chars_label):
    """Return a rejection label if one side has too little alphabetic content.

    Args:
        text: The raw sentence.
        toks: ``text`` split on whitespace; must be non-empty.
        pattern: Regex taken from ``CHARS`` for the sentence's language
            (presumably a single-character class -- confirm against the
            ``CHARS`` definition).
        words_label: Label returned when the word-level ratio is too low.
        chars_label: Label returned when the character-level ratio is too low.

    Returns:
        ``words_label``, ``chars_label``, or ``None`` if both checks pass.
    """
    # re.match anchors at the start of the token, so a token counts as
    # alphabetic when its beginning matches the pattern.
    num_alpha = sum(1 for t in toks if re.match(pattern, t, re.IGNORECASE))
    if num_alpha / float(len(toks)) < RATIO_ALPHA_WORDS:
        return words_label

    # The denominator drops plain spaces only; any other whitespace in the
    # text still counts as a character here.
    char_alpha = len(re.findall(pattern, text, re.IGNORECASE))
    if char_alpha / float(len(text.replace(" ", ""))) < RATIO_ALPHA_CHARS:
        return chars_label

    return None


def clean_parallel(src, trg, src_lang, trg_lang):
    """Decide whether a parallel sentence pair should be filtered out.

    The checks run in the order listed below and the first one that fails
    determines the result.

    Args:
        src: Source-side sentence.
        trg: Target-side sentence.
        src_lang: Key used to look up the source alphabet regex in ``CHARS``.
        trg_lang: Key used to look up the target alphabet regex in ``CHARS``.

    Returns:
        ``None`` if the pair passes every check, otherwise one of:

        * ``"IDENTICAL"`` -- both sides are equal ignoring case.
        * ``"EMPTY"`` -- either side has no whitespace-separated tokens.
        * ``"RATIO_LENGTH"`` -- the token-count ratio src/trg is below
          ``RATIO_LENGTH`` or above ``1 / RATIO_LENGTH``.
        * ``"TOO_SHORT"`` / ``"TOO_LONG"`` -- either side has fewer than
          ``MIN_LENGTH`` or more than ``MAX_LENGTH`` tokens.
        * ``"RATIO_ALPHA_SRC"`` / ``"RATIO_CHARS_SRC"`` -- the source side
          fails the word-level / character-level alphabetic ratio.
        * ``"RATIO_ALPHA_TRG"`` / ``"RATIO_CHARS_TRG"`` -- same for the
          target side.

        The alphabetic checks are skipped for a side whose language has no
        entry in ``CHARS``.
    """
    # NOTE: this runs before the emptiness check, so two empty strings are
    # reported as "IDENTICAL" rather than "EMPTY".
    if src.lower() == trg.lower():
        return "IDENTICAL"

    src_toks = src.split()
    trg_toks = trg.split()
    src_len = len(src_toks)
    trg_len = len(trg_toks)

    if not src_len or not trg_len:
        return "EMPTY"

    # Disabled non-Latin-script filter, kept for reference:
    # https://stackoverflow.com/questions/23680976/python-removing-non-latin-characters
    # if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', src):
    #    return "SRC_NON_LATIN"

    # if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', trg):
    #    return "TRG_NON_LATIN"

    # trg_len is guaranteed non-zero by the EMPTY check above.
    ratio_len = src_len / float(trg_len)
    if ratio_len < RATIO_LENGTH or ratio_len > (1.0 / RATIO_LENGTH):
        return "RATIO_LENGTH"

    if src_len < MIN_LENGTH or trg_len < MIN_LENGTH:
        return "TOO_SHORT"

    if src_len > MAX_LENGTH or trg_len > MAX_LENGTH:
        return "TOO_LONG"

    if src_lang in CHARS:
        reason = _check_alpha_ratios(
            src, src_toks, CHARS[src_lang], "RATIO_ALPHA_SRC", "RATIO_CHARS_SRC"
        )
        if reason is not None:
            return reason

    if trg_lang in CHARS:
        reason = _check_alpha_ratios(
            trg, trg_toks, CHARS[trg_lang], "RATIO_ALPHA_TRG", "RATIO_CHARS_TRG"
        )
        if reason is not None:
            return reason

    return None