in pipeline/clean/tools/clean_parallel.py [0:0]
def clean_parallel(src, trg, src_lang, trg_lang):
    """Classify a source/target sentence pair for filtering.

    Checks run in a fixed order and the first failing check decides the
    result: identical text, empty side, token-count ratio, minimum and
    maximum token counts, then per-side alphabetic ratios (source before
    target). The alphabetic checks only run for a side whose language has
    an entry in ``CHARS``.

    Args:
        src: Source-side sentence text.
        trg: Target-side sentence text.
        src_lang: Key used to look up the source pattern in ``CHARS``.
        trg_lang: Key used to look up the target pattern in ``CHARS``.

    Returns:
        ``None`` when no check fails, otherwise one of the reason codes
        ``"IDENTICAL"``, ``"EMPTY"``, ``"RATIO_LENGTH"``, ``"TOO_SHORT"``,
        ``"TOO_LONG"``, ``"RATIO_ALPHA_SRC"``, ``"RATIO_CHARS_SRC"``,
        ``"RATIO_ALPHA_TRG"`` or ``"RATIO_CHARS_TRG"``.
    """
    # Case-insensitive copies carry no translation signal.
    if src.lower() == trg.lower():
        return "IDENTICAL"

    # Lengths below are counted in whitespace-separated tokens.
    src_words = src.split()
    trg_words = trg.split()
    n_src = len(src_words)
    n_trg = len(trg_words)
    if n_src == 0 or n_trg == 0:
        return "EMPTY"

    # NOTE: the non-Latin-script filter below is intentionally disabled.
    # https://stackoverflow.com/questions/23680976/python-removing-non-latin-characters
    # if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', src):
    #     return "SRC_NON_LATIN"
    # if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', trg):
    #     return "TRG_NON_LATIN"

    # The ratio must lie within [RATIO_LENGTH, 1 / RATIO_LENGTH].
    length_ratio = float(n_src) / n_trg
    if length_ratio < RATIO_LENGTH or length_ratio > 1.0 / RATIO_LENGTH:
        return "RATIO_LENGTH"

    if min(n_src, n_trg) < MIN_LENGTH:
        return "TOO_SHORT"
    if max(n_src, n_trg) > MAX_LENGTH:
        return "TOO_LONG"

    # Source is examined before target, so a pair failing on both sides
    # reports the source-side reason.
    sides = (
        (src_lang, src, src_words, n_src, "RATIO_ALPHA_SRC", "RATIO_CHARS_SRC"),
        (trg_lang, trg, trg_words, n_trg, "RATIO_ALPHA_TRG", "RATIO_CHARS_TRG"),
    )
    for lang, text, words, n_words, word_reason, char_reason in sides:
        if lang not in CHARS:
            continue
        pattern = CHARS[lang]

        # re.match anchors at the token start, so a token counts when it
        # begins with a match of the language pattern.
        matching_words = sum(
            1 for word in words if re.match(pattern, word, re.IGNORECASE)
        )
        if float(matching_words) / n_words < RATIO_ALPHA_WORDS:
            return word_reason

        # Pattern matches over the whole text, relative to its length with
        # space characters removed (other whitespace still counts).
        matching_chars = len(re.findall(pattern, text, re.IGNORECASE))
        if float(matching_chars) / len(text.replace(" ", "")) < RATIO_ALPHA_CHARS:
            return char_reason

    return None