pipeline/clean/tools/clean_parallel.py (99 lines of code) (raw):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import re
import sys
# The variables below need to be adjusted for a language pair and dataset.
# To add a new language, define the list of alpha characters in the dict below.
MIN_LENGTH = 1 # minimum number of words in a sentence, should be > 0
MAX_LENGTH = 150 # maximum number of words in a sentence
RATIO_LENGTH = 0.5 # minimum length ratio of source/target and target/source
RATIO_ALPHA_WORDS = 0.4 # minimum fraction of "real" words in a source sentence
RATIO_ALPHA_CHARS = 0.5 # minimum fraction of alpha characters in a source sentence
CHARS = {
"bg": r"[АаБбВвГгДддЕеЖжЗзИиЙйКкkasЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя]",
"cs": r"[a-zÁáČčĎďÉéěÍíŇňÓóŘřŠšŤťÚúůÝýŽž]",
"ca": r"[a-zÀàÈèÉéÍíÒòÓóÚúÇç]",
"da": r"[a-zÆæØøÅå]",
"de": r"[a-zÄäÖöÜüß]",
"en": r"[a-z]",
"el": r"[a-zΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω]",
"es": r"[a-zÁáÉéÍíÓóÚúñÑ]",
"et": r"[a-zÕõÄäÖöÜü]",
"eu": r"[a-zñÑ]",
"fi": r"[a-zÅåÄäÖö]",
"fr": r"[a-zÂâÁáÀàâÇçÉéÈèÊêÓóÒòÔôŒœÜüÛûŸÿ]",
"ga": r"[abcdefghilmnoprstuáéíóúÁÉÍÓÚ]",
"gl": r"[a-zÁáÉéÍíÓóÚúÑñ]",
"hr": r"[abcčČćĆdđĐefghijklmnoprsšŠtuvzžŽ]",
"hu": r"[a-zÁáÉéÍíÓóÖöŐőŰű]",
"is": r"[abdefghijklmnoprstuvxyÁáðÐÉéÍíÓóÚúÝýÞþÆæÖö]",
"it": r"[a-zàÀèÈéÉìÌíÍîÎòÒóÓùÙúÚ]",
"lt": r"[aąbcČčdeĘęĖėfghiĮįyjklmnoprsŠštuŲųŪūvzŽž]",
"lv": r"[aĀābcČčdeĒēfgĢģhiĪījkĶķlĻļmnŅņoprsŠštuŪūvzŽž]",
"mt": r"[abĊċdefĠġghĦħiiejklmnopqrstuvwxŻżz]",
"nb": r"[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]",
"nl": r"[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÚú]",
"no": r"[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]",
"nn": r"[a-zÂâÁáÀàâÉéÈèÊêÓóÒòÔôÜüÆæØøÅå]",
"pl": r"[a-zĄąĆćĘꣳŃńÓóŚśŹźŻż]",
"pt": r"[a-zÂâÁáÀàÃãÇçÉéÈèÊêÍíÌìÓóÒòÔôÕõÚúÙù]",
"ro": r"[a-zĂăÂâÎîȘșȚț]",
"ru": r"[а-я]",
"sk": r"[a-záäÁÄčČďĎžéÉíÍĺĹľĽňŇóÓôÔŕŔšŠťŤúÚýÝžŽ]",
"sl": r"[abcčČdđĐefghijklmnoprsšŠtuvzžŽ]",
"sv": r"[a-zÅåÄäÖö]",
}
def main():
args = parse_user_args()
for i, line in enumerate(sys.stdin):
fields = line.strip().split("\t")
if len(fields) < 2:
continue
src = fields[-2].strip()
trg = fields[-1].strip()
skip = clean_parallel(src, trg, args.src_lang, args.trg_lang)
if skip:
if args.debug:
sys.stderr.write("{}\t{}".format(skip, line))
continue
sys.stdout.write(line)
def clean_parallel(src, trg, src_lang, trg_lang):
if src.lower() == trg.lower():
return "IDENTICAL"
src_toks = src.split()
trg_toks = trg.split()
src_len = len(src_toks)
trg_len = len(trg_toks)
if not src_len or not trg_len:
return "EMPTY"
# https://stackoverflow.com/questions/23680976/python-removing-non-latin-characters
# if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', src):
# return "SRC_NON_LATIN"
# if re.search(u'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', trg):
# return "TRG_NON_LATIN"
ratio_len = src_len / float(trg_len)
if ratio_len < RATIO_LENGTH or ratio_len > (1.0 / RATIO_LENGTH):
return "RATIO_LENGTH"
if src_len < MIN_LENGTH or trg_len < MIN_LENGTH:
return "TOO_SHORT"
if src_len > MAX_LENGTH or trg_len > MAX_LENGTH:
return "TOO_LONG"
if src_lang in CHARS:
num_alpha = sum(
[1 if re.match(CHARS[src_lang], t, re.IGNORECASE) else 0 for t in src_toks]
)
if num_alpha / float(src_len) < RATIO_ALPHA_WORDS:
return "RATIO_ALPHA_SRC"
char_alpha = len(re.findall(CHARS[src_lang], src, re.IGNORECASE))
if char_alpha / float(len(src.replace(" ", ""))) < RATIO_ALPHA_CHARS:
return "RATIO_CHARS_SRC"
if trg_lang in CHARS:
num_alpha = sum(
[1 if re.match(CHARS[trg_lang], t, re.IGNORECASE) else 0 for t in trg_toks]
)
if num_alpha / float(trg_len) < RATIO_ALPHA_WORDS:
return "RATIO_ALPHA_TRG"
char_alpha = len(re.findall(CHARS[trg_lang], trg, re.IGNORECASE))
if char_alpha / float(len(trg.replace(" ", ""))) < RATIO_ALPHA_CHARS:
return "RATIO_CHARS_TRG"
return None
def parse_user_args():
parser = argparse.ArgumentParser()
parser.add_argument("-l1", "--src-lang", default="es")
parser.add_argument("-l2", "--trg-lang", default="en")
parser.add_argument("--debug", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
main()