pipeline/clean/tools/clean_mono.py (44 lines of code) (raw):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Filter monolingual text read from stdin, one sentence per line.

Every non-blank input line is stripped and passed to ``clean_mono``. Lines
that pass are written to stdout. Rejected lines are dropped; with ``--debug``
they are also reported on stderr as ``REASON<TAB>sentence``.
"""

import argparse
import re
import sys

from clean_parallel import CHARS

MIN_LENGTH = 2  # minimum number of words in a sentence
MAX_LENGTH = 150  # maximum number of words in a sentence
RATIO_ALPHA_WORDS = 0.4  # minimum fraction of "real" words in a sentence
RATIO_ALPHA_CHARS = 0.5  # minimum fraction of alpha characters in a sentence

# Languages whose sentences are tokenized per character instead of per word.
CJK_LANGS = frozenset(("zh", "ja", "ko"))


def main():
    """Copy accepted sentences from stdin to stdout.

    Blank lines are skipped silently. A line rejected by ``clean_mono`` is
    not written to stdout; when ``--debug`` is set, the rejection reason and
    the sentence are written to stderr, separated by a tab.
    """
    args = parse_user_args()

    for line in sys.stdin:
        src = line.strip()
        if not src:
            continue

        skip = clean_mono(src, args.lang)
        if skip:
            if args.debug:
                sys.stderr.write("{}\t{}\n".format(skip, src))
            continue

        sys.stdout.write("{}\n".format(src))


def clean_mono(src, lang):
    """Return the reason ``src`` should be discarded, or ``None`` to keep it.

    Args:
        src: The sentence to check.
        lang: Language code. It selects the tokenization (per character for
            the codes in ``CJK_LANGS``, whitespace-separated otherwise) and
            the ``CHARS`` pattern used for the alphabetic checks.

    Returns:
        One of ``"EMPTY"``, ``"TOO_SHORT"``, ``"TOO_LONG"``, ``"RATIO_ALPHA"``
        or ``"RATIO_CHARS"`` for the first check that fails, or ``None`` when
        every check passes. The two ratio checks only run when ``lang`` has
        an entry in ``CHARS``.
    """
    # TODO: move mono cleaning to OpusCleaner once it supports this:
    # https://github.com/hplt-project/OpusCleaner/issues/141

    # Treat individual characters as tokens for CJK. Iterating the string
    # yields every character, so any whitespace in it counts as a token too.
    src_toks = src if lang in CJK_LANGS else src.split()
    src_len = len(src_toks)

    if not src_len:
        return "EMPTY"
    if src_len < MIN_LENGTH:
        return "TOO_SHORT"
    if src_len > MAX_LENGTH:
        return "TOO_LONG"

    if lang in CHARS:
        # CHARS[lang] is used as a regex; presumably it matches the letters
        # of the language (it is defined in clean_parallel -- confirm there).
        alpha = re.compile(CHARS[lang], re.IGNORECASE)

        # match() anchors at the start of the token, so a token counts as
        # "real" when the pattern matches at its beginning.
        num_alpha = sum(1 for tok in src_toks if alpha.match(tok))
        if num_alpha / float(src_len) < RATIO_ALPHA_WORDS:
            return "RATIO_ALPHA"

        # Character-level ratio: pattern matches over the whole sentence,
        # relative to its length with ASCII spaces removed.
        char_alpha = len(alpha.findall(src))
        if char_alpha / float(len(src.replace(" ", ""))) < RATIO_ALPHA_CHARS:
            return "RATIO_CHARS"

    return None


def parse_user_args():
    """Parse the command line: ``-l/--lang`` (default ``en``) and ``--debug``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--lang", default="en")
    parser.add_argument("--debug", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    main()