in source/lib/text_processing.py [0:0]
import os
from subprocess import run

# REM_NON_PRINT_CHAR, NORM_PUNC, DESCAPE, MOSES_TOKENIZER, MECAB and
# ROMAN_LC are module-level constants holding the commands/paths of the
# external preprocessing tools, assumed to be defined elsewhere in this file.


def Token(inp_fname, out_fname, lang='en',
          lower_case=True, romanize=False, descape=False,
          verbose=False, over_write=False, gzip=False):
    assert lower_case, 'lower case is needed by all the models'
    assert not over_write, 'over-write is not yet implemented'
    if not os.path.isfile(out_fname):
        cat = 'zcat ' if gzip else 'cat '
        roman = lang if romanize else 'none'
        # map some ISO 639-3 language codes to the two-letter codes
        # expected by the downstream tools
        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'
        if verbose:
            print(' - Tokenizer: {} in language {} {} {} {}'
                  .format(os.path.basename(inp_fname), lang,
                          '(gzip)' if gzip else '',
                          '(de-escaped)' if descape else '',
                          '(romanized)' if romanize else ''))
        # build and run a shell pipeline: strip non-printable characters,
        # normalize punctuation, optionally de-escape, tokenize with Moses,
        # segment Chinese (jieba) or Japanese (mecab), then romanize
        # and/or lower-case
        run(cat + inp_fname
            + '|' + REM_NON_PRINT_CHAR
            + '|' + NORM_PUNC + lang
            + ('|' + DESCAPE if descape else '')
            + '|' + MOSES_TOKENIZER + lang
            + ('| python3 -m jieba -d ' if lang == 'zh' else '')
            + ('|' + MECAB + '/bin/mecab -O wakati -b 50000 '
               if lang == 'ja' else '')
            + '|' + ROMAN_LC + roman
            + '>' + out_fname,
            env=dict(os.environ, LD_LIBRARY_PATH=MECAB + '/lib'),
            shell=True)
    elif not over_write and verbose:
        print(' - Tokenizer: {} exists already'
              .format(os.path.basename(out_fname)))
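

# Example usage (a minimal sketch; the file names are hypothetical, and the
# external tools referenced by the constants above must be installed for the
# pipeline to run):
#
#   Token('corpus.en.txt.gz', 'corpus.en.tok',
#         lang='en', verbose=True, gzip=True)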