in learn_bpe.py [0:0]
def main():
    """Command-line entry point: parse arguments and run ``learn_bpe``.

    The standard streams are rebound to UTF-8 codec wrappers, and every
    input/output argument that is not stdin/stdout is reopened through
    ``codecs.open`` with UTF-8 encoding before being handed to
    ``learn_bpe``.  Files opened here are closed again when ``learn_bpe``
    returns or raises, so buffered output is flushed deterministically.
    """
    # Full compatibility with original implementation.
    # It is not known exactly why this differs from the standard file
    # objects, but some special UTF-8 symbols are handled differently if
    # the codecs wrappers are not installed.
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    # Handles created by this function; only these are closed on exit, the
    # standard streams are left alone.
    opened = []
    try:
        # Full compatibility with original implementation: go through
        # codecs.open instead of using the parser-provided file objects.
        for i, infile in enumerate(args.input):
            if infile.name != '<stdin>':
                path = infile.name
                # Release the handle being replaced instead of leaking it.
                # NOTE(review): assumes the parser yields file objects
                # (they expose ``.name``) -- confirm in create_arg_parser.
                infile.close()
                args.input[i] = codecs.open(path, encoding='utf-8')
                opened.append(args.input[i])

        if args.output.name != '<stdout>':
            path = args.output.name
            args.output.close()
            args.output = codecs.open(path, 'w', encoding='utf-8')
            opened.append(args.output)

        learn_bpe(
            args.input,
            args.output,
            args.symbols,
            probabilistic=args.probabilistic,
            frac_stopping=args.frac_stopping,
            frac_stopping_average_n=args.frac_stopping_average,
            min_frequency=args.min_frequency,
            is_dict=args.dict_input,
            total_symbols=args.total_symbols,
            verbose=args.verbose,
        )
    finally:
        # Closing is idempotent, so this is safe even if learn_bpe already
        # closed one of the files itself.
        for handle in opened:
            handle.close()