in pytorch_translate/preprocess.py [0:0]
def preprocess_corpora(args, dictionary_cls=Dictionary):
    if pytorch_translate_data.is_latent_variable(args):
        return
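    # When the train binary paths are plain string paths (rather than sharded
    # collections) and fairseq's data format is not in use, run them through
    # maybe_generate_temp_file_path so each resolves to a usable output file
    # (presumably falling back to a temp file when needed).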
    if (
        args.train_source_binary_path is not None
        and args.train_target_binary_path is not None
    ):
        if (
            isinstance(
                utils.maybe_parse_collection_argument(args.train_source_binary_path),
                str,
            )
            and isinstance(
                utils.maybe_parse_collection_argument(args.train_target_binary_path),
                str,
            )
            and not args.fairseq_data_format
        ):
            args.train_source_binary_path = maybe_generate_temp_file_path(
                args.train_source_binary_path
            )
            args.train_target_binary_path = maybe_generate_temp_file_path(
                args.train_target_binary_path
            )
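    # Likewise resolve the eval binary paths unless fairseq's data format is used.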
    if not args.fairseq_data_format:
        args.eval_source_binary_path = maybe_generate_temp_file_path(
            args.eval_source_binary_path
        )
        args.eval_target_binary_path = maybe_generate_temp_file_path(
            args.eval_target_binary_path
        )
    # Additional text preprocessing options could be added here before
    # binarizing.
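    # Route to the preprocessing path that matches the task: multilingual
    # (many-to-many), multilingual many-to-one, or standard bilingual.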
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        preprocess_corpora_multilingual_many_to_one(args, dictionary_cls)
    else:
        # Vocabs are built before preprocessing because we might need to use
        # both monolingual and bilingual corpora sources to build the vocab
        # (in the case of semisupervised training)
        dictionaries = build_vocabs(args=args, dictionary_cls=dictionary_cls)
        source_dict = dictionaries["source_dict"]
        char_source_dict = dictionaries["char_source_dict"]
        target_dict = dictionaries["target_dict"]
        char_target_dict = dictionaries["char_target_dict"]
        if char_target_dict is not None:
            print("char_target_dict is not None --> should use it!")
        preprocess_bilingual_corpora(
            args=args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
            char_target_dict=char_target_dict,
        )
        # Binarize additional monolingual corpora for the semisupervised translation
        # task
        if (
            args.task == constants.SEMI_SUPERVISED_TASK
            or args.task == constants.DENOISING_AUTOENCODER_TASK
        ):
            args.train_mono_source_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_source_binary_path", None)
            )
            args.train_mono_target_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_target_binary_path", None)
            )
            preprocess_monolingual_corpora(
                args,
                source_dict=source_dict,
                char_source_dict=char_source_dict,
                target_dict=target_dict,
                char_target_dict=char_target_dict,
            )
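The path resolution above leans on maybe_generate_temp_file_path. The following is a minimal, self-contained sketch of what that helper is assumed to do (reuse an explicitly provided path, otherwise create a fresh temp file); it is an illustration under that assumption, not the repository's actual implementation, and the sketch's name and example paths are hypothetical.

import os
import tempfile


def maybe_generate_temp_file_path_sketch(output_path=None):
    # Reuse an explicitly provided output path as-is.
    if output_path:
        return output_path
    # Otherwise create an empty temp file and return its path so the
    # binarization step has somewhere to write.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path


# Example: an explicit path is passed through; None yields a generated temp path.
print(maybe_generate_temp_file_path_sketch("/tmp/train_source_binarized"))
print(maybe_generate_temp_file_path_sketch(None))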