def preprocess_corpora()

in pytorch_translate/preprocess.py [0:0]


def preprocess_corpora(args, dictionary_cls=Dictionary):
    """Prepare (binarize) training/eval corpora per the mode implied by args.

    Dispatches to the multilingual, many-to-one, or bilingual preprocessing
    path; for the bilingual path it first builds vocabularies, then binarizes,
    and additionally binarizes monolingual corpora for semi-supervised /
    denoising-autoencoder tasks. Mutates the *_binary_path attributes on
    ``args`` in place when temp-file paths need to be generated.
    """
    # Latent-variable data handles its own pipeline; nothing to preprocess.
    if pytorch_translate_data.is_latent_variable(args):
        return

    train_src = args.train_source_binary_path
    train_tgt = args.train_target_binary_path
    if train_src is not None and train_tgt is not None:
        # Only plain string paths (not collection arguments) in the
        # non-fairseq format get routed through temp-file generation.
        src_is_plain = isinstance(
            utils.maybe_parse_collection_argument(train_src), str
        )
        tgt_is_plain = isinstance(
            utils.maybe_parse_collection_argument(train_tgt), str
        )
        if src_is_plain and tgt_is_plain and not args.fairseq_data_format:
            args.train_source_binary_path = maybe_generate_temp_file_path(train_src)
            args.train_target_binary_path = maybe_generate_temp_file_path(train_tgt)

    if not args.fairseq_data_format:
        args.eval_source_binary_path = maybe_generate_temp_file_path(
            args.eval_source_binary_path
        )
        args.eval_target_binary_path = maybe_generate_temp_file_path(
            args.eval_target_binary_path
        )

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
        return
    if pytorch_translate_data.is_multilingual_many_to_one(args):
        preprocess_corpora_multilingual_many_to_one(args, dictionary_cls)
        return

    # Bilingual path. Vocabs are built before preprocessing because we might
    # need to use both monolingual and bilingual corpora sources to build the
    # vocab (in the case of semisupervised training).
    dictionaries = build_vocabs(args=args, dictionary_cls=dictionary_cls)
    source_dict = dictionaries["source_dict"]
    char_source_dict = dictionaries["char_source_dict"]
    target_dict = dictionaries["target_dict"]
    char_target_dict = dictionaries["char_target_dict"]

    if char_target_dict is not None:
        print("char_target_dict is not None --> should use it!")

    preprocess_bilingual_corpora(
        args=args,
        source_dict=source_dict,
        char_source_dict=char_source_dict,
        target_dict=target_dict,
        char_target_dict=char_target_dict,
    )

    # Binarize additional monolingual corpora for the semisupervised
    # translation task.
    if args.task in (
        constants.SEMI_SUPERVISED_TASK,
        constants.DENOISING_AUTOENCODER_TASK,
    ):
        args.train_mono_source_binary_path = maybe_generate_temp_file_path(
            output_path=getattr(args, "train_mono_source_binary_path", None)
        )
        args.train_mono_target_binary_path = maybe_generate_temp_file_path(
            output_path=getattr(args, "train_mono_target_binary_path", None)
        )
        preprocess_monolingual_corpora(
            args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
            char_target_dict=char_target_dict,
        )