sockeye/prepare_data.py [25:74]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
logger = logging.getLogger(__name__)


def main():
    params = arguments.ConfigArgumentParser(description='Preprocesses and shards training data.')
    arguments.add_prepare_data_cli_args(params)
    args = params.parse_args()
    prepare_data(args)


def prepare_data(args: argparse.Namespace):
    output_folder = os.path.abspath(args.output)
    os.makedirs(output_folder, exist_ok=True)
    setup_main_logger(console=not args.quiet,
                      file_logging=not args.no_logfile,
                      path=os.path.join(output_folder, C.LOG_NAME))
    utils.log_basic_info(args)
    arguments.save_args(args, os.path.join(output_folder, C.ARGS_STATE_NAME))
    utils.seed_rngs(args.seed)

    minimum_num_shards = args.min_num_shards
    samples_per_shard = args.num_samples_per_shard
    bucketing = not args.no_bucketing
    bucket_width = args.bucket_width
    bucket_scaling = args.bucket_scaling

    source_paths = [args.source] + args.source_factors
    source_factor_vocab_paths = [args.source_factor_vocabs[i] if i < len(args.source_factor_vocabs)
                                 else None for i in range(len(args.source_factors))]
    source_vocab_paths = [args.source_vocab] + source_factor_vocab_paths
    target_paths = [args.target] + args.target_factors
    target_factor_vocab_paths = [args.target_factor_vocabs[i] if i < len(args.target_factor_vocabs)
                                 else None for i in range(len(args.target_factors))]
    target_vocab_paths = [args.target_vocab] + target_factor_vocab_paths

    num_words_source, num_words_target = args.num_words
    num_words_source = num_words_source if num_words_source > 0 else None
    num_words_target = num_words_target if num_words_target > 0 else None

    word_min_count_source, word_min_count_target = args.word_min_count
    max_seq_len_source, max_seq_len_target = args.max_seq_len
    # The maximum length is the length before we add the BOS/EOS symbols
    max_seq_len_source = max_seq_len_source + C.SPACE_FOR_XOS
    max_seq_len_target = max_seq_len_target + C.SPACE_FOR_XOS
    logger.info("Adjusting maximum length to reserve space for a BOS/EOS marker. New maximum length: (%d, %d)",
                max_seq_len_source, max_seq_len_target)

    # Split input into shards and randomly assign data to shards
    with utils.smart_open(source_paths[0], mode='rb') as infile:
        num_sents = sum(1 for _ in infile)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


sockeye/prepare_data_pt.py [25:74]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
logger = logging.getLogger(__name__)


def main():
    params = arguments.ConfigArgumentParser(description='Preprocesses and shards training data.')
    arguments.add_prepare_data_cli_args(params)
    args = params.parse_args()
    prepare_data(args)


def prepare_data(args: argparse.Namespace):
    output_folder = os.path.abspath(args.output)
    os.makedirs(output_folder, exist_ok=True)
    setup_main_logger(console=not args.quiet,
                      file_logging=not args.no_logfile,
                      path=os.path.join(output_folder, C.LOG_NAME))
    utils.log_basic_info(args)
    arguments.save_args(args, os.path.join(output_folder, C.ARGS_STATE_NAME))
    utils.seed_rngs(args.seed)

    minimum_num_shards = args.min_num_shards
    samples_per_shard = args.num_samples_per_shard
    bucketing = not args.no_bucketing
    bucket_width = args.bucket_width
    bucket_scaling = args.bucket_scaling

    source_paths = [args.source] + args.source_factors
    source_factor_vocab_paths = [args.source_factor_vocabs[i] if i < len(args.source_factor_vocabs)
                                 else None for i in range(len(args.source_factors))]
    source_vocab_paths = [args.source_vocab] + source_factor_vocab_paths
    target_paths = [args.target] + args.target_factors
    target_factor_vocab_paths = [args.target_factor_vocabs[i] if i < len(args.target_factor_vocabs)
                                 else None for i in range(len(args.target_factors))]
    target_vocab_paths = [args.target_vocab] + target_factor_vocab_paths

    num_words_source, num_words_target = args.num_words
    num_words_source = num_words_source if num_words_source > 0 else None
    num_words_target = num_words_target if num_words_target > 0 else None

    word_min_count_source, word_min_count_target = args.word_min_count
    max_seq_len_source, max_seq_len_target = args.max_seq_len
    # The maximum length is the length before we add the BOS/EOS symbols
    max_seq_len_source = max_seq_len_source + C.SPACE_FOR_XOS
    max_seq_len_target = max_seq_len_target + C.SPACE_FOR_XOS
    logger.info("Adjusting maximum length to reserve space for a BOS/EOS marker. New maximum length: (%d, %d)",
                max_seq_len_source, max_seq_len_target)

    # Split input into shards and randomly assign data to shards
    with utils.smart_open(source_paths[0], mode='rb') as infile:
        num_sents = sum(1 for _ in infile)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -