scripts/launch_cnndm.py [76:144]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    }

    if exp_type == "local":
        train_instance_type = 'local'
        train_instance_count = 1

        train_path = "<data-bin location>"
        init_path = "<pretrained bart.large location>"
        output_path = '<model output location>'
        ngpus = 4 # modify based on the number of GPUs on the local machine.

        cmd = ['python', 'train.py', ]
        cmd += ['--save_dir', output_path]
        cmd += ['--train', train_path]
        cmd += ['--pretrained_path', init_path]
        cmd += ['--ngpus', '{}'.format(ngpus)]

        for key, value in hyperparameters.items():
            key = key.replace('_', '-')
            cmd.append('--{}'.format(key))
            cmd.append(str(value))
        stdout_fptr = open(output_path + "/Job_0.stdout", 'wt', encoding='utf-8')
        process = Popen(cmd, stdout=PIPE,
                        stderr=open(output_path + "/Job_0.stderr", 'wt', encoding='utf-8'),
                        encoding='utf-8',
                        bufsize=0,
                        )

        while process.poll() is None:
            line = process.stdout.readline()
            _write_screen_and_file(line, stdout_fptr)
        line = process.stdout.read()

        # special log writing for job_idx == 0
        _write_screen_and_file(line, stdout_fptr)

        if process.returncode != 0:
            raise Exception('job 0 terminated with non-zero returncode')
    else:
        train_instance_type = 'ml.p3.16xlarge'
        train_instance_count = 1
        train_path = "s3://path/to/data_bin"
        init_path = "s3://path/to/bart.large"
        image_name = "<docker-image-name>"
        output_path = "s3://path/to/output"
        role = "<sagemaker-execution-role>"
        estimator = Estimator(role=role,
                              train_instance_count=train_instance_count,
                              train_instance_type=train_instance_type,
                              train_volume_size=150,
                              image_name=image_name+':latest',
                              hyperparameters=hyperparameters,
                              base_job_name=job_name,
                              train_max_run=5 * 24 * 60 * 60,
                              output_path=output_path,
                              metric_definitions=[
                                  {'Name': 'train:loss', 'Regex': ' loss=([0-9\\.]+)'},
                              ],
                              )

        print("Start training")
        estimator.fit(
            inputs={
                "train": train_path,
                "init": init_path,
            },
            logs=True,
            wait=False,
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
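
Both launchers call a _write_screen_and_file helper that is defined earlier in each
script and is not part of the excerpt. Below is a minimal sketch of what that helper
plausibly looks like, assuming it only mirrors each chunk of subprocess output to the
terminal and appends it to the already-open log file; only the name and call signature
come from the excerpt, the body is an illustration rather than the repository's code:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import sys


def _write_screen_and_file(text, fptr):
    """Echo a chunk of subprocess output to the terminal and append it to a log file."""
    if text:  # readline()/read() may return an empty string once the pipe drains
        sys.stdout.write(text)
        sys.stdout.flush()
        fptr.write(text)
        fptr.flush()  # keep the on-disk log current while training is still running
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -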



scripts/launch_multitask_cnndm.py [78:144]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    }

    if exp_type == "local":
        train_instance_type = 'local'
        train_instance_count = 1

        train_path = "<data-bin location>"
        init_path = "<pretrained bart.large location>"
        output_path = '<model output location>'
        ngpus = 4 # modify based on the number of GPUs on the local machine.

        cmd = ['python', 'train.py', ]
        cmd += ['--save_dir', output_path]
        cmd += ['--train', train_path]
        cmd += ['--pretrained_path', init_path]
        cmd += ['--ngpus', '{}'.format(ngpus)]
        for key, value in hyperparameters.items():
            key = key.replace('_', '-')
            cmd.append('--{}'.format(key))
            cmd.append(str(value))
        stdout_fptr = open(output_path + "/Job_0.stdout", 'wt', encoding='utf-8')
        process = Popen(cmd, stdout=PIPE,
                        stderr=open(output_path + "/Job_0.stderr", 'wt', encoding='utf-8'),
                        encoding='utf-8',
                        bufsize=0,
                        )
        while process.poll() is None:
            line = process.stdout.readline()
            _write_screen_and_file(line, stdout_fptr)
        line = process.stdout.read()

        # special log writing for job_idx == 0
        _write_screen_and_file(line, stdout_fptr)

        if process.returncode != 0:
            raise Exception('job 0 terminated with non-zero returncode')
    else:
        train_instance_type = 'ml.p3.16xlarge'
        train_instance_count = 1
        train_path = "s3://path/to/data_bin"
        init_path = "s3://path/to/bart.large"
        image_name = "<docker-image-name>"
        output_path = "s3://path/to/output"
        role = "<sagemaker-execution-role>"
        estimator = Estimator(role=role,
                              train_instance_count=train_instance_count,
                              train_instance_type=train_instance_type,
                              train_volume_size=150,
                              image_name=image_name+':latest',
                              hyperparameters=hyperparameters,
                              base_job_name=job_name,
                              train_max_run=5 * 24 * 60 * 60,
                              output_path=output_path,
                              metric_definitions=[
                                  {'Name': 'train:loss', 'Regex': ' loss=([0-9\\.]+)'},
                              ],
                              )

        print("Start training")
        estimator.fit(
            inputs={
                "train": train_path,
                "init": init_path,
            },
            logs=True,
            wait=False,
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
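
The remote branch of both launchers uses SageMaker Python SDK v1 argument names
(train_instance_type, train_instance_count, train_volume_size, image_name,
train_max_run). For reference, a hedged sketch of the same job definition written
with the SDK v2 names; the S3 paths, image name, and role placeholders are carried
over from the excerpt, and the hyperparameters dict and job_name below are
illustrative stand-ins for values the launchers build above the excerpted range:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
from sagemaker.estimator import Estimator

hyperparameters = {}     # placeholder for the dict the launcher builds earlier
job_name = "cnndm-bart"  # illustrative base job name; the real value is set upstream

# SDK v2 renames: train_instance_* -> instance_*, train_volume_size -> volume_size,
# image_name -> image_uri, train_max_run -> max_run.
estimator = Estimator(
    role="<sagemaker-execution-role>",
    instance_count=1,
    instance_type="ml.p3.16xlarge",
    volume_size=150,
    image_uri="<docker-image-name>:latest",
    hyperparameters=hyperparameters,
    base_job_name=job_name,
    max_run=5 * 24 * 60 * 60,  # five-day cap, as in the excerpt
    output_path="s3://path/to/output",
    metric_definitions=[
        {"Name": "train:loss", "Regex": " loss=([0-9\\.]+)"},
    ],
)

estimator.fit(
    inputs={
        "train": "s3://path/to/data_bin",   # mounted in the container at /opt/ml/input/data/train
        "init": "s3://path/to/bart.large",  # mounted in the container at /opt/ml/input/data/init
    },
    logs=True,
    wait=False,  # return immediately; the job keeps running in SageMaker
)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -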
