def main()

in PyTorchClassification/run_training.py [0:0]


def main():
    """Train every configured model in sequence by launching ``train.py``.

    A list of name tags and a matching list of command-line parameter sets
    is built first. For each tag, in order:

    * if the file returned by ``get_best_model_path(output_dir, tag)`` already
      exists, the training of that tag is skipped;
    * otherwise ``train.py`` is started through ``torch.distributed.launch``
      with one process per device reported by ``torch.cuda.device_count()``.
      If a ``checkpoint.pth.tar`` is present in the current working
      directory, ``--resume checkpoint.pth.tar`` is appended to the command;
    * after the run, ``model_best.pth.tar``, ``checkpoint.pth.tar`` and the
      ``log`` directory are moved from the current working directory into
      ``get_result_dir(output_dir, tag)``.

    Raises:
        AssertionError: if the number of tags and parameter sets differ, or
            if a training run did not leave a ``model_best.pth.tar`` behind.
        subprocess.CalledProcessError: if the training command exits with a
            non-zero status.
    """
    # Local import: only needed to locate the interpreter running this script.
    import sys

    # Output directory for all the models after successful training
    output_dir = 'result/run1'
    # Params shared across all runs, e.g. the image directory
    shared_params = ['--data_root', '/data/animals2/species_extended/',
                     '--train_file', 'trainval_animalsExtended2017.json',
                     '--val_file', 'minival_animalsExtended2017.json',
                     '--label_smoothing', '0.15']
    # Name tags for the different models that we will train
    tags = []
    # The run specific parameters, should correspond to the order in TAGS
    params = []

    ### Preparing the training configurations
    # For each model training, we define a tag and the parameters.
    # The examples below are configured for a single GPU.
    # For multiple GPUs, you might want to divide learning rate
    # and batch size by the number of GPUs and enable --sync_bn for
    # identical results.

    # For the best results with ResNeXt, train this setting on a machine with only one GPU
    tags.append('resnext_448_init')
    params.append(['--model_type', 'resnext101',
                   '--image_size', '448',
                   '--epochs', '1',
                   '--epoch_decay', '4',
                   '--lr_decay', '0.1',
                   '--lr', '0.01',
                   '--warm_up_iterations', '0',
                   '--train_logits_only',
                   '--batch_size', '16',
                   '--use_onevsall_loss'])

    tags.append('resnext_448')
    params.append(['--model_type', 'resnext101',
                   '--image_size', '448',
                   '--epochs', '100',
                   '--epoch_decay', '30',
                   '--lr_decay', '0.1',
                   '--lr', '0.01',
                   '--batch_size', '16',
                   '--warm_up_iterations', '3200',
                   '--use_onevsall_loss',
                   '--resume', get_best_model_path(output_dir, 'resnext_448_init')])

    ### Inception V4 299px training (disabled, kept for reference)
    #tags.append('inc4_299_init')
    #params.append(['--model_type', 'inceptionv4',
    #            '--image_size', '299',
    #            '--epochs', '1',
    #            '--epoch_decay', '4',
    #            '--lr_decay', '0.94',
    #            '--lr', '0.00225',
    #            '--batch_size', '16'])
    #
    #tags.append('inc4_299')
    #params.append(['--model_type', 'inceptionv4',
    #            '--image_size', '299',
    #            '--epochs', '25',
    #            '--epoch_decay', '4',
    #            '--lr_decay', '0.94',
    #            '--lr', '0.0045',
    #            '--batch_size', '32',
    #            '--resume', get_best_model_path(output_dir, 'inc4_299_init')])

    ### Inception V4 448px training (disabled, kept for reference)
    # We could add the parameter --start_epoch 0 to reset the learning rate if needed.
    # With the configuration below, we will train from epoch 25 to 50 with 448px input.
    # If you add --start_epoch 0, you probably want to set --epochs 25 to keep the training
    # duration the same.
    #tags.append('inc4_448')
    #params.append(['--model_type', 'inceptionv4',
    #            '--image_size', '448',
    #            '--epochs', '50',
    #            '--epoch_decay', '4',
    #            '--lr_decay', '0.94',
    #            '--lr', '0.0045',
    #            '--batch_size', '32',
    #            '--resume', get_best_model_path(output_dir, 'inc4_299')])

    ### Inception V4 560px training
    # Train this on a machine with two GPUs for best results, starting directly at 560px
    tags.append('inc4_560_init')
    params.append(['--model_type', 'inceptionv4',
                   '--image_size', '560',
                   '--epochs', '1',
                   '--epoch_decay', '4',
                   '--lr_decay', '0.94',
                   '--warm_up_iterations', '0',
                   '--train_logits_only',
                   '--lr', '0.00225',
                   '--batch_size', '16'])

    tags.append('inc4_560')
    params.append(['--model_type', 'inceptionv4',
                   '--image_size', '560',
                   '--epochs', '250',
                   '--epoch_decay', '4',
                   '--lr_decay', '0.94',
                   '--lr', '0.00225',
                   '--batch_size', '16',
                   '--resume', get_best_model_path(output_dir, 'inc4_560_init')])

    ### Example of fine-tuning of Inception V4 560px on validation data
    # we probably don't need this here as we will fine-tune the whole ensemble
    # This code is here just for reference
    #tags.append('inc4_560_valft')
    #params.append(['--model_type', 'inceptionv4',
    #            '--image_size', '560',
    #            '--epochs', '250',
    #            '--epoch_decay', '4',
    #            '--lr_decay', '0.94',
    #            '--lr', '0.0045',
    #            '--batch_size', '32',
    #            '--resume', get_best_model_path(output_dir, 'inc4_560'),
    #            '--train_file', 'val_wo_minival2017.json'])

    # Train the ensemble
    ### Inception V4 (from the 560px run) + ResNeXt (from the 448px run), trained at 496px
    # We want the learning rate to smoothly continue where the ResNeXt model training finished,
    # so we chose the starting epoch to be where the ResNeXt training finished and
    # leave the initial learning rate untouched
    tags.append('inc4_496_resnext_496_ft')
    params.append(['--model_type', 'inceptionv4_resnext101',
                   '--image_size', '496', '496',
                   '--epochs', '150',
                   '--start_epoch', '100',
                   '--epoch_decay', '30',
                   '--lr_decay', '0.1',
                   '--lr', '0.005',
                   '--batch_size', '8',
                   '--resume',
                   get_best_model_path(output_dir, 'inc4_560'),
                   get_best_model_path(output_dir, 'resnext_448')])

    # Checking if everything is set up properly. Raised explicitly instead of
    # using `assert` so the check also runs under `python -O`; otherwise zip()
    # below would silently drop the unmatched entries.
    if len(tags) != len(params):
        raise AssertionError(
            'Configuration error: {} tags but {} parameter sets'.format(len(tags), len(params)))

    # Files that train.py is expected to leave in the current working directory
    checkpoint_file = 'checkpoint.pth.tar'
    best_model_file = 'model_best.pth.tar'
    # Launch the training with the interpreter running this script, so that an
    # active virtualenv/conda environment is respected even if `python` on the
    # PATH points somewhere else. Fall back to 'python' if it is unknown.
    python_exe = sys.executable or 'python'

    ### The actual training
    for tag, param in zip(tags, params):
        print('Starting training of', tag)
        result_dir = get_result_dir(output_dir, tag)
        model_best = get_best_model_path(output_dir, tag)
        if os.path.isfile(model_best):
            print('Found existing trained model at {}, skipping the training of {}'.format(model_best, tag))
            continue

        # A checkpoint left in the working directory means an earlier run was
        # interrupted; continue from it. The flag is appended after the
        # configured parameters -- presumably the later --resume takes
        # precedence in train.py's argument parsing (TODO confirm).
        if os.path.isfile(checkpoint_file):
            resume_param = ['--resume', checkpoint_file]
        else:
            resume_param = []

        # check=True raises CalledProcessError if the training fails
        subprocess.run([python_exe,
                        '-m', 'torch.distributed.launch',
                        '--nproc_per_node={}'.format(torch.cuda.device_count()),
                        'train.py']
                       + param + shared_params + resume_param, check=True)

        # Explicit raise (not `assert`) so the check survives `python -O`
        if not os.path.isfile(best_model_file):
            raise AssertionError('ERROR: The training did not produce model_best.pth.tar, ' +
                                 'You might need to adjust learning parameters.')

        print('Seems training finished, moving trained models and log directory to', result_dir)
        os.makedirs(result_dir, exist_ok=True)
        # Moving the checkpoint away also ensures the next tag starts fresh
        # instead of resuming from this run's checkpoint.
        shutil.move(best_model_file, result_dir)
        shutil.move(checkpoint_file, result_dir)
        shutil.move('log', result_dir)