PyTorchClassification/run_cct_training.py [45:84]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    assert len(tags) == len(params)

    ### The actual training
    for tag, param in zip(tags, params):
        print('Starting training of', tag)
        result_dir = get_result_dir(output_dir, tag)
        model_best = get_best_model_path(output_dir, tag)
        if os.path.isfile(model_best):
            print('Found existing trained model at {}, skipping the training of {}'.format(model_best, tag))
        else:
            # Check for checkpoint
            checkpoint_file = 'checkpoint.pth.tar'
            if os.path.isfile(checkpoint_file):
                resume_param = ['--resume', checkpoint_file]
            else:
                resume_param = []
            subprocess.run(['python', 
                             '-m', 'torch.distributed.launch', 
                             '--nproc_per_node={}'.format(torch.cuda.device_count()),
                             'train.py']
                           + param + shared_params + resume_param, check=True)
            assert os.path.isfile('model_best.pth.tar'), 'ERROR: The training did not produce model_best.pth.tar, ' + \
                                                         'You might need to adjust learning parameters.'
            print('Seems training finished, moving trained models and log directory to', result_dir)
            os.makedirs(result_dir, exist_ok=True)
            shutil.move('model_best.pth.tar', result_dir)
            shutil.move('checkpoint.pth.tar', result_dir)
            shutil.move('log', result_dir)


def get_result_dir(output_dir, tag):
    ''' Returns the directory, where we will store all models and logs after successful training '''
    return os.path.join(output_dir, tag)

def get_best_model_path(output_dir, tag):
    ''' Returns the path, where we will store the best model after successful training '''
    return os.path.join(get_result_dir(output_dir, tag), 'model_best.pth.tar')

if __name__ == '__main__':
    main()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



PyTorchClassification/run_training.py [141:180]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    assert len(tags) == len(params)

    ### The actual training
    for tag, param in zip(tags, params):
        print('Starting training of', tag)
        result_dir = get_result_dir(output_dir, tag)
        model_best = get_best_model_path(output_dir, tag)
        if os.path.isfile(model_best):
            print('Found existing trained model at {}, skipping the training of {}'.format(model_best, tag))
        else:
            # Check for checkpoint
            checkpoint_file = 'checkpoint.pth.tar'
            if os.path.isfile(checkpoint_file):
                resume_param = ['--resume', checkpoint_file]
            else:
                resume_param = []
            subprocess.run(['python', 
                             '-m', 'torch.distributed.launch', 
                             '--nproc_per_node={}'.format(torch.cuda.device_count()),
                             'train.py']
                           + param + shared_params + resume_param, check=True)
            assert os.path.isfile('model_best.pth.tar'), 'ERROR: The training did not produce model_best.pth.tar, ' + \
                                                         'You might need to adjust learning parameters.'
            print('Seems training finished, moving trained models and log directory to', result_dir)
            os.makedirs(result_dir, exist_ok=True)
            shutil.move('model_best.pth.tar', result_dir)
            shutil.move('checkpoint.pth.tar', result_dir)
            shutil.move('log', result_dir)


def get_result_dir(output_dir, tag):
    ''' Returns the directory, where we will store all models and logs after successful training '''
    return os.path.join(output_dir, tag)

def get_best_model_path(output_dir, tag):
    ''' Returns the path, where we will store the best model after successful training '''
    return os.path.join(get_result_dir(output_dir, tag), 'model_best.pth.tar')

if __name__ == '__main__':
    main()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



