# PyTorchClassification/run_snakes_training.py
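
# Module-level imports required by main(); in the full script they are assumed
# to sit at the top of the file.
import os
import shutil
import subprocess

import torch
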
def main():
    # Output directory for all the models after successful training
    output_dir = 'result/snakes'

    # Parameters shared across all runs, e.g. the image directory
    shared_params = ['--data_root', 'data',
                     '--train_file', 'train.json',
                     '--val_file', 'valid.json',
                     '--print_freq', '100',
                     '--label_smoothing', '0.15',
                     '--use_onevsall_loss']

    # Name tags for the different models that we will train
    tags = []
    # The run-specific parameters; the order must match the order of `tags`
    params = []

    # Preparing the training configurations:
    # for each model training, we define a tag and the corresponding parameters.
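
    # ResNeXt-101, stage 1: a short warm-up that trains only the final classifier
    # layer ('--train_logits_only') at 224px, so the freshly initialized head does
    # not disturb the (presumably pretrained) backbone.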
    tags.append('resnext_224_init')
    params.append(['--model_type', 'resnext101',
                   '--image_size', '224',
                   '--epochs', '4',
                   '--epoch_decay', '2',
                   '--lr_decay', '0.5',
                   '--lr', '0.01',
                   '--warm_up_iterations', '0',
                   '--train_logits_only',
                   '--batch_size', '128',
                   '--fp16'])
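
    # ResNeXt-101, stage 2: fine-tune the full network at 224px, resuming from
    # the best model of the warm-up run above.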
    tags.append('resnext_224')
    params.append(['--model_type', 'resnext101',
                   '--image_size', '224',
                   '--epochs', '50',
                   '--epoch_decay', '5',
                   '--lr_decay', '0.5',
                   '--lr', '0.01',
                   '--warm_up_iterations', '50',
                   '--batch_size', '128',
                   '--fp16',
                   '--resume', get_best_model_path(output_dir, 'resnext_224_init')])
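
    # ResNeXt-101, stage 3: continue fine-tuning at 448px input with a lower
    # learning rate and a smaller batch size, resuming from the 224px model.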
    tags.append('resnext_448')
    params.append(['--model_type', 'resnext101',
                   '--image_size', '448',
                   '--start_epoch', '0',
                   '--epochs', '30',
                   '--epoch_decay', '5',
                   '--lr_decay', '0.7',
                   '--lr', '0.005',
                   '--warm_up_iterations', '10',
                   '--batch_size', '32',
                   '--fp16',
                   '--resume', get_best_model_path(output_dir, 'resnext_224')])

    # Inception-v4: an analogous schedule, starting with a logits-only warm-up at 299px
    tags.append('inc4_299_init')
    params.append(['--model_type', 'inceptionv4',
                   '--image_size', '299',
                   '--epochs', '4',
                   '--epoch_decay', '2',
                   '--lr_decay', '0.94',
                   '--lr', '0.05',
                   '--warm_up_iterations', '0',
                   '--train_logits_only',
                   '--batch_size', '128',
                   '--fp16'])

    tags.append('inc4_299')
    params.append(['--model_type', 'inceptionv4',
                   '--image_size', '299',
                   '--epochs', '25',
                   '--epoch_decay', '4',
                   '--lr_decay', '0.94',
                   '--lr', '0.005',
                   '--warm_up_iterations', '10',
                   '--batch_size', '128',
                   '--fp16',
                   '--resume', get_best_model_path(output_dir, 'inc4_299_init')])
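
    # Final Inception-v4 stage: fine-tune at 488px input with a smaller batch
    # size, resuming from the best 299px model.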
    tags.append('inc4_488')
    params.append(['--model_type', 'inceptionv4',
                   '--image_size', '488',
                   '--epochs', '50',
                   '--epoch_decay', '4',
                   '--lr_decay', '0.94',
                   '--lr', '0.005',
                   '--warm_up_iterations', '20',
                   '--batch_size', '32',
                   '--fp16',
                   '--resume', get_best_model_path(output_dir, 'inc4_299')])

    # Checking if everything is set up properly
    assert len(tags) == len(params)

    # The actual training
    for tag, param in zip(tags, params):
        print('Starting training of', tag)
        result_dir = get_result_dir(output_dir, tag)
        model_best = get_best_model_path(output_dir, tag)
        if os.path.isfile(model_best):
            print('Found existing trained model at {}, skipping the training of {}'.format(
                model_best, tag))
        else:
            # Check for a checkpoint of a previously interrupted run
            checkpoint_file = 'checkpoint.pth.tar'
            if os.path.isfile(checkpoint_file):
                resume_param = ['--resume', checkpoint_file]
            else:
                resume_param = []
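            # Launch train.py through torch.distributed.launch with one worker
            # process per available GPU.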
            subprocess.run(['python',
                            '-m', 'torch.distributed.launch',
                            '--nproc_per_node={}'.format(torch.cuda.device_count()),
                            'train.py']
                           + param + shared_params + resume_param, check=True)
            assert os.path.isfile('model_best.pth.tar'), (
                'ERROR: The training did not produce model_best.pth.tar. '
                'You might need to adjust the learning parameters.')
            print('Training seems to have finished, moving the trained model '
                  'and the log directory to', result_dir)
            os.makedirs(result_dir, exist_ok=True)
            shutil.move('model_best.pth.tar', result_dir)
            shutil.move('checkpoint.pth.tar', result_dir)
            shutil.move('log', result_dir)
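

# The two helpers used above are defined elsewhere in the full script. The
# sketch below is an assumption, merely consistent with how main() uses them:
# each run gets its own directory under output_dir, and the best model is
# expected at <result_dir>/model_best.pth.tar after the files are moved there.
def get_result_dir(output_dir, tag):
    # One result directory per training run, named after its tag
    return os.path.join(output_dir, tag)


def get_best_model_path(output_dir, tag):
    # Location of the best checkpoint once the run for `tag` has finished
    return os.path.join(get_result_dir(output_dir, tag), 'model_best.pth.tar')


if __name__ == '__main__':
    main()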