in PyTorchClassification/train.py [0:0]
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('--data_root', required=True,
        type=str, metavar='DATASET_ROOT', help='Path to the root directory of the dataset.')
    parser.add_argument('--model_type', default=ModelType.resnext101,
        metavar='ARCH', type=ModelType.from_string, choices=list(ModelType),
        help='model architecture: ' + ' | '.join([m.name for m in ModelType]) +
        ' (default: resnext101)')
    parser.add_argument('--image_size', default=224, nargs='+',
        type=int, metavar='RESOLUTION', help='The side length of the CNN input image ' + \
        '(default: 224). For ensembles, provide one resolution for each network.')
    parser.add_argument('--epochs', default=200,
        type=int, metavar='N', help='Number of total epochs to run.')
    parser.add_argument('--start_epoch', default=None,
        type=int, metavar='N', help='Override starting epoch, useful on restarts.')
    parser.add_argument('--batch_size', default=32,
        type=int, metavar='N', help='mini-batch size (default: 32), which is the number of ' + \
        'images per GPU in a single forward / backward pass.')
    parser.add_argument('--lr', '--learning-rate', default=0.0045,
        type=float, metavar='LR', help='initial learning rate (default: 0.0045). The learning rate ' + \
        'is scaled linearly with the number of GPUs as the batch size also scales this way.')
    parser.add_argument('--lr_decay', default=0.94,
        type=float, metavar='LR_DECAY', help='The factor by which the learning rate is reduced ' + \
        'every --epoch_decay epochs (default: 0.94)')
    parser.add_argument('--epoch_decay', default=4,
        type=int, metavar='EPOCH_DECAY', help='The number of epochs after which the learning rate ' + \
        'is decayed by a factor of --lr_decay (default: 4)')
    parser.add_argument('--momentum', default=0.9,
        type=float, metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--weight_decay', default=1e-4,
        type=float, metavar='W', help='weight decay (default: 1e-4)')
    parser.add_argument('--label_smoothing', default=0.15,
        type=float, metavar='SMOOTHING', help='Replaces the hard one-hot target in training by ' + \
        'a probability distribution, which has 1-SMOOTHING probability on the ground-truth class ' + \
        'and a taxonomically aware amount of probability across all other classes (default: 0.15)')
    parser.add_argument('--resume', default=None, nargs='+',
        type=str, metavar='PATH', help='Path to a checkpoint to resume from (default: none). Can ' + \
        'be multiple checkpoints when training an ensemble from two different checkpoints.')
    parser.add_argument('--warm_up_iterations', default=1600,
        type=int, metavar='ITERATIONS', help='Performs this number of iterations at the beginning ' + \
        'of training with a very low learning rate in order to avoid accuracy drops when ' + \
        'resuming from a checkpoint (default: 1600)')
    parser.add_argument('--use_onevsall_loss', action='store_true',
        help='If set, uses a binary cross-entropy loss for each element instead of multi-class ' + \
        'cross-entropy. Requires label smoothing > 0.')
    parser.add_argument('--bg_classes', default=None, nargs='+',
        type=int, metavar='BG_CLASS', help='Allows providing the class IDs for background classes. ' + \
        'Requires label smoothing > 0 and one-vs-all loss to be set.')
    parser.add_argument('--train_logits_only', action='store_true',
        help='If set, only the last linear layer is trained.')
    parser.add_argument('--reset_classifier', action='store_true',
        help='If set, reinitializes the classifier of the network before training.')
    parser.add_argument('--workers', default=8,
        type=int, metavar='N', help='number of data loading workers (default: 8). If 0, the ' + \
        'data loading and preprocessing is done in the main thread on demand.')
    parser.add_argument('--print_freq', default=1000,
        type=int, metavar='N', help='Frequency of printing out stats in console (default: 1000)')
    parser.add_argument('--top_prec', default=1,
        type=int, metavar='TOPK', help='Uses the TOPK accuracy to select the best model (default: 1)')
    parser.add_argument('--evaluate', action='store_true',
        help='Only evaluates the model on the validation set')
    parser.add_argument('--multi_crop', action='store_true',
        help='Whether to use multi-crop during evaluation. We use 12 crops in total.')
    parser.add_argument('--val_bounding_boxes', default=None,
        type=str, metavar='BBOX_FILE', help='Path to bounding box file generated by our RCNN code ' + \
        '(default: None). These boxes will be used as the first crop during validation.')
    parser.add_argument('--save_preds', action='store_true',
        help='If --evaluate and --save_preds is set, we will write out the predictions ' + \
        'for each validation image in the Kaggle format. For Kaggle submission, you want to ' + \
        'adjust --val_file to the json containing the test data instead of validation data.')
    parser.add_argument('--save_preds_file', default='test_preds.csv',
        type=str, metavar='FILEPATH', help='If --evaluate and --save_preds is set, then this is the ' + \
        'file name to store the predictions to (default: test_preds.csv)')
    parser.add_argument('--save_conf_matrix', action='store_true',
        help='If --evaluate and --save_conf_matrix is set, we will write out the confusion ' + \
        'matrix computed from the validation predictions.')
    parser.add_argument('--annotation_format', default='2017',
        type=str, dest='year', metavar='VERSION', help='Version of the dataset annotation format, ' + \
        '2017 or 2018 (default: 2017).')
    parser.add_argument('--train_file', default='trainval2017.json',
        type=str, metavar='TRAIN_FILE', help='Name of the json file containing the training annotation ' + \
        '(default: trainval2017.json). Should be located within the dataset root directory.')
    parser.add_argument('--val_file', default='minival2017.json',
        type=str, metavar='VAL_FILE', help='Name of the json file containing the validation annotation ' + \
        '(default: minival2017.json). Should be located within the dataset root directory.')
    # For multi-GPU and half-precision training
    parser.add_argument('--fp16', action='store_true',
        help='Run model in fp16 mode.')
    parser.add_argument('--static-loss-scale', type=float, default=1,
        help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
    parser.add_argument('--dynamic-loss-scale', action='store_true',
        help='Use dynamic loss scaling. If supplied, this argument supersedes ' +
        '--static-loss-scale.')
    parser.add_argument('--prof', dest='prof', action='store_true',
        help='Only runs 10 iterations in each epoch for testing and profiling.')
    parser.add_argument('--dist-url', default='env://', type=str,
        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str,
        help='distributed backend')
    parser.add_argument('--world-size', default=1, type=int,
        help='Number of GPUs to use. Can either be manually set ' +
        'or automatically set by using \'python -m multiproc\'.')
    parser.add_argument("--local_rank", default=0, type=int,
        help='Used for multi-process training. Can either be manually set ' +
        'or automatically set by using \'python -m multiproc\'.')
    parser.add_argument('--sync_bn', action='store_true',
        help='Enables the synchronization of the BN computations across GPUs.')
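    # Example single-GPU invocation (hypothetical paths, for illustration only; all flags are defined above):
    #   python train.py --data_root /path/to/dataset --model_type resnext101 --image_size 224 \
    #       --batch_size 32 --lr 0.0045 --epochs 200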
    global args
    args = parser.parse_args()
    assert args.bg_classes is None or (args.label_smoothing > 0 and args.use_onevsall_loss), \
        'The use of a background class requires label_smoothing > 0 and --use_onevsall_loss'

    # Prepare logging
    log_dir = './log/{}_{}_gpu{}'.format(args.model_type.name, datetime.datetime.now().strftime('%b%d_%H-%M-%S'), args.local_rank)
    # The summary file will contain most of the print outputs for convenience
    global log_summary_file
    log_summary_file = os.path.join(log_dir, 'summary.txt')
    # The object for logging tensorboard events
    global writer
    writer = tensorboardX.SummaryWriter(log_dir=log_dir)

    # Copy all Python files to the log directory
    log_py_dir = os.path.join(log_dir, 'code')
    os.makedirs(log_py_dir)
    for fi in glob.glob('./*.py'):
        shutil.copyfile(fi, os.path.join(log_py_dir, fi))
    # Preparations for using multiple GPUs and half precision, see https://github.com/NVIDIA/apex
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.gpu = 0
    args.world_size = 1
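    # With --dist-url left at 'env://', torch.distributed reads MASTER_ADDR, MASTER_PORT, RANK and
    # WORLD_SIZE from the environment; these are typically set by the launcher referenced in the help
    # text above ('python -m multiproc' or torch.distributed.launch).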
    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url)
        args.world_size = torch.distributed.get_world_size()

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    best_prec1 = 0
    best_prec3 = 0
    best_prec5 = 0
    # Data loading code
    train_dataset = data_loader.JSONDataset(args.data_root,
        os.path.join(args.data_root, args.train_file),
        args.image_size,
        is_train=True,
        dataFormat2017=(args.year == "2017"),
        percentUse=0.1 if args.prof else 100,
        label_smoothing=args.label_smoothing,
        bg_classes=args.bg_classes)
    # We use balanced sampling of all classes, i.e. each class has the same probability of being present in a batch
    if args.distributed:
        train_sampler = data_loader.DistributedBalancedSampler(train_dataset)
    else:
        train_sampler = data_loader.DistributedBalancedSampler(train_dataset, 1, 0)
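        # In the single-process case the same balanced sampler is reused; the positional arguments
        # 1 and 0 above presumably correspond to a world size of 1 and rank 0 (assumption; the sampler
        # is defined in the data_loader module, which is not shown here).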
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
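    # A sampler is always constructed above, so shuffle=(train_sampler is None) evaluates to False here;
    # PyTorch's DataLoader does not allow a custom sampler to be combined with shuffle=True anyway.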
    num_classes = train_dataset.get_num_classes()

    # Write out the list of classnames
    classname_list = [train_dataset.classnames[cid] for cid in range(num_classes)]
    with open(os.path.join(log_dir, 'classnames.txt'), 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(classname_list))

    val_dataset = data_loader.JSONDataset(args.data_root,
        os.path.join(args.data_root, args.val_file),
        args.image_size,
        is_train=False,
        dataFormat2017=(args.year == "2017"),
        percentUse=0.1 if args.prof else 100,
        multi_crop=args.multi_crop,
        bbox_predictions=args.val_bounding_boxes if args.val_bounding_boxes else None,
        label_smoothing=args.label_smoothing)
    val_loader = torch.utils.data.DataLoader(val_dataset,
        batch_size=8 if args.evaluate else args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
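    # The reduced batch size of 8 during --evaluate presumably keeps memory usage manageable when
    # --multi_crop expands every image into 12 crops (assumption; the value is not explained here).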
print_log("num classes: %d" % (num_classes))
# build model
model = ClassificationModel(args.resume, args.image_size, True, args.model_type, train_dataset.classnames)
if args.reset_classifier:
model.model.last_linear = nn.Linear(model.model.last_linear.in_features, num_classes).cuda()
# define loss function (criterion) and optimizer
if args.use_onevsall_loss:
assert args.label_smoothing > 0, 'One-vs-all loss requires label smoothing larger than 0.'
criterion = criterions.BCEWithLogitsLoss2().cuda()
elif args.label_smoothing > 0:
criterion = criterions.KLDivLoss2().cuda()
else:
criterion = nn.CrossEntropyLoss().cuda()
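    # The smoothed / one-vs-all targets are probability distributions rather than class indices, which is
    # presumably why the custom criteria from the criterions module are used in those cases, while plain
    # nn.CrossEntropyLoss is kept for the hard-label case.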
    # Define learnable parameters and their learning rate multipliers
    if hasattr(model, 'model') and hasattr(model.model, 'last_linear'):
        # First collect the parameters of the base model and the last linear layer
        base_params = list(set(filter(lambda p: p.requires_grad, model.parameters()))
                           - set(filter(lambda p: p.requires_grad, model.model.last_linear.parameters())))
        classifier_params = list(filter(lambda p: p.requires_grad, model.model.last_linear.parameters()))
        # If we train only the last layer, it is usually safe to use a ten times larger learning rate than usual
        if args.train_logits_only:
            trainable_params = [dict(params=classifier_params, lr_mult=10)]
            print_log("Increasing learning rate of classifier by a factor of 10, because only the classifier is trained.")
        # If we reset the classifier, then increase the learning rate of the last linear layer by a factor of 10.
        # This is the case if we do not resume training, because then we start from a pre-trained ImageNet model and
        # automatically reset the classifier to have the appropriate number of output elements.
        # This is also the case when we pass the flag --reset_classifier.
        elif not args.resume or args.reset_classifier:
            trainable_params = [dict(params=base_params, lr_mult=1),
                                dict(params=classifier_params, lr_mult=10)]
            print_log("Increasing learning rate of classifier by a factor of 10, because the classifier is re-initialized.")
        # Otherwise, just use the same learning rate everywhere
        else:
            trainable_params = [dict(params=list(filter(lambda p: p.requires_grad, model.parameters())), lr_mult=1)]
    elif args.train_logits_only:
        raise Exception('Could not find the final linear layer, hence the parameter --train_logits_only cannot be used.')
    else:
        trainable_params = [dict(params=list(filter(lambda p: p.requires_grad, model.parameters())), lr_mult=1)]
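    # lr_mult is not a torch.optim.SGD option; extra keys in a parameter group are simply stored, so the
    # multipliers are presumably applied by adjust_learning_rate() further down in this file (assumption).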
print_log("training %d params" % len(trainable_params))
optimizer = torch.optim.SGD(trainable_params, args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
if args.fp16:
optimizer = FP16_Optimizer(optimizer,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale)
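        # FP16_Optimizer (from NVIDIA apex's fp16_utils, assumed to be imported at the top of this file)
        # maintains FP32 master weights and applies the static or dynamic loss scaling configured above.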
    # Load pretrained model
    ckpt_epoch = None
    # Update the tensorboard log with the old topk values and restore the optimizer.
    # This only makes sense if we continue from a single checkpoint file,
    # as we will always start a new model when there are more files.
    if args.resume and len(args.resume) == 1 and not args.reset_classifier:
        best_prec1, best_prec3, best_prec5, ckpt_epoch = model.loadOptimizer(optimizer)
        writer.add_scalars('validation/topk', {'top1': best_prec1,
                                               'top3': best_prec3,
                                               'top5': best_prec5},
                           len(train_loader) * ckpt_epoch)
    else:
        writer.add_scalars('validation/topk', {'top1': 0, 'top3': 0, 'top5': 0},
                           len(train_loader) * (args.start_epoch if args.start_epoch is not None else 0))

    # Set starting epoch
    if args.start_epoch is not None:
        start_epoch = args.start_epoch
    elif ckpt_epoch:
        start_epoch = ckpt_epoch
    else:
        start_epoch = 0
    if args.sync_bn:
        print("Enabling the synchronization of BN across GPUs")
        model = apex.parallel.convert_syncbn_model(model)
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)
    cudnn.benchmark = True
    if args.evaluate:
        # Write predictions to file
        if args.save_preds:
            prec1, prec3, prec5, preds, im_ids = validate(val_loader, model, criterion, 0, True, True)
            with open(args.save_preds_file, 'w') as opfile:
                opfile.write('id,predicted\n')
                for ii in range(len(im_ids)):
                    opfile.write(str(im_ids[ii]) + ',' + ' '.join(str(x) for x in preds[ii, :]) + '\n')
        else:
            prec1, prec3, prec5 = validate(val_loader, model, criterion, 0, True)
        if args.save_conf_matrix:
            test_labels = np.array(val_dataset.get_labels())
            unique_labels = np.unique(val_dataset.get_labels() + train_dataset.get_labels())
            # As we turned shuffle off, we can just compare the plain labels
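            # preds (and hence the first-crop predictions used below) is only defined in the --save_preds
            # branch above, so --save_conf_matrix appears to require --save_preds to be set as well.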
            import sklearn.metrics
            cm = sklearn.metrics.confusion_matrix(test_labels, preds[:, 0], labels=unique_labels)
            cm = cm / cm.sum(axis=1, keepdims=True)
            np.savetxt('conf.csv', cm, fmt='%.3f', delimiter=',')
        return
    def to_md(code):
        return str(code).replace('\n', '\n\t')  # '```python\n' + str(code) + '\n```'

    writer.add_text('args/instance', to_md(args.__dict__), start_epoch * len(train_loader))
    print_log('Arguments / configuration: \n' + to_md(args.__dict__))
    writer.add_text('train.py', to_md(open(__file__, 'rt').read()), start_epoch * len(train_loader))
    for epoch in range(start_epoch, args.epochs):
        writer.add_scalar('epoch', epoch, len(train_loader) * epoch)
        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)
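        # adjust_learning_rate() is not shown here; based on the argument help above it presumably scales
        # --lr with the number of GPUs and multiplies it by --lr_decay every --epoch_decay epochs,
        # i.e. roughly lr * world_size * lr_decay ** (epoch // epoch_decay), times each group's lr_mult.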
        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, None)

        # Evaluate on the remaining 10% validation set
        prec1, prec3, prec5 = validate(val_loader, model, criterion, epoch, len(train_loader) * (epoch + 1), False)

        # Remember the best precision and save a checkpoint
        if args.top_prec == 1:
            is_best = prec1 > best_prec1
        elif args.top_prec == 3:
            is_best = prec3 > best_prec3
        else:
            is_best = prec5 > best_prec5
        best_prec1 = max(prec1, best_prec1)
        best_prec3 = max(prec3, best_prec3)
        best_prec5 = max(prec5, best_prec5)
        if args.local_rank == 0:
            save_model({
                'epoch': epoch + 1,
                'args': args,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'best_prec3': best_prec3,
                'best_prec5': best_prec5,
                'optimizer': optimizer.state_dict(),
                'classnames': train_dataset.classnames,
                'num_classes': num_classes,
                'model_type': args.model_type,
            }, is_best)