torchbenchmark/models/moco/__init__.py (91 lines of code) (raw):

#!/usr/bin/env python

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from argparse import Namespace
import random
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.models as models

from .moco.builder import MoCo
from .main_moco import adjust_learning_rate
from ...util.model import BenchmarkModel
from torchbenchmark.tasks import OTHER

cudnn.deterministic = False
cudnn.benchmark = True


class Model(BenchmarkModel):
    """Benchmark wrapper around MoCo wrapped in DistributedDataParallel.

    The model is fed random tensors of shape ``(batch_size, 3, 224, 224)``
    that are created once in ``__init__``; no real dataset is read.
    """

    task = OTHER.OTHER_TASKS

    # Original train batch size: 32
    # Paper and code uses batch size of 256 for 8 GPUs.
    # Source: https://arxiv.org/pdf/1911.05722.pdf
    DEFAULT_TRAIN_BSIZE = 32
    DEFAULT_EVAL_BSIZE = 32

    def __init__(self, test, device, jit=False, batch_size=None, extra_args=None):
        """Build the MoCo model, loss, optimizer and synthetic inputs.

        Args:
            test: Forwarded unchanged to ``BenchmarkModel.__init__``.
            device: Target device. ``"cpu"`` is rejected.
            jit: Forwarded unchanged to ``BenchmarkModel.__init__``.
            batch_size: Forwarded unchanged to ``BenchmarkModel.__init__``.
            extra_args: Extra arguments forwarded to
                ``BenchmarkModel.__init__``; ``None`` is treated as ``[]``.

        Raises:
            NotImplementedError: If ``device`` is ``"cpu"``.
        """
        # Avoid sharing one mutable default list between instances.
        if extra_args is None:
            extra_args = []
        super().__init__(test=test, device=device, jit=jit,
                         batch_size=batch_size, extra_args=extra_args)

        # Reject CPU before touching the NCCL process group so that an
        # unsupported request has no distributed side effects.
        if device == "cpu":
            raise NotImplementedError("DistributedDataParallel/allgather requires cuda")

        self.opt = Namespace(**{
            'arch': 'resnet50',
            'epochs': 2,
            'start_epoch': 0,
            'lr': 0.03,
            'schedule': [120, 160],
            'momentum': 0.9,
            'weight_decay': 1e-4,
            'gpu': None,
            'moco_dim': 128,
            'moco_k': 32000,
            'moco_m': 0.999,
            'moco_t': 0.07,
            'mlp': False,
            'aug_plus': False,
            'cos': False,
            'fake_data': True,
            'distributed': True,
        })

        # Only create the default process group when none exists yet. An
        # explicit check (instead of swallowing RuntimeError) lets genuine
        # initialization failures surface here rather than later in DDP.
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl',
                                    init_method='tcp://localhost:10001',
                                    world_size=1, rank=0)

        self.model = MoCo(
            models.__dict__[self.opt.arch],
            self.opt.moco_dim,
            self.opt.moco_k,
            self.opt.moco_m,
            self.opt.moco_t,
            self.opt.mlp)
        self.model.to(self.device)
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[0])

        # Define loss function (criterion) and optimizer
        self.criterion = nn.CrossEntropyLoss().to(self.device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), self.opt.lr,
                                         momentum=self.opt.momentum,
                                         weight_decay=self.opt.weight_decay)

        # Four random image batches, already placed on the target device.
        batches = [
            torch.randn(self.batch_size, 3, 224, 224).to(self.device)
            for _ in range(4)
        ]

        def collate_train_fn(data):
            # ``data`` is a one-element list holding the dataset index; each
            # index selects one (query, key) pair of pre-built batches.
            ind = data[0]
            return [batches[2 * ind], batches[2 * ind + 1]], 0

        # The loader iterates over indices 0 and 1, yielding two pairs.
        # No extra device transfer is needed: the batches above already live
        # on ``self.device``.
        self.example_inputs = torch.utils.data.DataLoader(
            range(2), collate_fn=collate_train_fn)

    def get_module(self):
        """Return the model and one example input pair.

        Recommended

        Returns model, example_inputs

        model should be torchscript model if self.jit is True.
        Both model and example_inputs should be on self.device properly.
        `model(*example_inputs)` should execute one step of model forward.

        The returned inputs are the ``(im_q, im_k)`` tuple taken from the
        last item produced by ``self.example_inputs``.
        """
        images = []
        for pair, _ in self.example_inputs:
            images = (pair[0], pair[1])
        return self.model, images

    def train(self, niter=1):
        """Run ``niter`` training epochs over the synthetic inputs.

        Recommended

        Runs training on model for `niter` times. When `niter` is left
        to its default value, it should run for at most two minutes, and be
        representative of the performance of a traditional training loop.
        One iteration should be sufficient to warm up the model for the
        purpose of profiling.

        Avoid unnecessary benchmark noise by keeping any tensor creation,
        memcopy operations in __init__.

        Leave warmup to the caller (e.g. don't do it inside)
        """
        self.model.train()
        for epoch in range(niter):
            adjust_learning_rate(self.optimizer, epoch, self.opt)
            for images, _ in self.example_inputs:
                # compute output
                output, target = self.model(im_q=images[0], im_k=images[1])
                loss = self.criterion(output, target)

                # compute gradient and do SGD step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

    def eval(self, niter=1) -> Tuple[torch.Tensor]:
        """Run ``niter`` forward passes over the synthetic inputs.

        Recommended

        Run evaluation on model for `niter` inputs. One iteration should be
        sufficient to warm up the model for the purpose of profiling.
        In most cases this can use the `get_module` API but in some cases
        libraries do not have a single Module object used for inference.
        In these case, you can write a custom eval function.

        Avoid unnecessary benchmark noise by keeping any tensor creation,
        memcopy operations in __init__.

        Leave warmup to the caller (e.g. don't do it inside)

        Returns:
            The output of the last forward call. ``niter`` must be at least
            1, otherwise no output exists to return.
        """
        for _ in range(niter):
            for images, _ in self.example_inputs:
                out = self.model(im_q=images[0], im_k=images[1])
        return out