in metaicl/model.py [0:0]
def do_train(self, data, batch_size, num_training_steps, save_period, log_period,
             gradient_accumulation_steps=1, max_grad_norm=1.0):
    # Relies on module-level `import torch` and `import numpy as np` in metaicl/model.py.
    dataloader = data.get_dataloader(batch_size, is_training=True)
    # Number of trainable parameter tensors (not the total element count).
    n_trainable_params = len([param for param in self.model.parameters() if param.requires_grad])
    n_gpus = torch.cuda.device_count()
    self.logger.info("Training {} parameters on {} examples for {} steps using {} GPUs".format(
        n_trainable_params, len(data), num_training_steps, n_gpus))

    global_step = 0
    train_losses = []
    best_accuracy = -1
    stop_training = False

    # The epoch loop is only an upper bound; training ends once `global_step`
    # reaches `num_training_steps` or a NaN loss is encountered.
    for epoch in range(num_training_steps):
        for batch in dataloader:
            global_step += 1

            input_ids = batch[0].to(self.device)
            attention_mask = batch[1].to(self.device)
            token_type_ids = batch[2].to(self.device)
            labels = None if len(batch) == 3 else batch[3].to(self.device)

            loss = self.run_model(input_ids, attention_mask, token_type_ids, labels=labels)
            loss = loss.mean()

            if torch.isnan(loss).item():
                self.logger.info("Stop training because loss=%s" % (loss.data))
                stop_training = True
                break

            train_losses.append(loss.detach().cpu())

            if self.fp16:
                from apex import amp
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if global_step % gradient_accumulation_steps == 0:
                # We have accumulated enough gradients for one optimizer update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
                self.optimizer.step()
                if self.scheduler is not None:
                    self.scheduler.step()
                self.model.zero_grad()

            if global_step % log_period == 0:
                self.logger.info("local rank %d\tglobal step %d\ttrain loss %.2f" % (
                    self.local_rank, global_step, np.mean(train_losses)))
                train_losses = []

            if global_step % save_period == 0:
                self.save(global_step)

            if global_step == num_training_steps:
                break

        if stop_training or global_step == num_training_steps:
            break

    self.logger.info("Finish training")