in code/train_deploy.py [0:0]
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - %s", is_distributed)
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - %d", args.num_gpus)
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ["WORLD_SIZE"] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ["RANK"] = str(host_rank)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info(
            "Initialized the distributed environment: '%s' backend on %d nodes. "
            "Current host rank is %d. Number of gpus: %d",
            args.backend, dist.get_world_size(),
            dist.get_rank(), args.num_gpus
        )

    # Set the seed for generating random numbers.
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

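    # Build the training and test data loaders from the configured data directories.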
    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed)
    test_loader = _get_test_data_loader(args.test_batch_size, args.test)

    logger.debug(
        "Processes {}/{} ({:.0f}%) of train data".format(
            len(train_loader.sampler),
            len(train_loader.dataset),
            100.0 * len(train_loader.sampler) / len(train_loader.dataset),
        )
    )
    logger.debug(
        "Processes {}/{} ({:.0f}%) of test data".format(
            len(test_loader.sampler),
            len(test_loader.dataset),
            100.0 * len(test_loader.sampler) / len(test_loader.dataset),
        )
    )

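    # Load the pretrained bert-base-uncased weights with a sequence-classification head on top.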
logger.info("Starting BertForSequenceClassification\n")
model = BertForSequenceClassification.from_pretrained(
"bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
num_labels=args.num_labels, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
)
model = model.to(device)
    if is_distributed and use_cuda:
        # Multi-machine multi-GPU case.
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # Single-machine multi-GPU case, or CPU training (single- or multi-machine).
        model = torch.nn.DataParallel(model)

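    # AdamW (Adam with decoupled weight decay) with standard BERT fine-tuning hyperparameters.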
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5.
        eps=1e-8,  # args.adam_epsilon - default is 1e-8.
    )
    logger.info("End of defining BertForSequenceClassification\n")

    for epoch in range(1, args.epochs + 1):
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_loader):
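            # Unpack the batch (input ids, attention mask, labels) and move each tensor to the device.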
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
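            # Clear previously accumulated gradients, then run the forward pass to get the loss.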
            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()
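            # Backward pass; clip the gradient norm to 1.0 to help prevent exploding gradients.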
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update the parameters based on the computed gradients, the learning rate, etc.
            optimizer.step()

            if step % args.log_interval == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        step * len(batch[0]),
                        len(train_loader.sampler),
                        100.0 * step / len(train_loader),
                        loss.item(),
                    )
                )

        logger.info("Average training loss: %f\n", total_loss / len(train_loader))
        test(model, test_loader, device)

    logger.info("Saving tuned model.")
    model_2_save = model.module if hasattr(model, "module") else model
    model_2_save.save_pretrained(save_directory=args.model_dir)