in code/run_glue.py [0:0]
def main(args):
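# Entry point: fine-tune and/or evaluate a BERT sequence classifier on one GLUE task.
# Device selection, argument validation, RNG seeding, and logging setup come first.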
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
device, n_gpu, args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
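# Scale the per-step batch size down so gradient accumulation reproduces the requested effective batch size.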
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.do_train:
logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w'))
else:
logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w'))
logger.info(args)
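# Resolve the GLUE task: data processor, label list, output mode, evaluation metric, and tokenizer.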
task_name = args.task_name.lower()
if task_name not in PROCESSORS:
raise ValueError("Task not found: %s" % (task_name))
processor = PROCESSORS[task_name]()
label_list = processor.get_labels()
id2label = {i: label for i, label in enumerate(label_list)}
num_labels = len(label_list)
eval_metric = EVAL_METRICS[task_name]
tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case)
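# Build dev-set features and a dataloader up front (unless only test-set evaluation is requested)
# so the model can be scored on dev during training.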
if args.do_train or (not args.eval_test):
if task_name == "mnli":
eval_examples = processor.get_dev_examples(args.data_dir, eval_set=args.eval_set)
else:
eval_examples = processor.get_dev_examples(args.data_dir)
eval_features = convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name])
logger.info("***** Dev *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
if OUTPUT_MODES[task_name] == "classification":
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
elif OUTPUT_MODES[task_name] == "regression":
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
if args.fp16:
all_label_ids = all_label_ids.half()
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
eval_label_ids = all_label_ids
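# Prepare training data; the 'sorted' modes order examples by length to reduce padding, otherwise examples are shuffled once.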
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name])
if args.train_mode in ('sorted', 'random_sorted'):
train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask))
else:
random.shuffle(train_features)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
if OUTPUT_MODES[task_name] == "classification":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif OUTPUT_MODES[task_name] == "regression":
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
if args.fp16:
all_label_ids = all_label_ids.half()
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size, drop_last=True)
train_batches = list(train_dataloader)
eval_step = max(1, len(train_batches) // args.eval_per_epoch)
num_train_optimization_steps = \
len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
logger.info("***** Training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
best_result = None
lrs = [args.learning_rate] if args.learning_rate else \
[1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5]
for lr in lrs:
cache_dir = args.cache_dir if args.cache_dir else \
PYTORCH_PRETRAINED_BERT_CACHE
model = BertForSequenceClassification.from_pretrained(
args.model, cache_dir=cache_dir, num_labels=num_labels)
if args.fp16:
model.half()
model.to(device)
if n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer
if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer
if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
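# FP16 training uses Apex FusedAdam wrapped in FP16_Optimizer (warmup is applied manually in the training loop);
# otherwise BertAdam handles linear warmup internally.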
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex"
"to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=lr,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=lr,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
nb_tr_examples = 0
tr_loss = 0
start_time = time.time()
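# Epoch loop over the pre-built batches; the 'random' modes reshuffle batch order every epoch.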
for epoch in range(int(args.num_train_epochs)):
model.train()
logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
if args.train_mode in ('random', 'random_sorted'):
random.shuffle(train_batches)
for step, batch in enumerate(train_batches):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
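# Forward pass yields logits; use cross-entropy loss for classification and MSE for regression.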
logits = model(input_ids, segment_ids, input_mask, labels=None)
if OUTPUT_MODES[task_name] == "classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
elif OUTPUT_MODES[task_name] == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1), label_ids.view(-1))
if n_gpu > 1:
loss = loss.mean()
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
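# Update weights once every gradient_accumulation_steps mini-batches; in FP16 the warmup-adjusted learning rate is set on the param groups by hand.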
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
lr_this_step = lr * \
warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
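# Every eval_step batches: log the running loss, optionally evaluate on dev, and mark the model for saving when the metric improves (or always, if not evaluating).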
if (step + 1) % eval_step == 0:
logger.info('Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'.format(
epoch, step + 1, len(train_dataloader),
time.time() - start_time, tr_loss / nb_tr_steps))
save_model = False
if args.do_eval:
preds, result = evaluate(task_name, model, device,
eval_dataloader, eval_label_ids, num_labels)
model.train()
result['global_step'] = global_step
result['epoch'] = epoch
result['learning_rate'] = lr
result['batch_size'] = args.train_batch_size
logger.info("First 20 predictions:")
for pred, label in zip(preds[:20], eval_label_ids.numpy()[:20]):
if OUTPUT_MODES[task_name] == 'classification':
sign = u'\u2713' if pred == label else u'\u2718'
logger.info("pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign))
else:
logger.info("pred = %.4f, label = %.4f" % (pred, label))
if (best_result is None) or (result[eval_metric] > best_result[eval_metric]):
best_result = result
save_model = True
logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
(eval_metric, str(lr), epoch, result[eval_metric] * 100.0))
else:
save_model = True
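# Checkpoint: write model weights, config, and tokenizer vocabulary to output_dir; the best dev scores go to a separate eval file.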
if save_model:
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
if best_result:
output_eval_file = os.path.join(args.output_dir, EVAL_FILE)
with open(output_eval_file, "w") as writer:
for key in sorted(best_result.keys()):
writer.write("%s = %s\n" % (key, str(best_result[key])))
if args.do_eval:
if args.eval_test:
if task_name == "mnli":
eval_examples = processor.get_test_examples(args.data_dir, eval_set=args.eval_set)
else:
eval_examples = processor.get_test_examples(args.data_dir)
eval_features = convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer, OUTPUT_MODES[task_name])
logger.info("***** Test *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
eval_label_ids = None
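# Reload the model weights saved in output_dir for evaluation.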
model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
if args.fp16:
model.half()
model.to(device)
preds, result = evaluate(task_name, model, device, eval_dataloader, eval_label_ids, num_labels)
pred_file = os.path.join(args.output_dir, PRED_FILE)
with open(pred_file, "w") as f_out:
f_out.write("index\tprediction\n")
for i, pred in enumerate(preds):
if OUTPUT_MODES[task_name] == 'classification':
f_out.write("%d\t%s\n" % (i, id2label[pred]))
else:
f_out.write("%d\t%.6f\n" % (i, pred))
output_eval_file = os.path.join(args.output_dir, TEST_FILE)
with open(output_eval_file, "w") as writer:
for key in sorted(result.keys()):
writer.write("%s = %s\n" % (key, str(result[key])))