in biolm/run_classification.py:
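# The excerpt below assumes the usual module-level imports from the rest of
# the file (not shown here), roughly:
#   import os
#   import numpy as np
#   import torch
#   from torch import nn
#   from torch.utils.data import DataLoader, SequentialSampler
#   from tqdm import tqdm
# plus the file's own helpers: logger, load_and_cache_examples, compute_metrics.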
def evaluate(args, model, tokenizer, prefix=""):
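    """Evaluate `model` on the dev (or test) split of `args.task_name`,
    logging metrics and writing results and per-example predictions under
    `args.output_dir`."""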
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)
    results = {}
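
    # Multi-label targets are multi-hot vectors, so the loss is computed
    # manually with nn.BCEWithLogitsLoss; single-label tasks rely on the loss
    # returned by the model head itself.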
    if args.output_mode == 'multilabel_classification':
        loss_fn = nn.BCEWithLogitsLoss()
    else:
        loss_fn = None
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, examples = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True, return_examples=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                if args.output_mode == 'multilabel_classification':
                    inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                    modified_inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
                    if args.model_type != "distilbert":
                        modified_inputs["token_type_ids"] = (
                            batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                        )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    outputs = model(**modified_inputs)
                    logits = outputs[0]
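                    # `labels` were deliberately left out of the forward pass, so
                    # the head returns only logits; BCEWithLogitsLoss expects float
                    # targets, and .to(outputs[0]) casts the multi-hot labels to
                    # the logits' dtype and device.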
                    tmp_eval_loss = loss_fn(logits, inputs['labels'].to(outputs[0]))
                else:
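                    # Single-label heads compute the loss internally when `labels`
                    # are passed, so loss and logits both come back in `outputs`.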
                    inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                    if args.model_type != "distilbert":
                        inputs["token_type_ids"] = (
                            batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                        )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    outputs = model(**inputs)
                    tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
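
        # Convert raw logits into final predictions: argmax for single-label
        # classification, squeeze for regression, sigmoid probabilities for
        # multi-label (thresholded at 0.5 when the predictions are written out).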
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        elif args.output_mode == 'multilabel_classification':
            preds = torch.sigmoid(torch.tensor(preds)).numpy()

        result = compute_metrics(eval_task, preds, out_label_ids, examples)
        results.update(result)
        results['eval_loss'] = eval_loss
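
        # Write metrics to eval_results.txt (or test_results.txt when running
        # on the test split) and per-example predictions to a parallel
        # *_predictions.tsv.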
        output_eval_file = os.path.join(eval_output_dir, prefix, "test_results.txt" if args.do_test else "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        with open(output_eval_file.replace('_results.txt', '_predictions.tsv'), "w") as writer:
            for example, pred in zip(examples, preds):
                if args.output_mode == "classification":
                    pred_label = args.label_list[pred]
                elif args.output_mode == "regression":
                    pred_label = str(pred)
                elif args.output_mode == 'multilabel_classification':
                    pred_label = [args.label_list[ind] for ind, p in enumerate(pred) if p > 0.5]
                writer.write(f'{example.guid}\t{pred_label}\n')

    return results
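
# A minimal sketch of how evaluate() is typically driven (illustrative only;
# the actual argument parsing and model loading live elsewhere in this file).
# It assumes `args` carries the fields used above: task_name, output_dir,
# output_mode, model_type, device, n_gpu, local_rank, per_gpu_eval_batch_size,
# label_list, and do_test.
#
#     model.to(args.device)
#     results = evaluate(args, model, tokenizer, prefix="")
#     for key, value in results.items():
#         logger.info("final %s = %s", key, value)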