in blink/biencoder/train_biencoder.py [0:0]
def main(params):
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                params["gradient_accumulation_steps"]
            )
        )

    # An effective batch size of `x`, when accumulating gradients across `y` batches,
    # is achieved with a per-step batch size of `z = x / y`.
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
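    # For example (hypothetical numbers): with train_batch_size=128 and
    # gradient_accumulation_steps=4, each forward/backward pass runs on
    # 128 // 4 = 32 examples, while each optimizer step still reflects
    # gradients accumulated over 128 examples.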
params["train_batch_size"] = (
params["train_batch_size"] // params["gradient_accumulation_steps"]
)
train_batch_size = params["train_batch_size"]
eval_batch_size = params["eval_batch_size"]
grad_acc_steps = params["gradient_accumulation_steps"]
# Fix the random seeds
seed = params["seed"]
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if reranker.n_gpu > 0:
torch.cuda.manual_seed_all(seed)
# Load train data
train_samples = utils.read_dataset("train", params["data_path"])
logger.info("Read %d train samples." % len(train_samples))
    train_data, train_tensor_data = data.process_mention_data(
        train_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    if params["shuffle"]:
        train_sampler = RandomSampler(train_tensor_data)
    else:
        train_sampler = SequentialSampler(train_tensor_data)

    train_dataloader = DataLoader(
        train_tensor_data, sampler=train_sampler, batch_size=train_batch_size
    )

    # Load eval data
    # TODO: reduce duplicated code here
    valid_samples = utils.read_dataset("valid", params["data_path"])
    logger.info("Read %d valid samples." % len(valid_samples))

    valid_data, valid_tensor_data = data.process_mention_data(
        valid_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    valid_sampler = SequentialSampler(valid_tensor_data)
    valid_dataloader = DataLoader(
        valid_tensor_data, sampler=valid_sampler, batch_size=eval_batch_size
    )

    # evaluate before training
    results = evaluate(
        reranker, valid_dataloader, params, device=device, logger=logger,
    )

    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(
        os.path.join(model_output_path, "training_params.txt"), str(params)
    )

    logger.info("Starting training")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}".format(device, n_gpu, False)
    )
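    # Optimizer over the encoder parameters, plus a learning-rate schedule whose
    # warmup and total steps are derived from the training-set size, batch size,
    # and number of epochs.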
    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data), logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = params["num_train_epochs"]
    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            context_input, candidate_input, _, _ = batch
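            # The bi-encoder scores each context against every candidate encoding in
            # the batch; the other candidates act as in-batch negatives, and the loss
            # is the cross-entropy of selecting each context's own gold candidate.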
            loss, _ = reranker(context_input, candidate_input)

            # if n_gpu > 1:
            #     loss = loss.mean()  # mean() to average on multi-gpu.

            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info(
                    "Step {} - epoch {} average loss: {}\n".format(
                        step,
                        epoch_idx,
                        tr_loss / (params["print_interval"] * grad_acc_steps),
                    )
                )
                tr_loss = 0

            loss.backward()
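            # Update parameters only every grad_acc_steps mini-batches, so gradient
            # clipping and the optimizer step operate on the full effective batch.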
            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), params["max_grad_norm"]
                )
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker, valid_dataloader, params, device=device, logger=logger,
                )
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine-tuned model *****")
        epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx)
        )
        utils.save_model(model, tokenizer, epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path, "eval_results.txt")
        results = evaluate(
            reranker, valid_dataloader, params, device=device, logger=logger,
        )
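        # Keep whichever is larger: the best score seen so far or this epoch's
        # normalized accuracy, and remember the epoch that produced it.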
        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
params["path_to_model"] = os.path.join(
model_output_path,
"epoch_{}".format(best_epoch_idx),
WEIGHTS_NAME,
)
reranker = load_biencoder(params)
utils.save_model(reranker.model, tokenizer, model_output_path)
if params["evaluate"]:
params["path_to_model"] = model_output_path
evaluate(params, logger=logger)