in training/BertFinetuneTraining.py [0:0]
def compute(highlighted_train_set_path, non_highlighted_train_set_path, validation_set_path, test_set_path,
            no_runs, train_size, boostrap_split, train_split, validation_split, unlabelled_split,
            results_folder, score_to_optimize, dim_target=2):
    """Grid-search a BERT fine-tuning baseline, then assess the best setting on the test set.

    The function works in two phases:

    1. Model selection: every (batch size, learning rate, epochs) combination
       is trained ``no_runs`` times and scored on the validation data; the
       combination with the highest average ``score_to_optimize`` wins.
    2. Model assessment: the winning combination is retrained ``no_runs``
       times and scored on the test set; the averaged scores are written as
       JSON into ``results_folder``.

    Args:
        highlighted_train_set_path: Path handed to ``BertFinetuneDataset``.
        non_highlighted_train_set_path: Path handed to ``BertFinetuneDataset``.
            Both training datasets are concatenated (highlighted first).
        validation_set_path: Path of an explicit validation dataset, or
            ``None`` to carve the validation data out of the concatenated
            training data using ``validation_split``.
        test_set_path: Path handed to ``BertFinetuneDataset`` for the test set.
        no_runs: Number of repetitions to average over (must be >= 1).
        train_size: Only used in the log message and the output file name.
        boostrap_split: Only used in the output file name.
        train_split: Indices (into the concatenated training data) used for
            training, as accepted by ``torch.utils.data.Subset``.
        validation_split: Indices used for validation when no explicit
            validation set is given; also stored in the results file.
        unlabelled_split: Accepted for interface compatibility; this baseline
            never reads the unlabelled data.
        results_folder: Directory for the JSON results (created if missing).
        score_to_optimize: One of ``'accuracy'``, ``'precision'``,
            ``'recall'``, ``'f1'``.
        dim_target: Forwarded to the model constructor as its second argument.

    Raises:
        ValueError: If ``no_runs`` is smaller than 1.
        KeyError: If ``score_to_optimize`` is not a known metric name.
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1')

    # Fail fast: both problems would otherwise only surface after at least one
    # complete (and expensive) round of training.
    if no_runs < 1:
        raise ValueError(f'no_runs must be at least 1, got {no_runs}')
    if score_to_optimize not in metric_names:
        # KeyError is kept because that is what the late dictionary lookup
        # raised for an unknown metric name.
        raise KeyError(f'score_to_optimize must be one of {metric_names}, got {score_to_optimize!r}')

    model_class = BertFinetune
    dataset_class = BertFinetuneDataset

    highlighted_train_set = dataset_class(highlighted_train_set_path)
    non_highlighted_train_set = dataset_class(non_highlighted_train_set_path)
    test_set = dataset_class(test_set_path)

    all_train_dataset = ConcatDataset((highlighted_train_set, non_highlighted_train_set))
    train_set = Subset(all_train_dataset, train_split)

    if validation_set_path is not None:
        # An explicit validation set makes validation_split irrelevant here.
        validation_set = dataset_class(validation_set_path)
    else:
        validation_set = Subset(all_train_dataset, validation_split)

    if torch.cuda.is_available():
        # Prefer the second GPU as before, but do not crash on single-GPU hosts.
        device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
        print('Using cuda')
    else:
        device = 'cpu'

    # ---------------------------------------------------------------- #
    # Model selection
    # ---------------------------------------------------------------- #
    best_vl_scores = dict.fromkeys(metric_names, 0)
    best_params = None

    # These are our hyper-parameters
    batch_sizes = [16]  # Cannot use more than this ow memory error on Spouse

    for batch_size in batch_sizes:
        for learning_rate in [2e-5, 3e-5, 5e-5]:
            for num_epochs in [10, 2, 4]:
                vl_scores = dict.fromkeys(metric_names, 0)

                for _ in range(no_runs):
                    print('Start training')
                    run_scores = _train_and_score(model_class, train_set, validation_set,
                                                  batch_size, learning_rate, num_epochs,
                                                  dim_target, device)
                    print('End of prediction')
                    for key in metric_names:
                        vl_scores[key] += run_scores[key]
                    # The helper's model/optimizer are already out of scope;
                    # hand the cached GPU memory back before the next run.
                    torch.cuda.empty_cache()

                # AVERAGE OVER RUNS
                for key in metric_names:
                    vl_scores[key] /= no_runs

                # Always accept the first configuration so that best_params is
                # defined even when every configuration scores exactly 0.
                if best_params is None or vl_scores[score_to_optimize] > best_vl_scores[score_to_optimize]:
                    print(f'Best on VL score: {vl_scores}')
                    best_vl_scores = dict(vl_scores)
                    best_params = deepcopy(
                        {'learning_rate': learning_rate,
                         'train_split': train_split, 'valid_split': validation_split,
                         'batch_size': batch_size, 'epochs': num_epochs,
                         })

    # ---------------------------------------------------------------- #
    # Model assessment with the selected hyper-parameters
    # ---------------------------------------------------------------- #
    test_scores = dict.fromkeys(metric_names, 0)

    for _ in range(no_runs):
        # Use the *selected* batch size, not whatever value the grid-search
        # loop variable happened to hold when it finished.
        run_scores = _train_and_score(model_class, train_set, test_set,
                                      best_params['batch_size'], best_params['learning_rate'],
                                      best_params['epochs'], dim_target, device)
        for key in metric_names:
            test_scores[key] += run_scores[key]
        torch.cuda.empty_cache()

    # AVERAGE OVER RUNS
    for key in metric_names:
        test_scores[key] /= no_runs

    te_scores = {
        'best_params': best_params,
        'best_vl_scores': best_vl_scores,
        'test_scores': test_scores,
    }

    print(f'Best VL scores found is {best_vl_scores}')
    print(f'Best TE scores found is {te_scores["test_scores"]}')
    print(f'End of model assessment for train size {train_size}, test results are {te_scores}')

    # exist_ok avoids the check-then-create race between concurrent jobs.
    os.makedirs(results_folder, exist_ok=True)

    results_file = Path(
        results_folder,
        f'BertBaseline_size_{train_size}_runs_{no_runs}_test_results_bootstrap_{boostrap_split}.json')
    with open(results_file, 'w') as f:
        json.dump(te_scores, f)


def _train_and_score(model_class, train_set, eval_set, batch_size, learning_rate, num_epochs,
                     dim_target, device):
    """Train one fresh model on ``train_set`` and return its scores on ``eval_set``.

    Returns a dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
    and ``'f1'``, each converted to a Python float. All large objects (model,
    optimizer, loaders) are local, so they are released when this returns.
    """
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              collate_fn=custom_collate, shuffle=True)
    eval_loader = DataLoader(eval_set, batch_size=batch_size,
                             collate_fn=custom_collate, shuffle=False)

    # NOTE(review): `input_size` is not defined in this block; it is expected
    # to exist at module level - confirm against the rest of the file.
    model = model_class(input_size, dim_target)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.)
    # gamma = decaying factor; gamma=1. leaves the learning rate unchanged
    scheduler = StepLR(optimizer, step_size=50, gamma=1.)  # Useless scheduler for now

    train_model(train_loader, model, optimizer, scheduler,
                max_epochs=num_epochs, device=device)

    accuracy, precision, recall, f1 = predict(eval_loader, model, device)
    return {'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1)}