# submitit/submitit_train_qa.py
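# NOTE: this excerpt assumes the helpers it references (ClusterConfig, Trainer,
# get_shared_folder, get_init_file, grid_parameters) are defined elsewhere in
# this file or its imports, along with the standard modules it uses:
# os, uuid, numpy as np, submitit, datetime.date, and collections.namedtuple.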
def grid_search(args):
cluster_cfg = ClusterConfig(dist_backend="nccl", dist_url="")
date_curr = date.today().strftime("%m-%d-%Y")
log_dir = os.path.join(args.output_dir, date_curr)
TrainerConfig = namedtuple("TrainerConfig", sorted(vars(args)))
train_cfg = TrainerConfig(**vars(args))
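    # Freezing the argparse namespace into a namedtuple gives an immutable
    # config that _replace() can cheaply fork once per grid position below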
# Create the executor
print("Create the submitit Executor (can take time on FB cluster)")
# Note that the folder will depend on the job_id, to easily track experiments
executor = submitit.AutoExecutor(folder=get_shared_folder() / "%j")
num_gpus_per_node = 8
executor.update_parameters(
mem_gb=400,
gpus_per_node=num_gpus_per_node,
        tasks_per_node=1,  # one task per node (not per GPU); the Trainer sets up the per-GPU workers itself
cpus_per_task=10,
nodes=1,
        timeout_min=60 * 72,  # 72-hour time limit
slurm_partition="learnfair",
slurm_signal_delay_s=120,
        slurm_constraint="volta32gb",
)
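    # Net effect: each job books one learnfair node (8x Volta 32GB, 400 GB RAM,
    # 10 CPUs for the single task) for up to 72 h; the 120 s signal delay gives
    # the job a window to checkpoint before SLURM kills it.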
# Launch one job per grid position
grid_meta = {
"num_train_epochs": (7, lambda val: f'epoch{val}'),
"learning_rate": ([2e-5, 5e-5, 3e-5], lambda val: f'lr{val}'),
"seed": ([42,5], lambda val: f'seed{val}'),
"rank_drop": (0, lambda val: f'rdrop{val}'),
"qa_drop": (0, lambda val: f'qadrop{val}'),
# "max_seq_len": (512, lambda val: f'c_len{val}'),
# "max_q_len": (100, lambda val: f'q_len{val}'),
"weight_decay": (0, lambda val: f'decay{val}'),
"num_q_per_gpu": (2, lambda val: f'qpergpu{val}'), # how many questions per gpu
"gradient_accumulation_steps": (8, lambda val: f'aggstep{val}'),
"max_grad_norm": (2, lambda val: f'clip{val}'),
"eval_period": (250, lambda val: f'evalper{val}'),
"predict_batch_size": (1024, lambda val: f'evalbsize{val}'),
"neg_num": (5, lambda val: f'negnum{val}'),
"warmup_ratio": ([0.1, 0.2], lambda val: f'warmup{val}'),
"use_adam": (True, lambda val: f'adam{val}'),
"sp_weight": ([0.05, 0.025], lambda val: f'spweight{val}'),
"shared_norm": (False, lambda val: f'sn{val}'),
}
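    # Scalar entries are held fixed; list entries are swept (assuming
    # grid_parameters expands lists via a Cartesian product, see the sketch
    # after this function). With the lists above, that is 3 learning rates
    # x 2 seeds x 2 warmup ratios x 2 sp_weights = 24 jobs.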
    grid = {k: v[0] for k, v in grid_meta.items()}      # value (scalar or list) per key
    save_key = {k: v[1] for k, v in grid_meta.items()}  # label formatter per key, used to build run names
hyper_parameters = list(grid_parameters(grid))
jobs = []
for i, grid_data in enumerate(hyper_parameters):
cluster_cfg = cluster_cfg._replace(dist_url=get_init_file().as_uri())
train_cfg = train_cfg._replace(**grid_data)
run_name = f"{train_cfg.prefix}"
for k, v in grid_data.items():
run_name += "-" + save_key[k](v)
train_cfg = train_cfg._replace(output_dir=os.path.join(log_dir, run_name))
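        # e.g. "<prefix>-epoch7-lr2e-05-seed42-..." (one label per key, in the
        # iteration order of grid_data), so each run's output_dir is self-describing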
        # Chronos requires a unique job name for each submission
executor.update_parameters(name=f"sweep_{i:02d}_{uuid.uuid4().hex}")
trainer = Trainer(train_cfg, cluster_cfg)
job = executor.submit(trainer)
jobs.append(job)
print(f"Run {i:02d} submitted with train cfg: {train_cfg}, cluster cfg: {cluster_cfg}")
print(f"Submitted jobs ids: {','.join([str(job.job_id) for job in jobs])}")
    # Block until each job's master task (task 0) returns its result
    results = [job.task(0).result() for job in jobs]
print(f"Jobs results: {results}")
best_job = np.argmax(results)
print(f"Best configuration: {hyper_parameters[best_job]} (val acc = {results[best_job]:.1%})")