in run_with_submitit.py [0:0]
def main():
    """Submit one SLURM job per hyper-parameter combination in ``grid``.

    For every combination, a dedicated job folder is created under the
    shared folder (training resumes from ``checkpoint.pth`` when one is
    already there), a submitit ``AutoExecutor`` is configured with the
    resources requested on the command line, and a ``Trainer`` built from
    the combination's arguments is submitted to SLURM.
    """
    args = parse_args()
    shared_folder = get_shared_folder()
    copy_py(shared_folder)
    os.chdir(shared_folder)

    # Hyper-parameter grid: one SLURM job is launched per combination.
    grid = {
        'model': ['convit_base'],
    }

    def dict_product(d):
        """Yield each combination of ``d``'s value lists as a {key: value} dict."""
        keys = d.keys()
        for element in itertools.product(*d.values()):
            yield dict(zip(keys, element))

    for params in dict_product(grid):
        # Job name encodes the hyper-parameters, e.g. "model_convit_base".
        name = '_'.join('{}_{}'.format(k, v) for k, v in params.items())
        args.shared_dir = shared_folder
        args.job_dir = shared_folder / name

        # Resume automatically when a previous run left a checkpoint behind.
        checkpoint = args.job_dir / 'checkpoint.pth'
        if checkpoint.exists():
            args.resume = checkpoint

        # Note that the folder will depend on the job_id, to easily track experiments
        executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

        num_gpus_per_node = args.ngpus
        nodes = args.nodes
        timeout_min = args.timeout
        partition = args.partition
        # NOTE(review): this override forces the volta32gb constraint for every
        # job, regardless of the CLI flag — confirm that is intended.
        args.use_volta32 = True

        kwargs = {}
        if args.use_volta32:
            kwargs['slurm_constraint'] = 'volta32gb'
        if args.comment:
            kwargs['slurm_comment'] = args.comment

        executor.update_parameters(
            mem_gb=80 * num_gpus_per_node,
            gpus_per_node=num_gpus_per_node,
            tasks_per_node=num_gpus_per_node,  # one task per GPU
            cpus_per_task=10,
            nodes=nodes,
            timeout_min=timeout_min,  # max is 60 * 72
            slurm_partition=partition,
            slurm_signal_delay_s=120,
            **kwargs,
        )

        # Copy this combination's hyper-parameters onto the args namespace so
        # the Trainer picks them up.
        for k, v in params.items():
            setattr(args, k, v)
        executor.update_parameters(name=name)

        # Fresh rendezvous file per job for torch.distributed initialization.
        args.dist_url = get_init_file(shared_folder).as_uri()
        args.output_dir = args.job_dir

        trainer = Trainer(args)
        job = executor.submit(trainer)
        print("Submitted job_id:", job.job_id)