in main_distributed.py [0:0]
def launch():
    """Submit one SLURM job per training config through submitit.

    Reads the module-level ``args`` namespace. When ``args.batch_launch`` is
    set, ``args.fname`` is parsed as a YAML file that must contain a list of
    config file names; otherwise ``args.fname`` itself is the only config.
    Each config is wrapped in ``Trainer(args.sel, config)`` and submitted
    inside a single ``executor.batch()`` context. The id of every submitted
    job is printed once the batch context has exited.

    Raises:
        TypeError: If the batch file does not parse to a list of config
            file names.
    """
    executor = submitit.AutoExecutor(folder=args.folder)
    executor.update_parameters(
        slurm_partition=args.partition,
        slurm_constraint=args.device,
        slurm_comment='running PAWS code',
        slurm_mem='450G',
        timeout_min=args.time,
        nodes=args.nodes,
        tasks_per_node=args.tasks_per_node,
        cpus_per_task=10,
        # Same value as tasks_per_node, i.e. one GPU requested per task.
        gpus_per_node=args.tasks_per_node)

    config_fnames = [args.fname]
    if args.batch_launch:
        with open(args.fname, 'r') as y_file:
            # NOTE(review): FullLoader can construct more than plain data
            # types; if the batch file may come from an untrusted source,
            # consider yaml.safe_load instead.
            config_fnames = yaml.load(y_file, Loader=yaml.FullLoader)
        # Iterating a mapping or a bare string would submit one bogus job per
        # key / per character, and an empty file parses to None; fail early
        # with a clear message before anything is sent to the scheduler.
        if not isinstance(config_fnames, (list, tuple)):
            raise TypeError(
                f'batch file {args.fname!r} must contain a YAML list of '
                f'config file names, got {type(config_fnames).__name__}')

    jobs = []
    with executor.batch():
        for config_fname in config_fnames:
            trainer = Trainer(args.sel, config_fname)
            jobs.append(executor.submit(trainer))

    # Job ids are read only after the batch context has exited.
    for job in jobs:
        print(job.job_id)