# main() — defined in run_with_submitit.py

def main():
    """Submit one SLURM training job per hyper-parameter combination.

    Parses CLI arguments, snapshots the source tree into a shared folder,
    then iterates over the cartesian product of ``grid``, configuring and
    submitting one job (via submitit) per combination. Prints each job id.
    """
    args = parse_args()
    shared_folder = get_shared_folder()
    copy_py(shared_folder)
    # Run from the shared copy so submitted jobs execute the snapshotted code.
    os.chdir(shared_folder)

    # Hyper-parameter grid: one job is launched per combination of values.
    grid = {
        'model': ['convit_base'],
    }

    def dict_product(d):
        """Yield each combination of the grid as a {key: value} dict."""
        keys = d.keys()
        for element in itertools.product(*d.values()):
            yield dict(zip(keys, element))

    for params in dict_product(grid):
        # Job name encodes the parameter combination, e.g. "model_convit_base".
        name = '_'.join('{}_{}'.format(k, v) for k, v in params.items())
        args.shared_dir = shared_folder
        args.job_dir = shared_folder / name
        # Resume automatically when a previous run left a checkpoint behind.
        checkpoint = args.job_dir / 'checkpoint.pth'
        if checkpoint.exists():
            args.resume = checkpoint

        # Note that the folder will depend on the job_id, to easily track experiments
        executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

        num_gpus_per_node = args.ngpus
        nodes = args.nodes
        timeout_min = args.timeout
        partition = args.partition

        # NOTE(review): unconditionally overrides any CLI-provided value of
        # --use_volta32, making the flag below always true — confirm intended.
        args.use_volta32 = True

        kwargs = {}
        if args.use_volta32:
            kwargs['slurm_constraint'] = 'volta32gb'
        if args.comment:
            kwargs['slurm_comment'] = args.comment

        executor.update_parameters(
            mem_gb=80 * num_gpus_per_node,
            gpus_per_node=num_gpus_per_node,
            tasks_per_node=num_gpus_per_node,  # one task per GPU
            cpus_per_task=10,
            nodes=nodes,
            timeout_min=timeout_min,  # max is 60 * 72
            slurm_partition=partition,
            slurm_signal_delay_s=120,
            **kwargs
        )

        # Apply this combination's grid values onto the args namespace.
        for k, v in params.items():
            setattr(args, k, v)

        executor.update_parameters(name=name)
        # Fresh rendezvous file per job for torch.distributed initialization.
        args.dist_url = get_init_file(shared_folder).as_uri()
        args.output_dir = args.job_dir

        trainer = Trainer(args)
        job = executor.submit(trainer)

        print("Submitted job_id:", job.job_id)