# run_training() — from scripts/run_training.py

def run_training(sweep_dir, Algorithms, archs, nalg_seeds, ndata_seeds, force):
  """Submit one SLURM training job per sweep configuration and wait on them.

  Builds the cartesian product of (data_seed, Algorithm, alg_seed, arch, sn),
  materializes a per-run config and output directory under ``sweep_dir``,
  submits all jobs in a single executor batch, then hands them to
  ``utils.handle_jobs`` for monitoring.

  Args:
    sweep_dir: root directory for per-run output folders and submit logs.
    Algorithms: iterable of algorithm classes; ``__name__`` keys each run.
    archs: iterable of architecture identifiers.
    nalg_seeds: number of algorithm seeds (expanded as ``range(nalg_seeds)``).
    ndata_seeds: number of dataset seeds (expanded as ``range(ndata_seeds)``).
    force: when True, resubmit runs even if ``utils.train_done`` reports done.

  Returns:
    ``(finished_jobs, jobs)`` as produced by ``utils.handle_jobs``.
  """
  sweep_path = Path(sweep_dir)
  sweep_path.mkdir(parents=True, exist_ok=True)

  train_cfg = OmegaConf.create(TRAINING_CFG)
  # NOTE(review): dataset root falls back to a hard-coded cluster path —
  # confirm this default is intended outside that environment.
  dataset = datasets.load_partitions(
      train_cfg.dataset.name,
      root=os.getenv('DATASETS_ROOT',
                     '/checkpoint/ishmaelb/data/datasets/ILSVRC2012'),
      clustering_file=str(sweep_path / 'clustering.pkl'),
      equalize_partitions=True)[('train', 'in')]

  executor = utils.get_slurm_executor(
      copy.deepcopy(train_cfg.slurm),
      log_folder=sweep_path / 'logs' / 'run_training')

  # Full cartesian product of sweep axes; sn toggles the algorithm's `sn` flag.
  args_iter = itertools.product(
      range(ndata_seeds), Algorithms, range(nalg_seeds), archs, [False, True])

  jobs, paths = [], []
  with executor.batch():  # group all submissions into one batch
    for data_seed, Algorithm, alg_seed, arch, sn in args_iter:
      # Fresh config per run so per-run mutations never leak across jobs.
      _train_cfg = OmegaConf.create(TRAINING_CFG)
      _train_cfg.dataset.name = dataset.name
      _train_cfg.dataset.seed = data_seed
      _train_cfg.algorithm.sn = sn
      _train_cfg.algorithm.name = Algorithm.__name__
      _train_cfg.algorithm.arch = arch
      _train_cfg.algorithm.seed = alg_seed

      output_dir = f'{Algorithm.__name__}_{dataset.name}_{arch}_{sn}_{data_seed}_{alg_seed}'
      model_path = sweep_path / output_dir
      model_path.mkdir(parents=True, exist_ok=True)
      (model_path / '.algorithm').touch()  # marker file for downstream tooling
      _train_cfg.output_dir = str(model_path.absolute())
      OmegaConf.set_struct(_train_cfg, True)  # lock schema against typo'd keys

      if utils.train_done(model_path) and not force:
        print(f'{output_dir} is done. Skipping')
        continue

      # BUG FIX: the original passed fp.name, making OmegaConf reopen the path
      # and leaving the `with`-managed handle unused; pass the file object.
      with open(model_path / 'train_cfg.yaml', 'w') as fp:
        OmegaConf.save(_train_cfg, f=fp)

      worker = workers.Trainer()
      job = executor.submit(worker, _train_cfg, Algorithm, dataset)
      jobs.append(job)
      paths.append(model_path)
      utils.write_trace('train.pending', dir_=str(model_path))

  # Watch (job, output dir) pairs while the jobs run.
  beholder = utils.Beholder(list(zip(jobs, paths)), stem='train')
  beholder.start()

  # BUG FIX: removed leftover `import ipdb; ipdb.set_trace()` debugger
  # breakpoint that would hang any non-interactive invocation.
  finished_jobs, jobs = utils.handle_jobs(jobs)
  return finished_jobs, jobs