in scripts/run_training.py
import copy
import itertools
import os
from pathlib import Path

from omegaconf import OmegaConf

import datasets  # project-local module (assumed import path)
import utils     # project-local module (assumed import path)
import workers   # project-local module (assumed import path)

# TRAINING_CFG, the base training config, is assumed to be defined or
# imported elsewhere in this module.


def run_training(sweep_dir, Algorithms, archs, nalg_seeds, ndata_seeds, force):
    sweep_path = Path(sweep_dir)
    sweep_path.mkdir(parents=True, exist_ok=True)
    train_cfg = OmegaConf.create(TRAINING_CFG)
    # Load the in-distribution training partition; DATASETS_ROOT overrides
    # the default dataset location.
    dataset = datasets.load_partitions(
        train_cfg.dataset.name,
        root=os.getenv('DATASETS_ROOT',
                       '/checkpoint/ishmaelb/data/datasets/ILSVRC2012'),
        clustering_file=str(sweep_path / 'clustering.pkl'),
        equalize_partitions=True)[('train', 'in')]
    executor = utils.get_slurm_executor(
        copy.deepcopy(train_cfg.slurm),
        log_folder=sweep_path / 'logs' / 'run_training')
    # Sweep axes: data seed x algorithm class x algorithm seed x
    # architecture x `sn` flag (both settings).
    data_seeds = range(ndata_seeds)
    alg_seeds = range(nalg_seeds)
    sns = [False, True]
    args_iter = itertools.product(
        data_seeds, Algorithms, alg_seeds, archs, sns)
    jobs, paths = [], []
    with executor.batch():  # queue submissions; jobs go out together on exit
        for (data_seed, Algorithm, alg_seed, arch, sn) in args_iter:
            # Fresh config per job so per-job mutations don't leak across
            # iterations.
            _train_cfg = OmegaConf.create(TRAINING_CFG)
            _train_cfg.dataset.name = dataset.name
            _train_cfg.dataset.seed = data_seed
            _train_cfg.algorithm.sn = sn
            _train_cfg.algorithm.name = Algorithm.__name__
            _train_cfg.algorithm.arch = arch
            _train_cfg.algorithm.seed = alg_seed
            output_dir = (f'{Algorithm.__name__}_{dataset.name}_{arch}'
                          f'_{sn}_{data_seed}_{alg_seed}')
            model_path = sweep_path / output_dir
            model_path.mkdir(parents=True, exist_ok=True)
            (model_path / '.algorithm').touch()
            _train_cfg.output_dir = str(model_path.absolute())
            # Freeze the schema: later accesses to unknown keys raise
            # instead of silently creating them.
            OmegaConf.set_struct(_train_cfg, True)
            if utils.train_done(model_path) and not force:
                print(f'{output_dir} is done. Skipping')
                continue
            with open(model_path / 'train_cfg.yaml', 'w') as fp:
                OmegaConf.save(_train_cfg, f=fp)  # write via the open handle
            worker = workers.Trainer()
            worker_args = (_train_cfg, Algorithm, dataset)
            job = executor.submit(worker, *worker_args)
            jobs.append(job)
            paths.append(model_path)
            utils.write_trace('train.pending', dir_=str(model_path))
    # Once the batch context exits, the jobs are actually submitted;
    # watch them and collect results.
    beholder = utils.Beholder(list(zip(jobs, paths)), stem='train')
    beholder.start()
    finished_jobs, jobs = utils.handle_jobs(jobs)
    return finished_jobs, jobs
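

# A minimal invocation sketch (not part of the original script) showing how
# `run_training` might be called from the command line. The flag names, the
# default sweep directory, and the example algorithm/architecture values are
# hypothetical placeholders, not the project's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--sweep_dir', default='/tmp/sweep')  # hypothetical default
    parser.add_argument('--nalg_seeds', type=int, default=1)
    parser.add_argument('--ndata_seeds', type=int, default=1)
    parser.add_argument('--force', action='store_true')
    cli = parser.parse_args()

    # Algorithms and archs would normally come from the project's registry;
    # the placeholders below only illustrate the expected shapes (a list of
    # algorithm classes and a list of architecture-name strings).
    run_training(cli.sweep_dir,
                 Algorithms=[],          # e.g. a list of algorithm classes
                 archs=['resnet18'],     # e.g. architecture names
                 nalg_seeds=cli.nalg_seeds,
                 ndata_seeds=cli.ndata_seeds,
                 force=cli.force)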