in launch.py [0:0]
def construct_cmd(args):
    """Construct the shell command requested by the parsed arguments.

    Depending on the flags set on ``args`` this returns one of: an
    ``scancel`` command (``kill``), a tensorboard command (``tb``), a
    ``python -m http.server`` command (``fl``), an ``rm -r`` command
    (``delete``), or -- when none of those is set -- the ``train_net.py``
    command line. The flags are checked in that order and the first one
    that is set wins.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``cfg``,
            ``kill``, ``tb``, ``fl``, ``delete``, ``local``, ``run_id``,
            ``debug``, ``test``, ``profile``, ``partition`` and ``rest``.

    Returns:
        The command as a single string, or ``''`` when the user declines
        the kill/delete confirmation prompt.

    Raises:
        AssertionError: If ``args.cfg`` is set but does not start with
            ``'expts'``.

    Side effects:
        The kill and delete modes prompt on stdin. When building the train
        command, a ``find ... -iname sync_file_init -delete`` is executed
        through the shell on the run folder before the command is returned.
    """
    if args.cfg:
        assert args.cfg.startswith('expts'), 'Must be wrt this directory'
    cfg_name = args.cfg or 'default'
    agent_folder = f'{BASE_RUN_DIR}/{cfg_name}'

    if args.kill:
        # The entries of the .submitit folder are passed to scancel as ids.
        slurm_ids = os.listdir(os.path.join(agent_folder, '.submitit/'))
        if input(f"Kill {slurm_ids} (y/N) ").lower() == 'y':
            return 'scancel {}'.format(' '.join(slurm_ids))
        # Declined: return an empty command, exactly like the delete branch
        # below, instead of falling through and building a train command.
        return ''

    if args.tb:  # Run tensorboard only
        return (f'cd {agent_folder}; tensorboard --logdir . '
                f'--port {get_free_port()} '
                f'--max_reload_threads 10 --window_title {args.cfg} ')

    if args.fl:  # Visualize the folder only
        # Serve the run folder over HTTP so it can be browsed
        return f'cd {agent_folder}; python -m http.server {get_free_port()}'

    if args.delete:
        cli = f'rm -r {agent_folder}/* {agent_folder}/.*'
        if input(f"Run {cli} (y/N) ").lower() == 'y':
            return cli
        return ''

    # Else, it is the general train command
    run_id, cli_stuff = read_file_into_cli(args.cfg,
                                           running_local=args.local,
                                           run_id=args.run_id)
    cli_stuff = ' '.join(escape_str(el) for el in cli_stuff)

    if args.debug:
        if args.test:
            # If args.test, then might be testing a model from other dir
            agent_folder = os.path.join(agent_folder, str(run_id))
        else:
            agent_folder = os.path.join(agent_folder, 'local')

    # Delete the sync file if it exists
    clear_cmd = f'find {agent_folder} -iname sync_file_init -delete'
    print(f'Clearing out the sync files using: {clear_cmd}')
    subprocess.call(clear_cmd, shell=True)

    runner = 'kernprof -l ' if args.profile else 'python '
    cli = ('export NCCL_SOCKET_IFNAME=; export GLOO_SOCKET_IFNAME=; '
           ' HYDRA_FULL_ERROR=1 '
           f' {runner} train_net.py hydra.run.dir={agent_folder} ')
    cli += cli_stuff
    if args.test:
        cli += ' test_only=True '
    if args.local:
        cli += (' hydra.launcher.nodes=1 '
                f' hydra.launcher.gpus_per_node={num_gpus()} '
                ' hydra/launcher=submitit_local ')
    else:
        cli += ' hydra.launcher.max_num_timeout=3 '
    if args.partition is not None and not args.local:
        cli += f' +hydra.launcher.partition="{args.partition}" '
    if args.debug:
        cli += ' data_train.workers=0 data_eval.workers=0 '
    # Extra user-supplied arguments are appended verbatim (not escaped).
    cli += ' ' + ' '.join(args.rest)
    # This must go at the end, the other args must go before
    # NOTE(review): presumably the Hydra multirun flag -- confirm.
    if not args.debug:
        cli += ' -m '
    return cli