in scripts/kube_dist_trainer.py [0:0]
def run_job(dryrun: bool = False) -> None:
    """Build and push the trainer image, then launch a 2-node distributed
    trainer app on the Kubernetes scheduler (Volcano queue ``default``).

    Args:
        dryrun: when True, only print the scheduler request that would be
            submitted instead of actually running the job and waiting on it.

    Raises:
        Exception: if the submitted app finishes in any state other than
            ``SUCCEEDED``.
    """
    register_gpu_resource()
    build = build_and_push_image()
    image = build.torchx_image
    storage_path = os.getenv("INTEGRATION_TEST_STORAGE", "/tmp/storage")
    # Namespace outputs per build id so concurrent integration runs don't collide.
    root = os.path.join(storage_path, build.id)
    output_path = os.path.join(root, "output")
    args = ("--output_path", output_path)
    train_app = trainer_dist(
        image=image,
        output_path=output_path,
        resource="GPU_X1",
        nnodes=2,
        nproc_per_node=1,
    )
    print(f"Starting Trainer with args: {args}")
    cfg = {
        "namespace": "default",
        "queue": "default",
    }
    print("Submitting pods")
    # Runner is a context manager; closing it tears down scheduler sessions
    # even if wait()/status() raises (the original leaked the runner).
    with get_runner("kubeflow-dist-runner") as runner:
        if dryrun:
            dryrun_info = runner.dryrun(train_app, "kubernetes", cfg)
            print(f"Dryrun info: {dryrun_info}")
        else:
            app_handle = runner.run(train_app, "kubernetes", cfg)
            print(app_handle)
            runner.wait(app_handle)
            final_status = runner.status(app_handle)
            print(f"Final status: {final_status}")
            if none_throws(final_status).state != AppState.SUCCEEDED:
                raise Exception(f"Dist app failed with status: {final_status}")