def run_job()

in scripts/kube_dist_trainer.py [0:0]


def run_job(dryrun: bool = False) -> None:
    """Build the trainer image and submit the distributed trainer app.

    The app is submitted through the "kubeflow-dist-runner" runner to the
    "kubernetes" scheduler, using the "default" namespace and queue. Its
    output path is ``<storage>/<build.id>/output``, where ``<storage>`` is
    read from the ``INTEGRATION_TEST_STORAGE`` environment variable and
    falls back to ``/tmp/storage``.

    Args:
        dryrun: When true, only request and print the dryrun info; nothing
            is actually run or waited on.

    Raises:
        Exception: If the app's final state is not ``AppState.SUCCEEDED``.
    """
    register_gpu_resource()
    build_result = build_and_push_image()
    trainer_image = build_result.torchx_image
    job_runner = get_runner("kubeflow-dist-runner")

    storage_base = os.getenv("INTEGRATION_TEST_STORAGE", "/tmp/storage")
    output_path = os.path.join(storage_base, build_result.id, "output")

    # Only used for the log line below; the app itself receives output_path
    # as a keyword argument.
    cli_args = ("--output_path", output_path)
    train_app = trainer_dist(
        image=trainer_image,
        output_path=output_path,
        resource="GPU_X1",
        nnodes=2,
        nproc_per_node=1,
    )
    print(f"Starting Trainer with args: {cli_args}")

    scheduler_cfg = {"namespace": "default", "queue": "default"}
    print("Submitting pods")

    if dryrun:
        dryrun_info = job_runner.dryrun(train_app, "kubernetes", scheduler_cfg)
        print(f"Dryrun info: {dryrun_info}")
        return

    app_handle = job_runner.run(train_app, "kubernetes", scheduler_cfg)
    print(app_handle)
    job_runner.wait(app_handle)

    final_status = job_runner.status(app_handle)
    print(f"Final status: {final_status}")
    # NOTE(review): none_throws presumably raises when the status is None;
    # confirm against its definition.
    if none_throws(final_status).state != AppState.SUCCEEDED:
        raise Exception(f"Dist app failed with status: {final_status}")