def wait_all_pods_ready()

in xlml/utils/gke.py [0:0]


  def wait_all_pods_ready(name: str):
    """Report whether all pods of the Kubernetes job `name` have been created.

    Looks the job up in the 'default' namespace, fails fast if the job already
    reports failed pods, and then compares the number of pods carrying the
    job's `batch.kubernetes.io/job-name` label against
    `body['spec']['parallelism']`.

    Note that only the pod *count* is compared; individual pod phase or
    readiness conditions are not inspected here.

    `gcp`, `cluster_name` and `body` are not defined in this function;
    presumably they are captured from the enclosing scope -- confirm against
    the surrounding code.

    Args:
      name: Name of the job to inspect.

    Returns:
      True when the number of matching pods equals the requested parallelism,
      False otherwise.

    Raises:
      RuntimeError: If the job status reports any failed pods.
    """
    api_client = get_authenticated_client(
        gcp.project_name, gcp.zone, cluster_name
    )

    job = kubernetes.client.BatchV1Api(api_client).read_namespaced_job(
        namespace='default', name=name
    )

    # TODO(wcromar): Handle other conditions (e.g. unschedulability)
    logging.info(f'Job status: {job.status}')
    failed_count = job.status.failed
    if failed_count:
      raise RuntimeError(f'Job has {failed_count} failed pods.')

    # Pods are matched to the job through the job-name label.
    matching_pods = kubernetes.client.CoreV1Api(api_client).list_namespaced_pod(
        namespace='default',
        label_selector=f'batch.kubernetes.io/job-name={name}',
    )

    all_created = len(matching_pods.items) == body['spec']['parallelism']
    if not all_created:
      logging.info('Waiting for all pods to be created...')
    return all_created