in xlml/utils/gke.py [0:0]
def wait_all_pods_ready(name: str):
    """Report whether every pod expected for the Job ``name`` exists.

    Reads the Job from the ``default`` namespace, raises if the Job reports
    any failed pods, and otherwise compares the number of pods carrying the
    ``batch.kubernetes.io/job-name=<name>`` label with
    ``body['spec']['parallelism']``.

    Despite the function's name, only the pod *count* is compared here; pod
    phase and readiness conditions are not inspected.

    NOTE(review): ``gcp``, ``cluster_name``, ``body`` and
    ``get_authenticated_client`` are not defined in this block; presumably
    they come from an enclosing scope or the module -- confirm.

    Args:
        name: Name of the Kubernetes Job to inspect.

    Returns:
        ``True`` when the number of listed pods equals the Job's configured
        parallelism, ``False`` otherwise.

    Raises:
        RuntimeError: If the Job status reports one or more failed pods.
    """
    api_client = get_authenticated_client(
        gcp.project_name, gcp.zone, cluster_name
    )

    k8s_job = kubernetes.client.BatchV1Api(api_client).read_namespaced_job(
        namespace='default', name=name
    )
    # TODO(wcromar): Handle other conditions (e.g. unschedulability)
    logging.info(f'Job status: {k8s_job.status}')

    failed_pods = k8s_job.status.failed
    if failed_pods:
        raise RuntimeError(f'Job has {failed_pods} failed pods.')

    # Pods are looked up via the job-name label rather than through the Job
    # object itself.
    job_pods = kubernetes.client.CoreV1Api(api_client).list_namespaced_pod(
        namespace='default',
        label_selector=f'batch.kubernetes.io/job-name={name}',
    )

    all_created = len(job_pods.items) == body['spec']['parallelism']
    if not all_created:
        logging.info('Waiting for all pods to be created...')
    return all_created