in torchx/schedulers/docker_scheduler.py [0:0]
def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
    """Describe an app by inspecting the containers that belong to it.

    Every container returned by ``self._get_containers(app_id)`` is treated
    as one replica. Replicas are grouped into roles using the
    ``LABEL_ROLE_NAME`` container label, and the replica id is read from the
    ``LABEL_REPLICA_ID`` label.

    The aggregate app state is derived from the replica states:

    * if any replica is not in a terminal state, the state of the first such
      replica is reported;
    * otherwise ``SUCCEEDED`` if every replica succeeded, else ``FAILED``.

    Args:
        app_id: identifier handed to ``self._get_containers`` to look up the
            app's containers.

    Returns:
        A ``DescribeAppResponse`` with the roles, the per-replica statuses
        and the aggregate state, or ``None`` when no containers were found
        for ``app_id``.
    """
    roles = {}
    roles_statuses = {}
    replica_states = []

    for container in self._get_containers(app_id):
        role = container.labels[LABEL_ROLE_NAME]
        replica_id = container.labels[LABEL_REPLICA_ID]

        if role not in roles:
            # The first container seen for a role defines the role's image.
            roles[role] = Role(
                name=role,
                num_replicas=0,
                image=container.image,
            )
            roles_statuses[role] = RoleStatus(role, [])
        roles[role].num_replicas += 1

        if container.status == "exited":
            # docker doesn't have success/failed states -- we have to call
            # `wait()` to get the exit code to determine that
            exit_info = container.wait(timeout=10)
            if exit_info["StatusCode"] == 0:
                replica_state = AppState.SUCCEEDED
            else:
                replica_state = AppState.FAILED
        else:
            replica_state = CONTAINER_STATE[container.status]

        roles_statuses[role].replicas.append(
            ReplicaStatus(
                id=int(replica_id),
                role=role,
                state=replica_state,
                hostname=container.name,
            )
        )
        replica_states.append(replica_state)

    if not replica_states:
        # No containers at all: without this guard the vacuous `all(...)`
        # over an empty list would report a nonexistent app as SUCCEEDED.
        return None

    non_terminal = [s for s in replica_states if not is_terminal(s)]
    if non_terminal:
        # The app is still in flight; surface the first unfinished replica.
        app_state = non_terminal[0]
    elif all(s == AppState.SUCCEEDED for s in replica_states):
        app_state = AppState.SUCCEEDED
    else:
        app_state = AppState.FAILED

    return DescribeAppResponse(
        app_id=app_id,
        roles=list(roles.values()),
        roles_statuses=list(roles_statuses.values()),
        state=app_state,
    )