in xlml/apis/task.py [0:0]
def _get_job_manifest(self):
    """Build the Kubernetes `batch/v1` Job manifest for this test.

    Every value comes from `self.task_test_config`: the test name and
    benchmark id label the Job, `num_hosts` sets both `completions` and
    `parallelism` of the Indexed Job, and the setup/test scripts are
    tokenised with `shlex.split` into the container's `command` and `args`.
    The single container requests `accelerator.count` `nvidia.com/gpu`
    devices and is scheduled onto nodes whose
    `cloud.google.com/gke-accelerator` label equals
    `accelerator.accelerator_type`.

    Returns:
      A dict describing the Job, ready to be serialised and submitted.
    """
    # pylint: disable=line-too-long
    config = self.task_test_config
    accelerator = config.accelerator

    # A zero-length timeout has always fallen back to one hour; an unset
    # (None) timeout now gets the same default instead of raising
    # AttributeError on `.total_seconds()`.
    timeout = config.timeout
    deadline_seconds = (
        int(timeout.total_seconds()) if timeout is not None else 0
    ) or 3600

    # Expose the pod's own name, namespace and `job-name` label to the
    # container as environment variables via `fieldRef`.
    pod_field_env = [
        {"name": env_name, "valueFrom": {"fieldRef": {"fieldPath": path}}}
        for env_name, path in (
            ("POD_NAME", "metadata.name"),
            ("POD_NAMESPACE", "metadata.namespace"),
            ("JOB_NAME", "metadata.labels['job-name']"),
        )
    ]

    main_container = {
        "name": "main",
        "image": config.docker_image,
        "imagePullPolicy": "Always",
        "command": shlex.split(config.setup_script),
        "args": shlex.split(config.test_script),
        "resources": {"limits": {"nvidia.com/gpu": accelerator.count}},
        "env": pod_field_env,
        # Mounts the memory-backed `dshm` volume declared in the pod spec.
        "volumeMounts": [
            {"mountPath": "/dev/shm", "name": "dshm", "readOnly": False},
        ],
    }

    pod_spec = {
        "subdomain": "headless-svc",
        "nodeSelector": {
            "cloud.google.com/gke-accelerator": accelerator.accelerator_type,
        },
        "restartPolicy": "Never",
        "containers": [main_container],
        "volumes": [
            {"emptyDir": {"medium": "Memory"}, "name": "dshm"},
        ],
    }

    return {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "generateName": f"{config.test_name}",
            "labels": {
                "accelerator": accelerator.name,
                "benchmarkId": config.benchmark_id,
            },
        },
        "spec": {
            "activeDeadlineSeconds": deadline_seconds,
            "backoffLimit": 0,
            "completionMode": "Indexed",
            "completions": config.num_hosts,
            "parallelism": config.num_hosts,
            "template": {
                "metadata": {
                    # Matches `headless-svc` in GKE cluster.
                    # See deployments directory.
                    "labels": {"headless-svc": "true"},
                },
                "spec": pod_spec,
            },
        },
    }