in ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts [105:233]
/**
 * Submit one trial as an Adl job.
 *
 * Steps, in order:
 *  1. fill in the job template and create the job through the CRD client;
 *  2. read the created job back to obtain its uid;
 *  3. create a PersistentVolumeClaim and a ConfigMap, both named after the job
 *     and both pointing their first ownerReference at it;
 *  4. record the trial in `trialJobsMap`.
 *
 * @param form application form of the trial; stored in the trial detail and
 *             handed to `prepareRunScript`
 * @returns the detail of the new trial, created with status 'WAITING'
 * @throws Error when the CRD client or the Adl trial config is undefined
 */
public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
    if (this.kubernetesCRDClient === undefined) {
        throw new Error('Adl job operator client is undefined');
    }
    if (this.adlTrialConfig === undefined) {
        throw new Error('Adl trial config is undefined');
    }
    if (this.kubernetesRestServerPort === undefined) {
        // Port not cached yet: take it from the AdlJobRestServer component.
        const jobRestServer: AdlJobRestServer = component.get(AdlJobRestServer);
        this.kubernetesRestServerPort = jobRestServer.clusterRestServerPort;
    }

    // Snapshot used only for the synchronous section below (up to the first await).
    const trialConfig = this.adlTrialConfig;

    const trialJobId: string = uniqueString(5);
    const adlJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
    const startStatus: TrialJobStatus = 'WAITING';
    const codeDir = trialConfig.codeDir;
    const outputDir = "output";
    const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
        trialJobId, startStatus, Date.now(), codeDir, form, adlJobName, outputDir);

    // ---- Adl job: identity and labels ----
    const job: any = JSON.parse(this.jobTemplateStr);
    const jobMeta = job.metadata;
    jobMeta.name = adlJobName;
    jobMeta.labels.app = this.NNI_KUBERNETES_TRIAL_LABEL;
    jobMeta.labels.expId = this.experimentId;
    jobMeta.labels.trialId = trialJobId;

    // `preemptible` is only written when the config sets `adaptive`.
    if (trialConfig.adaptive !== undefined) {
        job.spec.preemptible = trialConfig.adaptive;
    }

    // ---- Adl job: pod template ----
    const podSpec = job.spec.template.spec;
    const mainContainer = podSpec.containers[0];
    mainContainer.image = trialConfig.image;

    // The template's first three volumes are bound by position:
    // [0] claim named after this job, [1] the tensorboard claim,
    // [2] the ConfigMap named after this job.
    const volumes = podSpec.volumes;
    volumes[0].persistentVolumeClaim.claimName = adlJobName;
    volumes[1].persistentVolumeClaim.claimName = this.tensorboardName;
    volumes[2].configMap.name = adlJobName;

    // Resource requests default to 1 CPU / "1Gi" when the config leaves them undefined.
    const cpu: number = trialConfig.cpuNum !== undefined ? trialConfig.cpuNum : 1;
    const memory: string = trialConfig.memorySize !== undefined ? trialConfig.memorySize : "1Gi";
    mainContainer.resources.requests.memory = memory;
    mainContainer.resources.requests.cpu = cpu;
    mainContainer.resources.limits["nvidia.com/gpu"] = trialConfig.gpuNum;

    // Extra image pull secrets are appended to whatever the template already lists.
    if (trialConfig.imagePullSecrets !== undefined) {
        podSpec.imagePullSecrets = podSpec.imagePullSecrets.concat(trialConfig.imagePullSecrets);
    }

    // Optional NFS share: one extra volume plus its mount in the first container.
    const nfsConfig = trialConfig.nfs;
    if (nfsConfig !== undefined) {
        volumes.push({
            "name": "nfs",
            "nfs": {
                "server": nfsConfig.server,
                "path": nfsConfig.path,
                "readOnly": false
            }
        });
        mainContainer.volumeMounts.push({
            "name": "nfs",
            "mountPath": nfsConfig.containerMountPath
        });
    }

    // The job is created first: the PVC and ConfigMap below need its uid.
    await this.kubernetesCRDClient.createKubernetesJob(job);
    const createdJob: any = await this.kubernetesCRDClient.getKubernetesJob(adlJobName);

    // ---- PersistentVolumeClaim ----
    const pvc: any = JSON.parse(this.pvcTemplateStr);
    pvc.metadata.name = adlJobName;
    const pvcOwner = pvc.metadata.ownerReferences[0];
    pvcOwner.name = adlJobName;
    pvcOwner.uid = createdJob.metadata.uid;

    const checkpoint = this.adlTrialConfig.checkpoint;
    if (checkpoint == undefined) {
        // No checkpoint settings: "1Gi" and the class returned by getStorageClass().
        pvc.spec.resources.requests.storage = "1Gi";
        pvc.spec.storageClassName = await this.genericK8sClient.getStorageClass();
    } else {
        pvc.spec.resources.requests.storage = checkpoint.storageSize;
        pvc.spec.storageClassName = checkpoint.storageClass;
    }
    await this.genericK8sClient.createPersistentVolumeClaim(pvc);

    // ---- ConfigMap carrying run.sh and cleanup.sh ----
    const configmap: any = JSON.parse(this.configmapTemplateStr);
    configmap.metadata.name = adlJobName;
    const configmapOwner = configmap.metadata.ownerReferences[0];
    configmapOwner.name = adlJobName;
    configmapOwner.uid = createdJob.metadata.uid;

    configmap.data["run.sh"] = await this.prepareRunScript(trialJobId, form, codeDir, outputDir);
    // cleanup.sh: `kill -2` every process matching the trial_keeper command line,
    // then poll once per second until only one matching `ps` line remains.
    // The script text must stay flush-left; it is emitted verbatim.
    configmap.data["cleanup.sh"] =
`#!/bin/bash
ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
while true;
do
proc=\`ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
if (( $proc == 1 )); then
exit 0
else
echo "waiting"
fi
sleep 1
done
`;
    await this.genericK8sClient.createConfigMap(configmap);

    // The trial is registered only after every cluster resource was created.
    this.trialJobsMap.set(trialJobId, trialJobDetail);
    return Promise.resolve(trialJobDetail);
}