public async submitTrialJob()

in ts/nni_manager/training_service/kubernetes/adl/adlTrainingService.ts [105:233]


    /**
     * Submit one trial as an AdlJob.
     *
     * The job manifest is built from `jobTemplateStr` and created first; its uid is then
     * used as the owner reference of a PVC and of a ConfigMap (holding `run.sh` and
     * `cleanup.sh`), both named after the job. The trial is registered in `trialJobsMap`
     * only after all three resources have been created.
     *
     * @param form application form of the trial; stored in the trial detail and passed to `prepareRunScript`
     * @returns the detail of the newly submitted trial, created with status 'WAITING'
     * @throws Error when the CRD client or the Adl trial config is undefined
     */
    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const crdClient = this.kubernetesCRDClient;
        if (crdClient === undefined) {
            throw new Error('Adl job operator client is undefined');
        }

        const trialConfig = this.adlTrialConfig;
        if (trialConfig === undefined) {
            throw new Error('Adl trial config is undefined');
        }

        // Resolve the REST server port on first use.
        if (this.kubernetesRestServerPort === undefined) {
            const server: AdlJobRestServer = component.get(AdlJobRestServer);
            this.kubernetesRestServerPort = server.clusterRestServerPort;
        }

        const trialJobId: string = uniqueString(5);
        // The same name is shared by the job, its PVC and its ConfigMap.
        const resourceName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
        const codeDir = trialConfig.codeDir;
        const outputDir = "output";
        const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
            trialJobId, 'WAITING', Date.now(), codeDir, form, resourceName, outputDir);

        // ---- AdlJob -------------------------------------------------------
        const job: any = JSON.parse(this.jobTemplateStr);
        const jobMeta: any = job.metadata;
        jobMeta.name = resourceName;
        jobMeta.labels.app = this.NNI_KUBERNETES_TRIAL_LABEL;
        jobMeta.labels.expId = this.experimentId;
        jobMeta.labels.trialId = trialJobId;
        if (trialConfig.adaptive !== undefined) {
            job.spec.preemptible = trialConfig.adaptive;
        }

        const podSpec: any = job.spec.template.spec;
        const container: any = podSpec.containers[0];
        const volumes: any = podSpec.volumes;
        container.image = trialConfig.image;
        // Template volume slots: [0] per-trial PVC, [1] tensorboard PVC, [2] script ConfigMap.
        volumes[0].persistentVolumeClaim.claimName = resourceName;
        volumes[1].persistentVolumeClaim.claimName = this.tensorboardName;
        volumes[2].configMap.name = resourceName;

        // Pod resources: fall back to 1 CPU / 1Gi memory when not configured.
        const cpu: number = trialConfig.cpuNum !== undefined ? trialConfig.cpuNum : 1;
        const memory: string = trialConfig.memorySize !== undefined ? trialConfig.memorySize : "1Gi";
        const resources: any = container.resources;
        resources.requests.memory = memory;
        resources.requests.cpu = cpu;
        resources.limits["nvidia.com/gpu"] = trialConfig.gpuNum;

        // Append user-supplied image pull secrets to the ones from the template.
        if (trialConfig.imagePullSecrets !== undefined) {
            podSpec.imagePullSecrets = podSpec.imagePullSecrets.concat(trialConfig.imagePullSecrets);
        }

        // Optional NFS volume plus its mount inside the first container.
        const nfs = trialConfig.nfs;
        if (nfs !== undefined) {
            volumes.push({
                "name": "nfs",
                "nfs": {
                    "server": nfs.server,
                    "path": nfs.path,
                    "readOnly": false
                }
            });
            container.volumeMounts.push({
                "name": "nfs",
                "mountPath": nfs.containerMountPath
            });
        }

        await crdClient.createKubernetesJob(job);
        // Read the job back: its uid is needed for the owner references below.
        const createdJob: any = await crdClient.getKubernetesJob(resourceName);

        // ---- PVC ----------------------------------------------------------
        const pvc: any = JSON.parse(this.pvcTemplateStr);
        pvc.metadata.name = resourceName;
        const pvcOwner: any = pvc.metadata.ownerReferences[0];
        pvcOwner.name = resourceName;
        pvcOwner.uid = createdJob.metadata.uid;
        const pvcSpec: any = pvc.spec;
        const checkpoint = trialConfig.checkpoint;
        if (checkpoint == undefined) {
            // No checkpoint config: 1Gi on the storage class reported by the generic client.
            pvcSpec.resources.requests.storage = "1Gi";
            pvcSpec.storageClassName = await this.genericK8sClient.getStorageClass();
        } else {
            pvcSpec.resources.requests.storage = checkpoint.storageSize;
            pvcSpec.storageClassName = checkpoint.storageClass;
        }
        await this.genericK8sClient.createPersistentVolumeClaim(pvc);

        // ---- ConfigMap with the run / cleanup scripts ----------------------
        const configmap: any = JSON.parse(this.configmapTemplateStr);
        configmap.metadata.name = resourceName;
        const configmapOwner: any = configmap.metadata.ownerReferences[0];
        configmapOwner.name = resourceName;
        configmapOwner.uid = createdJob.metadata.uid;
        const scripts: any = configmap.data;
        scripts["run.sh"] = await this.prepareRunScript(trialJobId, form, codeDir, outputDir);
        // Sends signal 2 to the trial_keeper processes, then polls once per second
        // until the grep match count drops to 1.
        const cleanupScript: string =
`#!/bin/bash
ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | xargs kill -2
while true;
do
    proc=\`ps aux | grep "python3 -m nni.tools.trial_tool.trial_keeper" | awk '{print $2}' | grep "" -c\`
    if (( $proc == 1  )); then
        exit 0
    else
        echo "waiting"
    fi
    sleep 1
done
`;
        scripts["cleanup.sh"] = cleanupScript;
        await this.genericK8sClient.createConfigMap(configmap);

        // Track the trial only once every Kubernetes resource has been created.
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        return Promise.resolve(trialJobDetail);
    }