public scheduleMachine()

in ts/nni_manager/training_service/reusable/gpuScheduler.ts [56:191]


    /**
     * Chooses an environment (and GPUs, when required) for a trial according to its
     * placement constraint.
     *
     * Constraint handling:
     * - `'None'`: the required GPU count is `defaultRequiredGPUNum`, or 0 when that is undefined.
     * - `'GPUNumber'`: `constraint.gpus` must contain exactly one number, the required GPU count.
     * - otherwise (asserted to be `'Device'`): `constraint.gpus` is a list of
     *   `[host, gpuIndex]` tuples, all of which must name the same host.
     *
     * @param environments candidate environments
     * @param constraint placement constraint of the trial
     * @param defaultRequiredGPUNum GPU count used when the constraint type is `'None'`
     * @param trialDetail trial being scheduled; forwarded to `scheduleGPUHost` / `allocateHost`
     * @returns `REQUIRE_EXCEED_TOTAL` when no environment can ever satisfy the GPU count,
     *          `TMP_NO_AVAILABLE_GPU` when nothing suitable is available right now,
     *          otherwise the result produced by the allocation (`SUCCEED` with the chosen
     *          environment and GPUs in the `'Device'` case).
     * @throws Error if a `'GPUNumber'` constraint does not hold exactly one number, if a
     *         `'Device'` constraint is empty, spans several hosts, targets an environment
     *         without `rmMachineMeta`, names an unknown host, or names an invalid GPU index.
     */
    public scheduleMachine(environments: EnvironmentInformation[], constraint: PlacementConstraint,
        defaultRequiredGPUNum: number | undefined, trialDetail: TrialDetail): GpuScheduleResult {
        // Every "nothing available for now" exit returns a fresh result object of this shape.
        const noAvailableGpu = (): GpuScheduleResult => ({
            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
            gpuIndices: undefined,
            environment: undefined,
        });

        if (constraint.type === 'None' || constraint.type === 'GPUNumber') {
            let requiredGPUNum = 0;
            if (constraint.type === 'GPUNumber') {
                const gpus = constraint.gpus as Array<number>;
                // TODO: remove the following constraint when supporting distributed trial
                if (gpus.length !== 1) {
                    throw new Error("Placement constraint of GPUNumber must have exactly one number.");
                }
                requiredGPUNum = gpus[0];
            } else {
                requiredGPUNum = defaultRequiredGPUNum === undefined ? 0 : defaultRequiredGPUNum;
            }

            assert(requiredGPUNum >= 0);
            // Step 1: Check that the required GPU number does not exceed the GPU count of every machine.
            // An environment whose GPU summary is not reported yet is treated as potentially eligible.
            const hasEligibleEnvironment: boolean = environments.some((environment: EnvironmentInformation) =>
                environment.defaultGpuSummary === undefined || requiredGPUNum === 0 ||
                environment.defaultGpuSummary.gpuCount >= requiredGPUNum);
            if (!hasEligibleEnvironment) {
                // The required GPU number exceeds the upper limit of all machines' GPU numbers:
                // return REQUIRE_EXCEED_TOTAL directly.
                return {
                    resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
                    gpuIndices: undefined,
                    environment: undefined,
                };
            }

            // Step 2: Allocate Host/GPU for the specified trial job.
            // Currently the requiredGPUNum parameter is identical for all trial jobs.
            // NOTE(review): step 2 schedules over the full `environments` list, not only the
            // environments found eligible in step 1 — confirm this is intended.
            if (requiredGPUNum > 0) {
                // Trial job requires GPU
                const result: GpuScheduleResult | undefined = this.scheduleGPUHost(environments, requiredGPUNum, trialDetail);
                return result !== undefined ? result : noAvailableGpu();
            }

            // Trial job does not need GPU
            const allocatedRm: EnvironmentInformation = this.selectMachine(environments, environments);
            return this.allocateHost(requiredGPUNum, allocatedRm, [], trialDetail);
        } else {
            assert(constraint.type === 'Device')
            if (constraint.gpus.length === 0) {
                throw new Error("Device constraint is used but no device is specified.");
            }
            const gpus = constraint.gpus as Array<[string, number]>;
            const selectedHost = gpus[0][0];

            if (gpus.some((gpuTuple: [string, number]) => gpuTuple[0] !== selectedHost)) {
                //TODO: remove this constraint when supporting multi-host placement
                throw new Error("Device constraint does not support using multiple hosts")
            }
            if (environments.length === 0) {
                return noAvailableGpu();
            }
            for (const environment of environments) {
                if (!('rmMachineMeta' in environment)) {
                    //TODO: remove this constraint when supporting other training services
                    throw new Error(`Environment Device placement constraint only supports remote training service for now.`);
                }
            }

            // The first environment whose remote machine metadata names the requested host is used.
            const selectedEnvironment: EnvironmentInformation | undefined = environments.find(
                (environment: EnvironmentInformation) => {
                    const machineMeta = (environment as RemoteMachineEnvironmentInformation).rmMachineMeta;
                    return machineMeta != undefined && machineMeta.host === selectedHost;
                });
            if (selectedEnvironment === undefined) {
                throw new Error(`The required host (host: ${selectedHost}) is not found.`);
            }
            const availableResources = this.gpuResourceDetection([selectedEnvironment]);

            const gpuSummary = selectedEnvironment.defaultGpuSummary;
            if (gpuSummary === undefined) {
                //GPU summary may not be ready, retry until it is ready
                return noAvailableGpu();
            }

            // GPUs reported as available on the selected host; empty when the host has no entry.
            const availableGPUs: GPUInfo[] = availableResources.get(selectedEnvironment) ?? [];
            const selectedGPUs: Array<GPUInfo> = [];
            for (const gpuTuple of gpus) {
                const gpuIdx: number = gpuTuple[1];
                // A negative or non-integer index can never match an available GPU; fail fast
                // instead of reporting TMP_NO_AVAILABLE_GPU on every retry.
                if (!Number.isInteger(gpuIdx) || gpuIdx < 0) {
                    throw new Error(`The gpuIdx of placement constraint ${gpuIdx} is not a valid GPU index of the host ${selectedHost}`);
                }
                if (gpuIdx >= gpuSummary.gpuCount) {
                    throw new Error(`The gpuIdx of placement constraint ${gpuIdx} exceeds gpuCount of the host ${selectedHost}`);
                }

                for (const gpuInfo of availableGPUs) {
                    if (gpuInfo.index === gpuIdx) {
                        selectedGPUs.push(gpuInfo);
                    }
                }
            }

            if (selectedGPUs.length !== gpus.length) {
                // At least one requested GPU is not currently available.
                return noAvailableGpu();
            }

            // Record one more assignment on each selected GPU index.
            for (const gpuInfo of selectedGPUs) {
                const assignedCount: number = gpuSummary.assignedGpuIndexMap.get(gpuInfo.index) ?? 0;
                gpuSummary.assignedGpuIndexMap.set(gpuInfo.index, assignedCount + 1);
            }
            return {
                resultType: ScheduleResultType.SUCCEED,
                environment: selectedEnvironment,
                gpuIndices: selectedGPUs,
            };
        }
    }