in ts/nni_manager/training_service/reusable/gpuScheduler.ts [56:191]
/**
 * Choose an environment (and, when required, specific GPUs) for a trial
 * according to its placement constraint.
 *
 * Constraint types handled:
 *  - `'None'`: the trial needs `defaultRequiredGPUNum` GPUs (0 when undefined).
 *  - `'GPUNumber'`: the trial needs `constraint.gpus[0]` GPUs.
 *  - `'Device'`: the trial must run on the exact `[host, gpuIndex]` pairs listed
 *    in `constraint.gpus`, all of which must name the same host.
 *
 * @param environments candidate environments to schedule on
 * @param constraint placement constraint of the trial
 * @param defaultRequiredGPUNum GPU count used when the constraint type is `'None'`
 * @param trialDetail trial being scheduled; forwarded to `scheduleGPUHost` / `allocateHost`
 * @returns
 *  - `REQUIRE_EXCEED_TOTAL` when no environment has enough GPUs in total;
 *  - whatever `scheduleGPUHost` / `allocateHost` return for count-based constraints;
 *  - `SUCCEED` with the selected environment and GPUs for a satisfiable `'Device'` constraint;
 *  - `TMP_NO_AVAILABLE_GPU` when nothing can be allocated at the moment.
 * @throws Error when a `'GPUNumber'` constraint does not contain exactly one number,
 *  when a `'Device'` constraint is empty, spans several hosts, is used with an
 *  environment lacking `rmMachineMeta`, names an unknown host, or contains a GPU
 *  index that is negative, non-integer, or not below the host's `gpuCount`.
 */
public scheduleMachine(environments: EnvironmentInformation[], constraint: PlacementConstraint,
    defaultRequiredGPUNum: number | undefined, trialDetail: TrialDetail): GpuScheduleResult {
    // Builds the "nothing available right now, retry later" result.
    const noAvailableGpu = (): GpuScheduleResult => ({
        resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
        gpuIndices: undefined,
        environment: undefined,
    });

    if (constraint.type === 'None' || constraint.type === 'GPUNumber') {
        let requiredGPUNum: number;
        if (constraint.type === 'None') {
            requiredGPUNum = defaultRequiredGPUNum ?? 0;
        } else {
            const gpus = constraint.gpus as Array<number>;
            // TODO: remove the following constraint when supporting distributed trial
            if (gpus.length !== 1) {
                throw new Error("Placement constraint of GPUNumber must have exactly one number.");
            }
            requiredGPUNum = gpus[0];
        }
        assert(requiredGPUNum >= 0);

        // Step 1: Check that the required GPU number does not exceed the total GPU number
        // of every machine. An environment without a GPU summary yet is treated as eligible.
        const eligibleEnvironments: EnvironmentInformation[] = environments.filter((environment: EnvironmentInformation) =>
            environment.defaultGpuSummary === undefined || requiredGPUNum === 0 ||
            environment.defaultGpuSummary.gpuCount >= requiredGPUNum);
        if (eligibleEnvironments.length === 0) {
            // The required GPU number exceeds the upper limit of every machine's GPU number,
            // so return REQUIRE_EXCEED_TOTAL directly.
            return {
                resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
                gpuIndices: undefined,
                environment: undefined,
            };
        }

        // Step 2: Allocate Host/GPU for the specified trial job.
        // Currently the requiredGPUNum parameter is identical for all trial jobs.
        // NOTE(review): scheduling runs over the full `environments` list, not
        // `eligibleEnvironments`; the filter above only drives the REQUIRE_EXCEED_TOTAL decision.
        if (requiredGPUNum > 0) {
            // Trial job requires GPU
            const result: GpuScheduleResult | undefined = this.scheduleGPUHost(environments, requiredGPUNum, trialDetail);
            return result !== undefined ? result : noAvailableGpu();
        }
        // Trial job does not need GPU
        const allocatedRm: EnvironmentInformation = this.selectMachine(environments, environments);
        return this.allocateHost(requiredGPUNum, allocatedRm, [], trialDetail);
    } else {
        assert(constraint.type === 'Device');
        if (constraint.gpus.length === 0) {
            throw new Error("Device constraint is used but no device is specified.");
        }
        const gpus = constraint.gpus as Array<[string, number]>;
        const selectedHost = gpus[0][0];

        if (gpus.some((gpuTuple: [string, number]) => gpuTuple[0] !== selectedHost)) {
            //TODO: remove this constraint when supporting multi-host placement
            throw new Error("Device constraint does not support using multiple hosts")
        }
        if (environments.length === 0) {
            return noAvailableGpu();
        }
        for (const environment of environments) {
            if (!('rmMachineMeta' in environment)) {
                //TODO: remove this constraint when supporting other training services
                throw new Error(`Environment Device placement constraint only supports remote training service for now.`);
            }
        }

        // Keep only the environments whose remote machine host is the requested one.
        const eligibleEnvironments: EnvironmentInformation[] = environments.filter(
            (environment: EnvironmentInformation) => {
                const machineMeta = (environment as RemoteMachineEnvironmentInformation).rmMachineMeta;
                return machineMeta != null && machineMeta.host === selectedHost;
            });
        if (eligibleEnvironments.length === 0) {
            throw new Error(`The the required host (host: ${selectedHost}) is not found.`);
        }

        const selectedEnvironment = eligibleEnvironments[0];
        const availableResources = this.gpuResourceDetection([selectedEnvironment]);
        const gpuSummary = selectedEnvironment.defaultGpuSummary;
        if (gpuSummary === undefined) {
            // GPU summary may not be ready, retry until it is ready
            return noAvailableGpu();
        }

        // GPUs currently reported as available on the selected host (none when the host is absent from the map).
        const availableGPUs = availableResources.get(selectedEnvironment) ?? [];
        const selectedGPUs: Array<GPUInfo> = [];
        for (const gpuTuple of gpus) {
            const gpuIdx: number = gpuTuple[1];
            // A negative or fractional index can never match a detected GPU; without this check
            // the trial would be reported as "temporarily no GPU" on every call instead of failing.
            if (!Number.isInteger(gpuIdx) || gpuIdx < 0) {
                throw new Error(`The gpuIdx of placement constraint ${gpuIdx} is not a valid GPU index of the host ${selectedHost}`);
            }
            if (gpuIdx >= gpuSummary.gpuCount) {
                throw new Error(`The gpuIdx of placement constraint ${gpuIdx} exceeds gpuCount of the host ${selectedHost}`);
            }
            for (const gpuInfo of availableGPUs) {
                if (gpuInfo.index === gpuIdx) {
                    selectedGPUs.push(gpuInfo);
                }
            }
        }

        if (selectedGPUs.length !== gpus.length) {
            // At least one requested GPU is not available at the moment.
            return noAvailableGpu();
        }

        // Record one more assignment on every selected GPU.
        for (const gpuInfo of selectedGPUs) {
            const assignedCount = gpuSummary.assignedGpuIndexMap.get(gpuInfo.index) ?? 0;
            gpuSummary.assignedGpuIndexMap.set(gpuInfo.index, assignedCount + 1);
        }
        return {
            resultType: ScheduleResultType.SUCCEED,
            environment: selectedEnvironment,
            gpuIndices: selectedGPUs,
        };
    }
}