in ts/nni_manager/training_service/reusable/trialDispatcher.ts [398:625]
/**
 * Scheduling loop of the dispatcher. Keeps running until `this.stopping` becomes truthy.
 *
 * Every iteration performs these steps:
 *  1. Wait: call `delay(interval)` repeatedly until a budget of 1000 is used up, or leave
 *     early as soon as `this.shouldUpdateTrials` is set (the flag is cleared on exit).
 *  2. Collect every trial whose status is RUNNING, WAITING or UNKNOWN; if there is none,
 *     start the next iteration.
 *  3. Refresh RUNNING trials: a trial with at least one entry in `trial.nodes` is finished
 *     (FAILED if any node failed, otherwise SUCCEEDED) and its environment is released;
 *     a trial whose environment is no longer RUNNING takes over the environment status.
 *  4. Collect environments that are alive, RUNNING and runner-ready as candidates for the
 *     waiting trials.
 *  5. Assign waiting trials to candidates, either through `this.gpuScheduler` or first-come
 *     first-served, and work out how many additional environments are needed.
 *  6. Request the missing environments from the selected environment service(s).
 *
 * @returns a promise that resolves once the loop observes `this.stopping`.
 * @throws Error if an environment in use has no `environmentService`, or if the GPU
 *         scheduler returns an inconsistent/unknown result.
 * @throws NNIError (`RESOURCE_NOT_AVAILABLE`) if the GPU scheduler reports
 *         REQUIRE_EXCEED_TOTAL while reusable environments exist.
 */
private async trialManagementLoop(): Promise<void> {
    // Step passed to delay() and subtracted from the wait budget below.
    // NOTE(review): presumably milliseconds - confirm against delay()'s contract.
    const interval = 1;

    while (!this.stopping) {
        // Wait in small steps so that a raised shouldUpdateTrials flag is noticed quickly.
        let totalInterval = 1000;
        while (totalInterval > 0) {
            if (this.shouldUpdateTrials) {
                this.shouldUpdateTrials = false;
                break;
            }
            totalInterval -= interval;
            await delay(interval);
        }

        // Only trials that are not in a final state need attention.
        const toRefreshedTrials: TrialDetail[] = [];
        for (const trial of this.trials.values()) {
            if (trial.status === "RUNNING" || trial.status === "WAITING" || trial.status === "UNKNOWN") {
                toRefreshedTrials.push(trial);
            }
        }

        if (toRefreshedTrials.length === 0) {
            continue;
        }

        let waitingTrials: TrialDetail[] = [];
        // Number of trials that still need (or occupy) an environment after this pass.
        let liveTrialsCount = 0;
        for (const trial of toRefreshedTrials) {
            const currentStatus = trial.status;
            switch (currentStatus) {
                case "RUNNING":
                    {
                        const environment = trial.environment;

                        if (environment === undefined) {
                            this.log.error(`found running trial ${trial.id} has no environment, set trial to UNKNOWN.`);
                            trial.status = "UNKNOWN";
                            liveTrialsCount++;
                            // Continues the enclosing for-loop, not just the switch.
                            continue;
                        }

                        if (environment.environmentService === undefined) {
                            throw new Error(`${environment.id} does not has environment service!`);
                        }
                        trial.url = environment.trackingUrl;
                        const environmentStatus = environment.status;

                        // any node exit, then make sure the whole trial stopped.
                        if (trial.nodes.size > 0) {
                            const completedCount = trial.nodes.size;
                            let finalStatus: TrialJobStatus = "SUCCEEDED";
                            let lastTimestamp: number | undefined;
                            this.log.debug(`found ${completedCount} completed trial node(s), nodeCount: ${environment.nodeCount}`);

                            // if some trial processes doesn't exit, kill it for next one.
                            // for example, in horovod, it's just sleep command, has no impact on trial result.
                            if (environment.nodeCount > completedCount) {
                                this.log.info(`stop partial completed trial ${trial.id}`);
                                await environment.environmentService.getCommandChannel.sendCommand(environment, KILL_TRIAL_JOB, trial.id);
                            }

                            // One failed node fails the trial; the trial end time is the
                            // latest end time reported by any node.
                            for (const node of trial.nodes.values()) {
                                if (node.status === "FAILED") {
                                    finalStatus = "FAILED";
                                }
                                if (node.endTime !== undefined) {
                                    if (lastTimestamp === undefined) {
                                        lastTimestamp = node.endTime;
                                    } else {
                                        lastTimestamp = Math.max(node.endTime, lastTimestamp);
                                    }
                                }
                            }
                            trial.status = finalStatus;
                            // Record the end time only when a node reported one. The check
                            // used to be inverted, so the computed value was never stored.
                            if (lastTimestamp !== undefined) {
                                trial.endTime = lastTimestamp;
                            }
                            this.releaseEnvironment(trial);
                        } else if (environmentStatus !== "RUNNING") {
                            this.log.error(`found running trial ${trial.id} on '${environment.envId}' with '${environmentStatus}', set trial to environment status.`);
                            this.releaseEnvironment(trial);
                            trial.status = environmentStatus;
                        } else {
                            liveTrialsCount++;
                        }
                    }
                    break;
                case "WAITING":
                case "UNKNOWN":
                    // deal it later, if there is free environment.
                    waitingTrials.push(trial);
                    liveTrialsCount++;
                    break;
            }
        }

        let liveEnvironmentsCount = 0;
        const reusableEnvironments: EnvironmentInformation[] = [];
        for (const environment of this.environments.values()) {
            if (environment.isAlive === true) {
                liveEnvironmentsCount++;
                if (environment.status === "RUNNING" && environment.isRunnerReady) {
                    // if environment is not reusable and used, stop and not count as idle;
                    // An array-valued trainingService config is treated as reuse mode.
                    const reuseMode = Array.isArray(this.config.trainingService) || (this.config.trainingService as any).reuseMode;
                    if (
                        0 === environment.runningTrialCount &&
                        reuseMode === false &&
                        environment.assignedTrialCount > 0
                    ) {
                        if (environment.environmentService === undefined) {
                            throw new Error(`${environment.id} does not has environment service!`);
                        }
                        await environment.environmentService.stopEnvironment(environment);
                        continue;
                    }

                    // if gpu scheduler is not enabled, and there is running trial, skip it.
                    if (false === this.enableGpuScheduler && environment.runningTrialCount > 0) {
                        continue;
                    }

                    reusableEnvironments.push(environment);
                }
            }
        }

        let neededEnvironmentCount = 0;
        if (true === this.enableGpuScheduler) {
            let noGpuAvailable: boolean = false;
            while (waitingTrials.length > 0) {
                // skip following trials, if first trial doesn't find available GPU.
                if (true === noGpuAvailable) {
                    // break loop to try next time.
                    break;
                }
                const trial = waitingTrials.shift();
                if (undefined === trial) {
                    throw new Error(`TrialDispatcher: waiting trial shouldn't be undefined!`);
                }
                const defaultGpuNum = this.config.trialGpuNumber;
                const result = this.gpuScheduler.scheduleMachine(reusableEnvironments, trial.form.placementConstraint!, defaultGpuNum, trial);
                switch (result.resultType) {
                    case ScheduleResultType.REQUIRE_EXCEED_TOTAL:
                        {
                            if (liveEnvironmentsCount === 0) {
                                this.log.debug(`TrialDispatcher: no live environment, so request one.`);
                                neededEnvironmentCount = 1;
                                // Emptying the local queue ends the scheduling loop for this pass.
                                waitingTrials = [];
                                this.isLoggedNoGpuAvailable = false;
                            } else if (reusableEnvironments.length > 0) {
                                const errorMessage: string = `TrialDispatcher: REQUIRE_EXCEED_TOTAL Required GPU number ${defaultGpuNum} is too large, no machine can meet`;
                                this.log.error(errorMessage);
                                throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
                            } else {
                                // Live environments exist but none is reusable yet: log once and wait.
                                if (false === this.isLoggedNoGpuAvailable) {
                                    this.log.debug(`TrialDispatcher: wait GPU, live environment ${liveEnvironmentsCount}, no reusable, REQUIRE_EXCEED_TOTAL.`);
                                    this.isLoggedNoGpuAvailable = true;
                                }
                            }
                            break;
                        }
                    case ScheduleResultType.TMP_NO_AVAILABLE_GPU:
                        {
                            if (false === this.isLoggedNoGpuAvailable) {
                                this.log.debug(`TrialDispatcher: wait GPU, live environment ${liveEnvironmentsCount}, reusable ${reusableEnvironments.length}, TMP_NO_AVAILABLE_GPU.`);
                                this.isLoggedNoGpuAvailable = true;
                            }

                            // if some environment is alive, but not ready, no need to create more.
                            if (liveEnvironmentsCount <= reusableEnvironments.length) {
                                neededEnvironmentCount = 1;
                                this.isLoggedNoGpuAvailable = false;
                                this.log.info(`TrialDispatcher: ${liveEnvironmentsCount} live env, and ${reusableEnvironments.length} reusable, but no GPU available so request a new one.`);
                            }
                            noGpuAvailable = true;
                        }
                        break;
                    case ScheduleResultType.SUCCEED:
                        {
                            const environment = result.environment;
                            if (undefined === environment) {
                                throw new Error(`TrialDispatcher: scheduled env shouldn't be undefined!`);
                            }
                            trial.assignedGpus = result.gpuIndices;
                            await this.allocateEnvironment(trial, environment);
                            this.isLoggedNoGpuAvailable = false;
                        }
                        break;
                    default:
                        throw new Error(`TrialDispatcher: Unknown gpu schecduler type: ${result.resultType}`);
                }
            }
        } else {
            // Without GPU scheduling, pair waiting trials with idle environments in order.
            while (reusableEnvironments.length > 0 && waitingTrials.length > 0) {
                const trial = waitingTrials.shift();
                const idleEnvironment = reusableEnvironments.shift();
                if (trial !== undefined && idleEnvironment != undefined) {
                    await this.allocateEnvironment(trial, idleEnvironment);
                }
            }
            neededEnvironmentCount = liveTrialsCount - liveEnvironmentsCount;
        }

        if (neededEnvironmentCount > 0) {
            let requestedCount = 0;
            let hasMoreEnvironments = false;
            for (let index = 0; index < neededEnvironmentCount; index++) {
                const environmentService: EnvironmentService | undefined = this.selectEnvironmentService();
                if (environmentService !== undefined) {
                    hasMoreEnvironments = true;
                    await this.requestEnvironment(environmentService);
                    requestedCount++;
                    this.isLoggedNoMoreEnvironment = false;
                } else {
                    // Log the shortage only once until a request succeeds again.
                    if (this.isLoggedNoMoreEnvironment === false) {
                        this.isLoggedNoMoreEnvironment = true;
                        this.log.info(`no more environment so far, so skip to request environment.`);
                    }
                }
            }
            if (hasMoreEnvironments === true || requestedCount > 0) {
                this.log.info(`requested new environment, live trials: ${liveTrialsCount}, ` +
                    `live environments: ${liveEnvironmentsCount}, neededEnvironmentCount: ${neededEnvironmentCount}, ` +
                    `requestedCount: ${requestedCount}`);
            }
        }
    }
}