func()

in pkg/shim/scheduler.go [154:197]


// recoverSchedulerState starts scheduler state recovery on a background
// goroutine and returns immediately, so the calling thread is never blocked.
// The outcome is reported through the dispatcher as a ShimSchedulerEvent:
// RecoverSchedulerFailed when either recovery phase returns an error,
// RecoverSchedulerSucceed when both phases complete.
func (ss *KubernetesShim) recoverSchedulerState() {
	// upper bound handed to the context when waiting for allocation recovery
	const recoveryTimeout = 5 * time.Minute

	// reportFailure logs the cause and publishes the failure event; both
	// recovery phases share this handling.
	reportFailure := func(cause error) {
		log.Log(log.ShimScheduler).Error("scheduler recovery failed", zap.Error(cause))
		dispatcher.Dispatch(ShimSchedulerEvent{event: RecoverSchedulerFailed})
	}

	runRecovery := func() {
		log.Log(log.ShimScheduler).Info("recovering scheduler states")

		// phase 1: applications
		// wait for the app manager to finish recovering applications; per the
		// original notes this rebuilds the applications from the pods that are
		// already allocated on the api-server.
		appErr := ss.appManager.WaitForRecovery()
		if appErr != nil {
			reportFailure(appErr)
			return
		}

		// phase 2: existing allocations
		// only managers implementing interfaces.Recoverable take part; the
		// context replays their existing allocations to restore scheduler
		// state (a replay, not an actual scheduling pass).
		recoverables := []interfaces.Recoverable{}
		for _, mgr := range ss.appManager.GetAllManagers() {
			candidate, isRecoverable := mgr.(interfaces.Recoverable)
			if !isRecoverable {
				continue
			}
			recoverables = append(recoverables, candidate)
		}
		allocErr := ss.context.WaitForRecovery(recoverables, recoveryTimeout)
		if allocErr != nil {
			reportFailure(allocErr)
			return
		}

		// both phases done, announce success
		log.Log(log.ShimScheduler).Info("scheduler recovery succeed")
		dispatcher.Dispatch(ShimSchedulerEvent{event: RecoverSchedulerSucceed})
	}

	// run asynchronously: do not block the main thread
	go runRecovery()
}