in pkg/shim/scheduler.go [154:197]
// recoverSchedulerState launches the scheduler state recovery in its own
// goroutine so the calling thread is never blocked. The outcome is reported
// through the dispatcher as a ShimSchedulerEvent: RecoverSchedulerSucceed when
// both recovery phases finish without error, RecoverSchedulerFailed as soon as
// either phase returns an error (the remaining phase is then skipped).
func (ss *KubernetesShim) recoverSchedulerState() {
	// abort logs the cause of a failed recovery phase and publishes the
	// failure event; both phases share this exact handling.
	abort := func(cause error) {
		log.Log(log.ShimScheduler).Error("scheduler recovery failed", zap.Error(cause))
		dispatcher.Dispatch(ShimSchedulerEvent{
			event: RecoverSchedulerFailed,
		})
	}

	recovery := func() {
		log.Log(log.ShimScheduler).Info("recovering scheduler states")

		// Phase 1: applications.
		// The pods that are already allocated are collected from the
		// api-server, the application each pod belongs to is identified,
		// and those applications are added back to the scheduler.
		appErr := ss.appManager.WaitForRecovery()
		if appErr != nil {
			abort(appErr)
			return
		}

		// Phase 2: allocations.
		// Existing allocations (allocated pods) are replayed to rebuild the
		// scheduler state; this is a replay, not an actual scheduling cycle.
		// Only app managers implementing interfaces.Recoverable take part.
		recoverables := []interfaces.Recoverable{}
		for _, mgr := range ss.appManager.GetAllManagers() {
			candidate, isRecoverable := mgr.(interfaces.Recoverable)
			if !isRecoverable {
				continue
			}
			recoverables = append(recoverables, candidate)
		}

		// NOTE(review): the 5 minute argument is presumably the maximum time
		// to wait for the replay to finish; confirm against WaitForRecovery.
		allocErr := ss.context.WaitForRecovery(recoverables, 5*time.Minute)
		if allocErr != nil {
			abort(allocErr)
			return
		}

		// Both phases completed: announce the successful recovery.
		log.Log(log.ShimScheduler).Info("scheduler recovery succeed")
		dispatcher.Dispatch(ShimSchedulerEvent{
			event: RecoverSchedulerSucceed,
		})
	}

	go recovery()
}