// pkg/jobmgr/goalstate/update_run.go
// Copyright (c) 2019 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package goalstate

import (
	"context"
	"time"

	pbjob "github.com/uber/peloton/.gen/peloton/api/v0/job"
	"github.com/uber/peloton/.gen/peloton/api/v0/peloton"
	pbtask "github.com/uber/peloton/.gen/peloton/api/v0/task"
	pbupdate "github.com/uber/peloton/.gen/peloton/api/v0/update"
	"github.com/uber/peloton/.gen/peloton/private/models"

	"github.com/uber/peloton/pkg/common/goalstate"
	"github.com/uber/peloton/pkg/common/taskconfig"
	"github.com/uber/peloton/pkg/common/util"
	"github.com/uber/peloton/pkg/jobmgr/cached"
	jobmgrcommon "github.com/uber/peloton/pkg/jobmgr/common"
	"github.com/uber/peloton/pkg/jobmgr/task"

	log "github.com/sirupsen/logrus"
	"go.uber.org/yarpc/yarpcerrors"
)

// UpdateRun checks which instances have completed their update,
// starts the next set of instances to update, and writes the new
// state of the job update to the cache and DB.
func UpdateRun(ctx context.Context, entity goalstate.Entity) error {
updateEnt := entity.(*updateEntity)
goalStateDriver := updateEnt.driver
cachedWorkflow, cachedJob, err := fetchWorkflowAndJobFromCache(
ctx, updateEnt.jobID, updateEnt.id, goalStateDriver)
if err != nil || cachedWorkflow == nil || cachedJob == nil {
log.WithFields(log.Fields{
"update_id": updateEnt.id.GetValue(),
}).WithError(err).Info("unable to run update")
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
	// TODO: remove this check once recovery is done when reading the state
if cachedWorkflow.GetState().State == pbupdate.State_INVALID {
return UpdateReload(ctx, entity)
}
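	// figure out which of the instances being processed have finished
	// or failed since the last run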
instancesCurrent, instancesDoneFromLastRun, instancesFailedFromLastRun, err :=
cached.GetUpdateProgress(
ctx,
cachedJob.ID(),
cachedWorkflow,
cachedWorkflow.GetGoalState().JobVersion,
cachedWorkflow.GetInstancesCurrent(),
goalStateDriver.taskStore,
)
if err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
instancesFailed := append(
cachedWorkflow.GetInstancesFailed(),
instancesFailedFromLastRun...)
instancesDone := append(
cachedWorkflow.GetInstancesDone(),
instancesDoneFromLastRun...)
	// if MaxFailureInstances is set and the number of failed instances
	// in the workflow has reached that limit, process the failed update
	// and return directly
	// TODO: use job SLA if GetMaxFailureInstances is not set
if cachedWorkflow.GetUpdateConfig().GetMaxFailureInstances() != 0 &&
uint32(len(instancesFailed)) >=
cachedWorkflow.GetUpdateConfig().GetMaxFailureInstances() {
err := processFailedUpdate(
ctx,
cachedJob,
cachedWorkflow,
instancesDone,
instancesFailed,
instancesCurrent,
goalStateDriver,
)
if err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
}
return err
}
instancesToAdd, instancesToUpdate, instancesToRemove :=
getInstancesForUpdateRun(
ctx,
cachedJob,
cachedWorkflow,
instancesCurrent,
instancesDone,
instancesFailed,
)
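	// verify the add/update/remove buckets against the actual task
	// runtimes: instances to add may already exist (and then need an
	// update instead), and instances to remove may already be gone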
instancesToAdd, instancesToUpdate, instancesToRemove, instancesRemovedDone, err :=
confirmInstancesStatus(
ctx,
cachedJob,
cachedWorkflow,
instancesToAdd,
instancesToUpdate,
instancesToRemove,
)
if err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
instancesDone = append(instancesDone, instancesRemovedDone...)
if err := processUpdate(
ctx,
cachedJob,
cachedWorkflow,
instancesToAdd,
instancesToUpdate,
instancesToRemove,
goalStateDriver,
); err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
if err := writeUpdateProgress(
ctx,
cachedJob,
cachedWorkflow,
cachedWorkflow.GetState().State,
instancesDone,
instancesFailed,
instancesCurrent,
instancesToAdd,
instancesToUpdate,
instancesToRemove,
); err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
if err := postUpdateAction(
ctx,
cachedJob,
cachedWorkflow,
instancesToAdd,
instancesToUpdate,
instancesToRemove,
instancesDone,
instancesFailed,
goalStateDriver); err != nil {
goalStateDriver.mtx.updateMetrics.UpdateRunFail.Inc(1)
return err
}
// TODO (varung):
// - Use len for instances current
// - Remove instances_added, instances_removed and instances_updated
log.WithFields(log.Fields{
"update_id": updateEnt.id.GetValue(),
"job_id": cachedJob.ID().GetValue(),
"update_type": cachedWorkflow.GetWorkflowType().String(),
"instances_current": cachedWorkflow.GetInstancesCurrent(),
"instances_failed": len(cachedWorkflow.GetInstancesFailed()),
"instances_done": len(cachedWorkflow.GetInstancesDone()),
"instances_added": len(cachedWorkflow.GetInstancesAdded()),
"instances_removed": len(cachedWorkflow.GetInstancesRemoved()),
"instances_updated": len(cachedWorkflow.GetInstancesUpdated()),
}).Info("update running")
goalStateDriver.mtx.updateMetrics.UpdateRun.Inc(1)
return nil
}

// processFailedUpdate is called when the update fails because too
// many instances failed during the process. It moves the update to
// the FAILED state (or rolls it back if RollbackOnFailure is set) and
// re-enqueues the update into the goal state engine.
func processFailedUpdate(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
instancesDone []uint32,
instancesFailed []uint32,
instancesCurrent []uint32,
driver *driver,
) error {
// rollback the update if RollbackOnFailure is set and
// the update itself is not a rollback
if cachedUpdate.GetUpdateConfig().RollbackOnFailure &&
!isUpdateRollback(cachedUpdate) {
		// write the progress first, because once the rollback starts,
		// the workflow would not otherwise know about the newly
		// finished/failed instances.
		if err := cachedJob.WriteWorkflowProgress(
			ctx,
			cachedUpdate.ID(),
			cachedUpdate.GetState().State,
			instancesDone,
			instancesFailed,
			instancesCurrent,
		); err != nil {
			return err
		}
if err := cachedJob.RollbackWorkflow(ctx); err != nil {
log.WithFields(log.Fields{
"update_id": cachedUpdate.ID().GetValue(),
"job_id": cachedJob.ID().GetValue(),
}).WithError(err).
Info("fail to rollback update")
return err
}
cachedConfig, err := cachedJob.GetConfig(ctx)
if err != nil {
log.WithFields(log.Fields{
"update_id": cachedUpdate.ID().GetValue(),
"job_id": cachedJob.ID().GetValue(),
}).WithError(err).
Info("fail to get job config to rollback update")
return err
}
if err := handleUnchangedInstancesInUpdate(
ctx,
cachedUpdate,
cachedJob,
cachedConfig,
); err != nil {
log.WithFields(log.Fields{
"update_id": cachedUpdate.ID().GetValue(),
"job_id": cachedJob.ID().GetValue(),
}).WithError(err).
Info("fail to update unchanged instances to rollback update")
return err
}
log.WithFields(log.Fields{
"update_id": cachedUpdate.ID().GetValue(),
"job_id": cachedJob.ID().GetValue(),
}).Info("update rolling back")
} else {
if err := cachedJob.WriteWorkflowProgress(
ctx,
cachedUpdate.ID(),
pbupdate.State_FAILED,
instancesDone,
instancesFailed,
instancesCurrent,
); err != nil {
return err
}
}
driver.EnqueueUpdate(cachedJob.ID(), cachedUpdate.ID(), time.Now())
return nil
}

// isUpdateRollback returns whether an update is rolling back to a
// previous job version.
func isUpdateRollback(cachedUpdate cached.Update) bool {
if cachedUpdate.GetWorkflowType() != models.WorkflowType_UPDATE {
return false
}
return cachedUpdate.GetState().State == pbupdate.State_ROLLING_BACKWARD
}

// postUpdateAction performs actions after one run of UpdateRun
// finishes. Its jobs:
// 1. Re-enqueue the update if all instances have finished.
// 2. Re-enqueue the update if any task updated/removed in this run
// has already been updated/killed.
func postUpdateAction(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
instancesAddedInCurrentRun []uint32,
instancesUpdatedInCurrentRun []uint32,
instancesRemovedInCurrentRun []uint32,
instancesDone []uint32,
instancesFailed []uint32,
goalStateDriver Driver,
) error {
// update finishes, reenqueue the update
if len(cachedUpdate.GetGoalState().Instances) == len(instancesDone)+len(instancesFailed) {
goalStateDriver.EnqueueUpdate(
cachedJob.ID(),
cachedUpdate.ID(),
time.Now())
return nil
}
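	// combine all instances processed in this run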
instancesInCurrentRun :=
append(instancesAddedInCurrentRun,
append(instancesUpdatedInCurrentRun, instancesRemovedInCurrentRun...)...)
	// if any of the tasks updated/removed in this run is a killed task
	// or has already finished its update/kill, re-enqueue the update,
	// because more instances can be processed without waiting for a
	// task event.
for _, instanceID := range instancesInCurrentRun {
cachedTask := cachedJob.GetTask(instanceID)
if cachedTask == nil {
continue
}
runtime, err := cachedTask.GetRuntime(ctx)
if err != nil {
return err
}
		// directly begin the next round, because some tasks have
		// already completed their update and more instances can be
		// processed without waiting.
if isTaskUpdateCompleted(cachedUpdate, runtime) ||
isTaskTerminated(runtime) {
goalStateDriver.EnqueueUpdate(
cachedJob.ID(), cachedUpdate.ID(), time.Now())
return nil
}
}
return nil
}

// isTaskUpdateCompleted returns whether a task has already finished
// its update. This covers the special case in which UpdateRun is
// retried multiple times and the tasks updated in a previous run have
// already completed; no more task events will be received for them,
// so JobMgr needs to handle this case separately.
func isTaskUpdateCompleted(cachedUpdate cached.Update, runtime *pbtask.RuntimeInfo) bool {
return runtime.GetState() == pbtask.TaskState_RUNNING &&
runtime.GetConfigVersion() == runtime.GetDesiredConfigVersion() &&
runtime.GetConfigVersion() == cachedUpdate.GetGoalState().JobVersion
}

// isTaskTerminated returns whether a task is terminated and would
// not be started again.
func isTaskTerminated(runtime *pbtask.RuntimeInfo) bool {
return util.IsPelotonStateTerminal(runtime.GetState()) &&
util.IsPelotonStateTerminal(runtime.GetGoalState())
}
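
// writeUpdateProgress persists the new update state together with the
// instances that are done, failed, and currently being processed; the
// instances added/updated/removed in this run are appended to the
// current set.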
func writeUpdateProgress(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
updateState pbupdate.State,
instancesDone []uint32,
instancesFailed []uint32,
previousInstancesCurrent []uint32,
instancesAdded []uint32,
instancesUpdated []uint32,
instancesRemoved []uint32,
) error {
newInstancesCurrent := append(previousInstancesCurrent, instancesAdded...)
newInstancesCurrent = append(newInstancesCurrent, instancesUpdated...)
newInstancesCurrent = append(newInstancesCurrent, instancesRemoved...)
// update the state of the job update
return cachedJob.WriteWorkflowProgress(
ctx,
cachedUpdate.ID(),
updateState,
instancesDone,
instancesFailed,
newInstancesCurrent,
)
}
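
// processUpdate fetches the job config at the update's goal version
// and then adds, updates, and removes the instances chosen for this
// run.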
func processUpdate(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
instancesToAdd []uint32,
instancesToUpdate []uint32,
instancesToRemove []uint32,
goalStateDriver *driver) error {
	// no action needed if there are no instances to add/update/remove
if len(instancesToUpdate)+len(instancesToAdd)+len(instancesToRemove) == 0 {
return nil
}
jobConfig, _, err := goalStateDriver.jobConfigOps.Get(
ctx,
cachedJob.ID(),
cachedUpdate.GetGoalState().JobVersion)
if err != nil {
return err
}
err = addInstancesInUpdate(
ctx,
cachedJob,
instancesToAdd,
jobConfig,
goalStateDriver)
if err != nil {
return err
}
err = processInstancesInUpdate(
ctx,
cachedJob,
cachedUpdate,
instancesToUpdate,
jobConfig,
goalStateDriver,
)
if err != nil {
return err
}
err = removeInstancesInUpdate(
ctx,
cachedJob,
instancesToRemove,
jobConfig,
goalStateDriver,
)
return err
}

// addInstancesInUpdate adds the instances specified in instancesToAdd
// to cachedJob. It creates the new task runtimes and sends the tasks
// to resmgr. If the job goal state is KILLED, the new tasks are
// created directly in the KILLED state to avoid unnecessary task
// creation.
func addInstancesInUpdate(
ctx context.Context,
cachedJob cached.Job,
instancesToAdd []uint32,
jobConfig *pbjob.JobConfig,
goalStateDriver *driver) error {
var tasks []*pbtask.TaskInfo
runtimes := make(map[uint32]*pbtask.RuntimeInfo)
if len(instancesToAdd) == 0 {
return nil
}
	// fetch the job runtime to check the job goal state below
jobRuntime, err := cachedJob.GetRuntime(ctx)
if err != nil {
return err
}
	// now let's add the new instances
for _, instID := range instancesToAdd {
runtime, err := getTaskRuntimeIfExisted(ctx, cachedJob, instID)
if err != nil {
return err
}
if runtime != nil {
if runtime.GetState() == pbtask.TaskState_INITIALIZED {
// runtime is initialized, do not create the task again and directly
// send to ResMgr
taskInfo := &pbtask.TaskInfo{
JobId: cachedJob.ID(),
InstanceId: instID,
Runtime: runtime,
Config: taskconfig.Merge(
jobConfig.GetDefaultConfig(),
jobConfig.GetInstanceConfig()[instID]),
}
tasks = append(tasks, taskInfo)
			} else {
				log.WithFields(log.Fields{
					"job_id":      cachedJob.ID().GetValue(),
					"instance_id": instID,
					"state":       runtime.GetState().String(),
				}).Info(
					"task to add in update already has a runtime in non-INITIALIZED state")
			}
} else {
// runtime is nil, initialize the runtime
runtime := task.CreateInitializingTask(
cachedJob.ID(), instID, jobConfig)
if err = updateWithRecentRunID(
ctx,
cachedJob.ID(),
instID,
runtime,
goalStateDriver); err != nil {
return err
}
runtime.ConfigVersion = jobConfig.GetChangeLog().GetVersion()
runtime.DesiredConfigVersion =
jobConfig.GetChangeLog().GetVersion()
// job goal state is KILLED, set task cur and desired state to KILLED to
// avoid unnecessary task creation
if jobRuntime.GetGoalState() == pbjob.JobState_KILLED {
runtime.State = pbtask.TaskState_KILLED
runtime.GoalState = pbtask.TaskState_KILLED
}
// do not send to resmgr if task goal state is KILLED
if runtime.GetGoalState() != pbtask.TaskState_KILLED {
taskInfo := &pbtask.TaskInfo{
JobId: cachedJob.ID(),
InstanceId: instID,
Runtime: runtime,
Config: taskconfig.Merge(
jobConfig.GetDefaultConfig(),
jobConfig.GetInstanceConfig()[instID]),
}
tasks = append(tasks, taskInfo)
}
runtimes[instID] = runtime
}
}
// Create the tasks
if len(runtimes) > 0 {
if err := cachedJob.CreateTaskRuntimes(ctx, runtimes, "peloton"); err != nil {
return err
}
}
// send to resource manager
return sendTasksToResMgr(
ctx, cachedJob.ID(), tasks, jobConfig, goalStateDriver)
}

// getTaskRuntimeIfExisted returns the task runtime if the task has
// been created. It returns a nil RuntimeInfo and a nil error if the
// task runtime does not exist.
func getTaskRuntimeIfExisted(
ctx context.Context,
cachedJob cached.Job,
instanceID uint32,
) (*pbtask.RuntimeInfo, error) {
cachedTask := cachedJob.GetTask(instanceID)
if cachedTask == nil {
return nil, nil
}
runtime, err := cachedTask.GetRuntime(ctx)
if yarpcerrors.IsNotFound(err) {
return nil, nil
}
if err != nil {
return nil, err
}
return runtime, nil
}

// processInstancesInUpdate updates the existing instances in
// instancesToUpdate by patching their runtimes and enqueueing them
// into the task goal state engine.
func processInstancesInUpdate(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
instancesToUpdate []uint32,
jobConfig *pbjob.JobConfig,
goalStateDriver *driver) error {
if len(instancesToUpdate) == 0 {
return nil
}
runtimes := make(map[uint32]jobmgrcommon.RuntimeDiff)
for _, instID := range instancesToUpdate {
runtimeDiff := cachedUpdate.GetRuntimeDiff(jobConfig)
if runtimeDiff != nil {
cachedTask, err := cachedJob.AddTask(ctx, instID)
if err != nil {
return err
}
runtime, err := cachedTask.GetRuntime(ctx)
if err != nil {
return err
}
if cachedUpdate.GetUpdateConfig().GetInPlace() {
runtimeDiff[jobmgrcommon.DesiredHostField] = getDesiredHostField(runtime)
} else {
runtimeDiff[jobmgrcommon.DesiredHostField] = ""
}
if runtime.GetGoalState() == pbtask.TaskState_DELETED ||
cachedUpdate.GetUpdateConfig().GetStartTasks() {
runtimeDiff[jobmgrcommon.GoalStateField] = pbtask.TaskState_RUNNING
}
runtimes[instID] = runtimeDiff
}
}
if len(runtimes) > 0 {
		// we do not need to handle `instancesToBeRetried` here: since
		// all instances in `instancesToUpdate` are enqueued into the
		// Task goal state engine, their runtimes will be reloaded into
		// the cache when they are evaluated, and the update will be
		// retried in the next update cycle.
if _, _, err := cachedJob.PatchTasks(ctx, runtimes, false); err != nil {
return err
}
}
for _, instID := range instancesToUpdate {
goalStateDriver.EnqueueTask(cachedJob.ID(), instID, time.Now())
}
return nil
}
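
// getDesiredHostField returns the value to set as the desired host of
// a task being updated in place.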
func getDesiredHostField(runtime *pbtask.RuntimeInfo) string {
	// The desired host field is reset when the task runs again. If the
	// field is not reset when the task is being updated, it means
	// either the task is in LAUNCHED/STARTING state, or the previous
	// update overwrote it when the task was killed. In either case,
	// just reuse the previous desired host field.
if len(runtime.GetDesiredHost()) != 0 {
return runtime.GetDesiredHost()
}
	// The host field is set when the task is launched and reset when
	// the task is killed. For all the states in between, the task may
	// be running on the host already, so set the current host as the
	// desired host.
if !util.IsPelotonStateTerminal(runtime.GetState()) {
return runtime.GetHost()
}
return ""
}

// removeInstancesInUpdate kills the instances being removed in the update.
func removeInstancesInUpdate(
ctx context.Context,
cachedJob cached.Job,
instancesToRemove []uint32,
jobConfig *pbjob.JobConfig,
goalStateDriver *driver) error {
if len(instancesToRemove) == 0 {
return nil
}
runtimes := make(map[uint32]jobmgrcommon.RuntimeDiff)
for _, instID := range instancesToRemove {
runtimes[instID] = jobmgrcommon.RuntimeDiff{
jobmgrcommon.GoalStateField: pbtask.TaskState_DELETED,
jobmgrcommon.DesiredConfigVersionField: jobConfig.GetChangeLog().GetVersion(),
jobmgrcommon.MessageField: "Task Count reduced via API",
jobmgrcommon.TerminationStatusField: &pbtask.TerminationStatus{
Reason: pbtask.TerminationStatus_TERMINATION_STATUS_REASON_KILLED_FOR_UPDATE,
},
jobmgrcommon.FailureCountField: uint32(0),
}
}
if len(runtimes) > 0 {
		// we do not need to handle `instancesToBeRetried` here: since
		// the instances are enqueued into the Task goal state engine,
		// their runtimes will be reloaded into the cache when they are
		// evaluated, and the update will be retried in the next update
		// cycle.
if _, _, err := cachedJob.PatchTasks(ctx, runtimes, false); err != nil {
return err
}
}
for _, instID := range instancesToRemove {
goalStateDriver.EnqueueTask(cachedJob.ID(), instID, time.Now())
}
return nil
}
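
// confirmInstancesStatus verifies the status of the instances to be
// added, updated, and removed against the task runtimes, and corrects
// the buckets accordingly: an instance to add that already exists at
// an older config version is moved to the update bucket, and an
// instance to remove that no longer exists is marked done.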
func confirmInstancesStatus(
ctx context.Context,
cachedJob cached.Job,
cachedUpdate cached.Update,
instancesToAdd []uint32,
instancesToUpdate []uint32,
instancesToRemove []uint32,
) (
newInstancesToAdd []uint32,
newInstancesToUpdate []uint32,
newInstancesToRemove []uint32,
instancesDone []uint32,
err error,
) {
for _, instID := range instancesToAdd {
var cachedTask cached.Task
var runtime *pbtask.RuntimeInfo
cachedTask, err = cachedJob.AddTask(ctx, instID)
if err == nil {
runtime, err = cachedTask.GetRuntime(ctx)
if err != nil {
if yarpcerrors.IsNotFound(err) {
					// runtime does not exist, let's try to add it
newInstancesToAdd = append(newInstancesToAdd, instID)
continue
}
// got some error, just retry later
return
}
// instance already exists
if runtime.GetConfigVersion() == cachedUpdate.GetGoalState().JobVersion {
// instance exists with correct configuration version
newInstancesToAdd = append(newInstancesToAdd, instID)
} else {
// instance exists with previous configuration version,
// hence needs to be updated
newInstancesToUpdate = append(newInstancesToUpdate, instID)
}
continue
}
if yarpcerrors.IsNotFound(err) ||
err == cached.InstanceIDExceedsInstanceCountError {
// instance does not exist
newInstancesToAdd = append(newInstancesToAdd, instID)
continue
}
// got some error, just retry later
return
}
for _, instID := range instancesToUpdate {
var cachedTask cached.Task
cachedTask, err = cachedJob.AddTask(ctx, instID)
if err != nil {
if yarpcerrors.IsNotFound(err) {
// not found, add it
newInstancesToAdd = append(newInstancesToAdd, instID)
continue
}
// got some error, just retry later
return
}
_, err = cachedTask.GetRuntime(ctx)
if err != nil {
if yarpcerrors.IsNotFound(err) {
// not found, add it
newInstancesToAdd = append(newInstancesToAdd, instID)
continue
}
// got some error, just retry later
return
}
newInstancesToUpdate = append(newInstancesToUpdate, instID)
}
for _, instID := range instancesToRemove {
_, err = cachedJob.AddTask(ctx, instID)
if err != nil {
if yarpcerrors.IsNotFound(err) ||
err == cached.InstanceIDExceedsInstanceCountError {
// not found, already removed
instancesDone = append(instancesDone, instID)
continue
}
return
}
// remove it
newInstancesToRemove = append(newInstancesToRemove, instID)
}
// clear the error and return
err = nil
return
}

// getInstancesForUpdateRun returns the instances to add, update, and
// remove in the given run of UpdateRun, honoring the update batch
// size.
func getInstancesForUpdateRun(
ctx context.Context,
cachedJob cached.Job,
update cached.Update,
instancesCurrent []uint32,
instancesDone []uint32,
instancesFailed []uint32,
) (
instancesToAdd []uint32,
instancesToUpdate []uint32,
instancesToRemove []uint32,
) {
unprocessedInstancesToAdd,
unprocessedInstancesToUpdate,
unprocessedInstancesToRemove := getUnprocessedInstances(
update,
instancesCurrent,
instancesDone,
instancesFailed,
)
if len(unprocessedInstancesToUpdate) != 0 {
unprocessedInstancesToUpdate = sortInstancesByAvailability(
ctx,
cachedJob,
unprocessedInstancesToUpdate,
)
}
// if batch size is 0 or updateConfig is nil, update all of the instances
if update.GetUpdateConfig().GetBatchSize() == 0 {
return unprocessedInstancesToAdd,
unprocessedInstancesToUpdate,
unprocessedInstancesToRemove
}
maxNumOfInstancesToProcess :=
int(update.GetUpdateConfig().GetBatchSize()) - len(instancesCurrent)
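	// e.g. with a batch size of 10 and 4 instances still in progress,
	// at most 6 new instances may be started in this run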
	// if the number of instances already being processed meets or
	// exceeds the batch size, do not start anything new
if maxNumOfInstancesToProcess <= 0 {
return nil, nil, nil
}
// if can process all of the remaining instances
if maxNumOfInstancesToProcess >
len(unprocessedInstancesToAdd)+len(unprocessedInstancesToUpdate)+
len(unprocessedInstancesToRemove) {
return unprocessedInstancesToAdd,
unprocessedInstancesToUpdate,
unprocessedInstancesToRemove
}
// if can process all of the instances to add, update
// and part of instances to remove
if maxNumOfInstancesToProcess >
len(unprocessedInstancesToAdd)+len(unprocessedInstancesToUpdate) {
return unprocessedInstancesToAdd,
unprocessedInstancesToUpdate,
unprocessedInstancesToRemove[:maxNumOfInstancesToProcess-
len(unprocessedInstancesToAdd)-
len(unprocessedInstancesToUpdate)]
}
// if can process all of the instances to add,
// and part of instances to update
if maxNumOfInstancesToProcess > len(unprocessedInstancesToAdd) {
return unprocessedInstancesToAdd,
unprocessedInstancesToUpdate[:maxNumOfInstancesToProcess-len(unprocessedInstancesToAdd)],
nil
}
// if can process part of the instances to add
return unprocessedInstancesToAdd[:maxNumOfInstancesToProcess], nil, nil
}

// sortInstancesByAvailability sorts the instances of the job by their
// availability. The sort order is
// 1. unavailable instances
// 2. killed instances
// 3. invalid instances
// 4. available instances
// This is needed because unhealthy instances should be updated before
// healthy ones, keeping the number of unavailable instances to a
// minimum and giving the update workflow the best chance to make
// progress.
func sortInstancesByAvailability(
ctx context.Context,
cachedJob cached.Job,
instances []uint32,
) []uint32 {
instancesByAvailability := make(map[jobmgrcommon.InstanceAvailability_Type][]uint32)
instanceAvailabilityByInstance := cachedJob.GetInstanceAvailabilityType(ctx, instances...)
for _, i := range instances {
availabilityType := instanceAvailabilityByInstance[i]
instancesByAvailability[availabilityType] = append(
instancesByAvailability[availabilityType],
i,
)
}
var sortedInstances []uint32
sortedInstances = append(
sortedInstances,
instancesByAvailability[jobmgrcommon.InstanceAvailability_UNAVAILABLE]...,
)
sortedInstances = append(
sortedInstances,
instancesByAvailability[jobmgrcommon.InstanceAvailability_KILLED]...,
)
sortedInstances = append(
sortedInstances,
instancesByAvailability[jobmgrcommon.InstanceAvailability_INVALID]...,
)
sortedInstances = append(
sortedInstances,
instancesByAvailability[jobmgrcommon.InstanceAvailability_AVAILABLE]...,
)
return sortedInstances
}

// getUnprocessedInstances returns all of the instances remaining to
// add, update, and remove.
func getUnprocessedInstances(
update cached.Update,
instancesCurrent []uint32,
instancesDone []uint32,
instancesFailed []uint32,
) (instancesRemainToAdd []uint32,
instancesRemainToUpdate []uint32,
instancesRemainToRemove []uint32) {
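	// an instance counts as processed if it is currently being
	// processed, done, or failed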
instancesProcessed := append(instancesCurrent, instancesDone...)
instancesProcessed = append(instancesProcessed, instancesFailed...)
instancesRemainToAdd = util.SubtractSlice(update.GetInstancesAdded(), instancesProcessed)
instancesRemainToUpdate = util.SubtractSlice(update.GetInstancesUpdated(), instancesProcessed)
instancesRemainToRemove = util.SubtractSlice(update.GetInstancesRemoved(), instancesProcessed)
return
}

// updateWithRecentRunID syncs the runID from persistent storage for a
// previously removed instance that is being added back.
//
// 1. It fetches the most recent pod event to get the last runID.
// 2. If a runID exists for the instance, the runtime is set to the
// next runID rather than starting over, which would overwrite the
// previous pod events in storage.
// 3. Continuing from the most recent runID lets users fetch sandbox
// logs and state transitions for previous runs of the instance.
func updateWithRecentRunID(
ctx context.Context,
jobID *peloton.JobID,
instanceID uint32,
runtime *pbtask.RuntimeInfo,
goalStateDriver *driver) error {
podEvents, err := goalStateDriver.podEventsOps.GetAll(
ctx,
jobID.GetValue(),
instanceID)
if err != nil {
return err
}
// instance removed previously during update is being added back.
if len(podEvents) > 0 {
runID, err := util.ParseRunID(podEvents[0].GetTaskId().GetValue())
if err != nil {
return err
}
runtime.MesosTaskId = util.CreateMesosTaskID(
jobID,
instanceID,
runID+1)
runtime.DesiredMesosTaskId = runtime.MesosTaskId
runtime.PrevMesosTaskId = podEvents[0].GetTaskId()
}
return nil
}