in controllers/hyperparametertuningjob/hyperparametertuningjob_controller.go [139:223]
// reconcileTuningJob runs one reconciliation pass for a hyperparameter tuning
// job resource and returns the first error encountered (nil on success).
//
// The pass proceeds in this order:
//  1. If the resource has no status yet, set the initializing status.
//  2. Initialize the request context via initializeContext.
//  3. Add the SageMaker finalizer unless the resource is marked for deletion.
//  4. Describe the job in SageMaker. A nil description with a nil error is
//     treated as "the job does not exist in SageMaker": if deletion was
//     requested the finalizer is simply removed, otherwise the job is created
//     and described again.
//  5. Spawn missing training jobs and record the best training job in the
//     status.
//  6. Act on the SageMaker job status: an in-progress job that is marked for
//     deletion is asked to stop; a stopped/failed/completed job that is marked
//     for deletion is cleaned up and its finalizer removed; a stopping job is
//     left alone; any other status is reported as an error.
//  7. Write the SageMaker status and failure reason back to the resource.
//
// Every point at which the description or its status could be nil is guarded
// so that a missing job or missing status yields an error (or the deletion
// path) instead of a nil-pointer panic.
func (r *Reconciler) reconcileTuningJob(ctx reconcileRequestContext) error {
	var err error

	if ctx.TuningJob.Status.HyperParameterTuningJobStatus == "" {
		if err = r.updateStatus(ctx, controllers.InitializingJobStatus); err != nil {
			return err
		}
	}

	if err = r.initializeContext(&ctx); err != nil {
		return r.updateStatusAndReturnError(ctx, string(sagemaker.HyperParameterTuningJobStatusFailed), errors.Wrap(err, "Unable to initialize operator"))
	}

	// Add finalizer if it's not marked for deletion.
	if !controllers.HasDeletionTimestamp(ctx.TuningJob.ObjectMeta) {
		if !controllers.ContainsString(ctx.TuningJob.ObjectMeta.GetFinalizers(), controllers.SageMakerResourceFinalizerName) {
			ctx.TuningJob.ObjectMeta.Finalizers = append(ctx.TuningJob.ObjectMeta.Finalizers, controllers.SageMakerResourceFinalizerName)
			if err := r.Update(ctx, ctx.TuningJob); err != nil {
				return errors.Wrap(err, "Failed to add finalizer")
			}
			ctx.Log.Info("Finalizer added")
		}
	}

	// Get the HyperParameterTuningJob from SageMaker
	if ctx.TuningJobDescription, err = ctx.SageMakerClient.DescribeHyperParameterTuningJob(ctx, ctx.TuningJobName); err != nil {
		return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to describe SageMaker hyperparameter tuning job"))
	}

	// The resource does not exist within SageMaker yet.
	if ctx.TuningJobDescription == nil {
		if controllers.HasDeletionTimestamp(ctx.TuningJob.ObjectMeta) {
			// Don't attempt to clean up resources as none should exist yet
			return r.removeFinalizer(ctx)
		}

		if err = r.createHyperParameterTuningJob(ctx); err != nil {
			return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to create hyperparameter tuning job"))
		}

		if ctx.TuningJobDescription, err = ctx.SageMakerClient.DescribeHyperParameterTuningJob(ctx, ctx.TuningJobName); err != nil {
			return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to describe SageMaker hyperparameter tuning job"))
		}

		// The job may still not be visible right after creation. Report an
		// error rather than dereferencing a nil description below.
		if ctx.TuningJobDescription == nil {
			return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, fmt.Errorf("Unable to describe SageMaker hyperparameter tuning job after creating it"))
		}
	}

	// Spawn training jobs regardless of the status
	ctx.HPOTrainingJobSpawner.SpawnMissingTrainingJobs(ctx, *ctx.TuningJob)

	if err = r.addBestTrainingJobToStatus(ctx); err != nil {
		return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to add best training job to status"))
	}

	// The status is a pointer; guard it before the switch dereferences it.
	if ctx.TuningJobDescription.HyperParameterTuningJobStatus == nil {
		return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, fmt.Errorf("Unable to determine hyperparameter tuning job status"))
	}

	switch *ctx.TuningJobDescription.HyperParameterTuningJobStatus {
	case sagemaker.HyperParameterTuningJobStatusInProgress:
		if controllers.HasDeletionTimestamp(ctx.TuningJob.ObjectMeta) {
			// Request to stop the job. If SageMaker returns a 404 then the job has already been deleted.
			if _, err := ctx.SageMakerClient.StopHyperParameterTuningJob(ctx, ctx.TuningJobName); err != nil && !clientwrapper.IsStopHyperParameterTuningJob404Error(err) {
				return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to delete hyperparameter tuning job"))
			}

			// Describe the new state of the job
			if ctx.TuningJobDescription, err = ctx.SageMakerClient.DescribeHyperParameterTuningJob(ctx, ctx.TuningJobName); err != nil {
				return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, errors.Wrap(err, "Unable to describe SageMaker hyperparameter tuning job"))
			}

			// The job no longer exists in SageMaker, so there is nothing left
			// to stop. Finish the deletion the same way as for a job that has
			// reached a terminal state.
			if ctx.TuningJobDescription == nil {
				return r.cleanupAndRemoveFinalizer(ctx)
			}

			// The refreshed description is dereferenced after the switch.
			if ctx.TuningJobDescription.HyperParameterTuningJobStatus == nil {
				return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, fmt.Errorf("Unable to determine hyperparameter tuning job status"))
			}
		}

	case sagemaker.HyperParameterTuningJobStatusStopped, sagemaker.HyperParameterTuningJobStatusFailed, sagemaker.HyperParameterTuningJobStatusCompleted:
		if controllers.HasDeletionTimestamp(ctx.TuningJob.ObjectMeta) {
			return r.cleanupAndRemoveFinalizer(ctx)
		}

	case sagemaker.HyperParameterTuningJobStatusStopping:
		// Nothing to do while the job is stopping; fall through to the
		// status update below.

	default:
		return r.updateStatusAndReturnError(ctx, ReconcilingTuningJobStatus, fmt.Errorf("Unknown Tuning Job Status: %s", *ctx.TuningJobDescription.HyperParameterTuningJobStatus))
	}

	// Mirror the (possibly refreshed) SageMaker status and failure reason
	// onto the resource.
	status := *ctx.TuningJobDescription.HyperParameterTuningJobStatus
	additional := controllers.GetOrDefault(ctx.TuningJobDescription.FailureReason, "")

	return r.updateStatusWithAdditional(ctx, status, additional)
}