service/sagemaker/api_op_DescribeTrainingJob.go (325 lines of code) (raw):
// Code generated by smithy-go-codegen DO NOT EDIT.
package sagemaker
import (
"context"
"errors"
"fmt"
awsmiddleware "github.com/aws/aws-sdk-go-v2/aws/middleware"
"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
smithy "github.com/aws/smithy-go"
"github.com/aws/smithy-go/middleware"
smithytime "github.com/aws/smithy-go/time"
smithyhttp "github.com/aws/smithy-go/transport/http"
smithywaiter "github.com/aws/smithy-go/waiter"
"time"
)
// DescribeTrainingJob returns information about a training job.
//
// Some of the attributes in the output only appear if the training job
// successfully starts. If the training job fails, TrainingJobStatus is Failed
// and, depending on the FailureReason, attributes like TrainingStartTime,
// TrainingTimeInSeconds, TrainingEndTime, and BillableTimeInSeconds may not be
// present in the response.
//
// A nil params is replaced by an empty DescribeTrainingJobInput before the
// operation is invoked. On success the operation metadata is attached to the
// returned output's ResultMetadata field.
func (c *Client) DescribeTrainingJob(ctx context.Context, params *DescribeTrainingJobInput, optFns ...func(*Options)) (*DescribeTrainingJobOutput, error) {
	input := params
	if input == nil {
		input = &DescribeTrainingJobInput{}
	}

	raw, md, err := c.invokeOperation(ctx, "DescribeTrainingJob", input, optFns, c.addOperationDescribeTrainingJobMiddlewares)
	if err != nil {
		return nil, err
	}

	output := raw.(*DescribeTrainingJobOutput)
	output.ResultMetadata = md
	return output, nil
}
// DescribeTrainingJobInput is the input accepted by the DescribeTrainingJob
// operation.
type DescribeTrainingJobInput struct {
// The name of the training job.
//
// This member is required.
TrainingJobName *string
noSmithyDocumentSerde
}
// DescribeTrainingJobOutput is the result returned by the DescribeTrainingJob
// operation.
type DescribeTrainingJobOutput struct {
// Information about the algorithm used for training, and algorithm metadata.
//
// This member is required.
AlgorithmSpecification *types.AlgorithmSpecification
// A timestamp that indicates when the training job was created.
//
// This member is required.
CreationTime *time.Time
// Information about the Amazon S3 location that is configured for storing model
// artifacts.
//
// This member is required.
ModelArtifacts *types.ModelArtifacts
// Resources, including ML compute instances and ML storage volumes, that are
// configured for model training.
//
// This member is required.
ResourceConfig *types.ResourceConfig
// Provides detailed information about the state of the training job. For
// detailed information on the secondary status of the training job, see
// StatusMessage under [SecondaryStatusTransition].
//
// SageMaker provides primary statuses and secondary statuses that apply to each
// of them:
//
// InProgress
// - Starting - Starting the training job.
//
// - Downloading - An optional stage for algorithms that support File training
// input mode. It indicates that data is being downloaded to the ML storage
// volumes.
//
// - Training - Training is in progress.
//
// - Interrupted - The job stopped because the managed spot training instances
// were interrupted.
//
// - Uploading - Training is complete and the model artifacts are being uploaded
// to the S3 location.
//
// Completed
// - Completed - The training job has completed.
//
// Failed
// - Failed - The training job has failed. The reason for the failure is returned
// in the FailureReason field of DescribeTrainingJobResponse.
//
// Stopped
// - MaxRuntimeExceeded - The job stopped because it exceeded the maximum allowed
// runtime.
//
// - MaxWaitTimeExceeded - The job stopped because it exceeded the maximum
// allowed wait time.
//
// - Stopped - The training job has stopped.
//
// Stopping
// - Stopping - Stopping the training job.
//
// Valid values for SecondaryStatus are subject to change.
//
// We no longer support the following secondary statuses:
//
// - LaunchingMLInstances
//
// - PreparingTraining
//
// - DownloadingTrainingImage
//
// [SecondaryStatusTransition]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_SecondaryStatusTransition.html
//
// This member is required.
SecondaryStatus types.SecondaryStatus
// Specifies a limit to how long a model training job can run. It also specifies
// how long a managed Spot training job has to complete. When the job reaches the
// time limit, SageMaker ends the training job. Use this API to cap model training
// costs.
//
// To stop a job, SageMaker sends the algorithm the SIGTERM signal, which delays
// job termination for 120 seconds. Algorithms can use this 120-second window to
// save the model artifacts, so the results of training are not lost.
//
// This member is required.
StoppingCondition *types.StoppingCondition
// The Amazon Resource Name (ARN) of the training job.
//
// This member is required.
TrainingJobArn *string
// Name of the model training job.
//
// This member is required.
TrainingJobName *string
// The status of the training job.
//
// SageMaker provides the following training job statuses:
//
// - InProgress - The training is in progress.
//
// - Completed - The training job has completed.
//
// - Failed - The training job has failed. To see the reason for the failure, see
// the FailureReason field in the response to a DescribeTrainingJob call.
//
// - Stopping - The training job is stopping.
//
// - Stopped - The training job has stopped.
//
// For more detailed information, see SecondaryStatus.
//
// This member is required.
TrainingJobStatus types.TrainingJobStatus
// The Amazon Resource Name (ARN) of an AutoML job.
AutoMLJobArn *string
// The billable time in seconds. Billable time refers to the absolute wall-clock
// time.
//
// Multiply BillableTimeInSeconds by the number of instances (InstanceCount) in
// your training cluster to get the total compute time SageMaker bills you if you
// run distributed training. The formula is as follows: BillableTimeInSeconds *
// InstanceCount.
//
// You can calculate the savings from using managed spot training using the
// formula (1 - BillableTimeInSeconds / TrainingTimeInSeconds) * 100. For example,
// if BillableTimeInSeconds is 100 and TrainingTimeInSeconds is 500, the savings
// is 80%.
BillableTimeInSeconds *int32
// Contains information about the output location for managed spot training
// checkpoint data.
CheckpointConfig *types.CheckpointConfig
// Configuration information for the Amazon SageMaker Debugger hook parameters,
// metric and tensor collections, and storage paths. To learn more about how to
// configure the DebugHookConfig parameter, see [Use the SageMaker and Debugger Configuration API Operations to Create, Update, and Debug Your Training Job].
//
// [Use the SageMaker and Debugger Configuration API Operations to Create, Update, and Debug Your Training Job]: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html
DebugHookConfig *types.DebugHookConfig
// Configuration information for Amazon SageMaker Debugger rules for debugging
// output tensors.
DebugRuleConfigurations []types.DebugRuleConfiguration
// Evaluation status of Amazon SageMaker Debugger rules for debugging on a
// training job.
DebugRuleEvaluationStatuses []types.DebugRuleEvaluationStatus
// To encrypt all communications between ML compute instances in distributed
// training, choose True. Encryption provides greater security for distributed
// training, but training might take longer. How long it takes depends on the
// amount of communication between compute instances, especially if you use a
// deep learning algorithm in distributed training.
EnableInterContainerTrafficEncryption *bool
// A Boolean indicating whether managed spot training is enabled (True) or not
// (False).
EnableManagedSpotTraining *bool
// If you want to allow inbound or outbound network calls, except for calls
// between peers within a training cluster for distributed training, choose True.
// If you enable network isolation for training jobs that are configured to use a
// VPC, SageMaker downloads and uploads customer data and model artifacts through
// the specified VPC, but the training container does not have network access.
EnableNetworkIsolation *bool
// The environment variables to set in the Docker container.
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any environment fields. As part of the shared
// responsibility model, you are responsible for any potential exposure,
// unauthorized access, or compromise of your sensitive data if caused by
// security-sensitive information included in the request environment variable or
// plain text fields.
Environment map[string]string
// Associates a SageMaker job as a trial component with an experiment and trial.
// Specified when you call the following APIs:
//
// [CreateProcessingJob]
//
// [CreateTrainingJob]
//
// [CreateTransformJob]
//
// [CreateTransformJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTransformJob.html
// [CreateTrainingJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html
// [CreateProcessingJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateProcessingJob.html
ExperimentConfig *types.ExperimentConfig
// If the training job failed, the reason it failed.
FailureReason *string
// A collection of MetricData objects that specify the names, values, and dates
// and times that the training algorithm emitted to Amazon CloudWatch.
FinalMetricDataList []types.MetricData
// Algorithm-specific parameters.
HyperParameters map[string]string
// Contains information about the infrastructure health check configuration for
// the training job.
InfraCheckConfig *types.InfraCheckConfig
// An array of Channel objects that describes each data input channel.
InputDataConfig []types.Channel
// The Amazon Resource Name (ARN) of the SageMaker Ground Truth labeling job that
// created the transform or training job.
LabelingJobArn *string
// A timestamp that indicates when the status of the training job was last
// modified.
LastModifiedTime *time.Time
// The S3 path where model artifacts that you configured when creating the job are
// stored. SageMaker creates subfolders for model artifacts.
OutputDataConfig *types.OutputDataConfig
// Configuration information for Amazon SageMaker Debugger system monitoring,
// framework profiling, and storage paths.
ProfilerConfig *types.ProfilerConfig
// Configuration information for Amazon SageMaker Debugger rules for profiling
// system and framework metrics.
ProfilerRuleConfigurations []types.ProfilerRuleConfiguration
// Evaluation status of Amazon SageMaker Debugger rules for profiling on a
// training job.
ProfilerRuleEvaluationStatuses []types.ProfilerRuleEvaluationStatus
// Profiling status of a training job.
ProfilingStatus types.ProfilingStatus
// Configuration for remote debugging. To learn more about the remote debugging
// functionality of SageMaker, see [Access a training container through Amazon Web Services Systems Manager (SSM) for remote debugging].
//
// [Access a training container through Amazon Web Services Systems Manager (SSM) for remote debugging]: https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-debugging.html
RemoteDebugConfig *types.RemoteDebugConfig
// The number of times to retry the job when the job fails due to an
// InternalServerError.
RetryStrategy *types.RetryStrategy
// The Amazon Web Services Identity and Access Management (IAM) role configured
// for the training job.
RoleArn *string
// A history of all of the secondary statuses that the training job has
// transitioned through.
SecondaryStatusTransitions []types.SecondaryStatusTransition
// Configuration of storage locations for the Amazon SageMaker Debugger
// TensorBoard output data.
TensorBoardOutputConfig *types.TensorBoardOutputConfig
// Indicates the time when the training job ends on training instances. You are
// billed for the time interval between the value of TrainingStartTime and this
// time. For successful jobs and stopped jobs, this is the time after model
// artifacts are uploaded. For failed jobs, this is the time when SageMaker detects
// a job failure.
TrainingEndTime *time.Time
// Indicates the time when the training job starts on training instances. You are
// billed for the time interval between this time and the value of
// TrainingEndTime. The start time in CloudWatch Logs might be later than this
// time. The difference is due to the time it takes to download the training data
// and to the size of the training container.
TrainingStartTime *time.Time
// The training time in seconds.
TrainingTimeInSeconds *int32
// The Amazon Resource Name (ARN) of the associated hyperparameter tuning job if
// the training job was launched by a hyperparameter tuning job.
TuningJobArn *string
// A [VpcConfig] object that specifies the VPC that this training job has access to. For more
// information, see [Protect Training Jobs by Using an Amazon Virtual Private Cloud].
//
// [VpcConfig]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_VpcConfig.html
// [Protect Training Jobs by Using an Amazon Virtual Private Cloud]: https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html
VpcConfig *types.VpcConfig
// The status of the warm pool associated with the training job.
WarmPoolStatus *types.WarmPoolStatus
// Metadata pertaining to the operation's result.
ResultMetadata middleware.Metadata
noSmithyDocumentSerde
}
// addOperationDescribeTrainingJobMiddlewares registers, in a fixed order, every
// middleware used by the DescribeTrainingJob operation on the given stack.
//
// It stops at the first registration that fails and returns that error. The
// error from the protocol finalizer step is wrapped with additional context
// using %w, so callers can still inspect the underlying cause with errors.Is
// or errors.As. The order of the registrations below is significant and must
// not be changed.
func (c *Client) addOperationDescribeTrainingJobMiddlewares(stack *middleware.Stack, options Options) (err error) {
	// Serialization and deserialization steps for this operation.
	if err = stack.Serialize.Add(&setOperationInputMiddleware{}, middleware.After); err != nil {
		return err
	}
	if err = stack.Serialize.Add(&awsAwsjson11_serializeOpDescribeTrainingJob{}, middleware.After); err != nil {
		return err
	}
	if err = stack.Deserialize.Add(&awsAwsjson11_deserializeOpDescribeTrainingJob{}, middleware.After); err != nil {
		return err
	}
	if err = addProtocolFinalizerMiddlewares(stack, options, "DescribeTrainingJob"); err != nil {
		// Wrap (rather than flatten) so the cause stays in the error chain.
		return fmt.Errorf("add protocol finalizers: %w", err)
	}

	// Client-wide middlewares shared by the service's operations.
	if err = addlegacyEndpointContextSetter(stack, options); err != nil {
		return err
	}
	if err = addSetLoggerMiddleware(stack, options); err != nil {
		return err
	}
	if err = addClientRequestID(stack); err != nil {
		return err
	}
	if err = addComputeContentLength(stack); err != nil {
		return err
	}
	if err = addResolveEndpointMiddleware(stack, options); err != nil {
		return err
	}
	if err = addComputePayloadSHA256(stack); err != nil {
		return err
	}
	if err = addRetry(stack, options); err != nil {
		return err
	}
	if err = addRawResponseToMetadata(stack); err != nil {
		return err
	}
	if err = addRecordResponseTiming(stack); err != nil {
		return err
	}
	if err = addSpanRetryLoop(stack, options); err != nil {
		return err
	}
	if err = addClientUserAgent(stack, options); err != nil {
		return err
	}
	if err = smithyhttp.AddErrorCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = smithyhttp.AddCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = addSetLegacyContextSigningOptionsMiddleware(stack); err != nil {
		return err
	}
	if err = addTimeOffsetBuild(stack, c); err != nil {
		return err
	}
	if err = addUserAgentRetryMode(stack, options); err != nil {
		return err
	}
	if err = addCredentialSource(stack, options); err != nil {
		return err
	}

	// Operation-specific input validation and service metadata.
	if err = addOpDescribeTrainingJobValidationMiddleware(stack); err != nil {
		return err
	}
	if err = stack.Initialize.Add(newServiceMetadataMiddleware_opDescribeTrainingJob(options.Region), middleware.Before); err != nil {
		return err
	}

	if err = addRecursionDetection(stack); err != nil {
		return err
	}
	if err = addRequestIDRetrieverMiddleware(stack); err != nil {
		return err
	}
	if err = addResponseErrorMiddleware(stack); err != nil {
		return err
	}
	if err = addRequestResponseLogging(stack, options); err != nil {
		return err
	}
	if err = addDisableHTTPSMiddleware(stack, options); err != nil {
		return err
	}

	// Tracing span boundaries.
	if err = addSpanInitializeStart(stack); err != nil {
		return err
	}
	if err = addSpanInitializeEnd(stack); err != nil {
		return err
	}
	if err = addSpanBuildRequestStart(stack); err != nil {
		return err
	}
	if err = addSpanBuildRequestEnd(stack); err != nil {
		return err
	}
	return nil
}
// TrainingJobCompletedOrStoppedWaiterOptions are the options that configure a
// TrainingJobCompletedOrStoppedWaiter.
type TrainingJobCompletedOrStoppedWaiterOptions struct {
// Set of options to modify how an operation is invoked. These apply to all
// operations invoked for this client. Use functional options on operation call to
// modify this list for per operation behavior.
//
// Passing options here is functionally equivalent to passing values to this
// config's ClientOptions field that extend the inner client's APIOptions directly.
APIOptions []func(*middleware.Stack) error
// Functional options to be passed to all operations invoked by this client.
//
// Function values that modify the inner APIOptions are applied after the waiter
// config's own APIOptions modifiers.
ClientOptions []func(*Options)
// MinDelay is the minimum amount of time to delay between retries.
// NewTrainingJobCompletedOrStoppedWaiter sets it to a default of 120 seconds.
// Note that MinDelay must resolve to a value less than or equal to MaxDelay.
MinDelay time.Duration
// MaxDelay is the maximum amount of time to delay between retries. If unset or
// set to zero (or a negative value), TrainingJobCompletedOrStoppedWaiter will use
// the default max delay of 120 seconds. Note that MaxDelay must resolve to a
// value greater than or equal to MinDelay.
MaxDelay time.Duration
// LogWaitAttempts is used to enable logging for waiter retry attempts.
LogWaitAttempts bool
// Retryable is a function that can be used to override the service-defined
// waiter behavior based on the operation output or the returned error. This
// function is used by the waiter to decide if a state is retryable or a terminal
// state.
//
// By default service-modeled logic will populate this option. This option can
// thus be used to define a custom waiter state with fall-back to service-modeled
// waiter state mutators. The function returns an error in case of a failure
// state. In case of a retry state, this function returns a bool value of true and
// a nil error, while in case of success it returns a bool value of false and a
// nil error.
Retryable func(context.Context, *DescribeTrainingJobInput, *DescribeTrainingJobOutput, error) (bool, error)
}
// TrainingJobCompletedOrStoppedWaiter defines the waiter for
// TrainingJobCompletedOrStopped. It repeatedly calls DescribeTrainingJob through
// its client until the configured Retryable function reports a terminal state.
type TrainingJobCompletedOrStoppedWaiter struct {
// client is used to issue the DescribeTrainingJob calls.
client DescribeTrainingJobAPIClient
// options holds the waiter defaults; per-call option functions are applied to a
// copy of this value.
options TrainingJobCompletedOrStoppedWaiterOptions
}
// NewTrainingJobCompletedOrStoppedWaiter constructs a
// TrainingJobCompletedOrStoppedWaiter that polls through client.
//
// The waiter starts with a minimum and maximum delay of 120 seconds and the
// service-modeled retry logic; each function in optFns is then applied, in
// order, and may override any of those defaults.
func NewTrainingJobCompletedOrStoppedWaiter(client DescribeTrainingJobAPIClient, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) *TrainingJobCompletedOrStoppedWaiter {
	opts := TrainingJobCompletedOrStoppedWaiterOptions{
		MinDelay:  120 * time.Second,
		MaxDelay:  120 * time.Second,
		Retryable: trainingJobCompletedOrStoppedStateRetryable,
	}
	for i := range optFns {
		optFns[i](&opts)
	}

	waiter := new(TrainingJobCompletedOrStoppedWaiter)
	waiter.client = client
	waiter.options = opts
	return waiter
}
// Wait runs the TrainingJobCompletedOrStopped waiter and discards the operation
// output, reporting only whether waiting succeeded. The maxWaitDur is the
// maximum duration the waiter will wait; it is required and must be greater
// than zero.
func (w *TrainingJobCompletedOrStoppedWaiter) Wait(ctx context.Context, params *DescribeTrainingJobInput, maxWaitDur time.Duration, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) error {
	if _, err := w.WaitForOutput(ctx, params, maxWaitDur, optFns...); err != nil {
		return err
	}
	return nil
}
// WaitForOutput runs the TrainingJobCompletedOrStopped waiter and returns the
// output of the operation call that reached the success state. The maxWaitDur
// is the maximum duration the waiter will wait; it is required and must be
// greater than zero.
//
// Per-call optFns are applied to a copy of the waiter's options. An error is
// returned when the option values are inconsistent, when the Retryable function
// reports a failure state, when the context is cancelled while sleeping between
// attempts, or when the time budget runs out before a terminal state is seen.
func (w *TrainingJobCompletedOrStoppedWaiter) WaitForOutput(ctx context.Context, params *DescribeTrainingJobInput, maxWaitDur time.Duration, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) (*DescribeTrainingJobOutput, error) {
	if maxWaitDur <= 0 {
		return nil, fmt.Errorf("maximum wait time for waiter must be greater than zero")
	}

	// Work on a copy so per-call overrides never leak into the waiter itself.
	options := w.options
	for _, apply := range optFns {
		apply(&options)
	}

	if options.MaxDelay <= 0 {
		options.MaxDelay = 120 * time.Second
	}
	if options.MinDelay > options.MaxDelay {
		return nil, fmt.Errorf("minimum waiter delay %v must be lesser than or equal to maximum waiter delay of %v.", options.MinDelay, options.MaxDelay)
	}

	ctx, cancel := context.WithTimeout(ctx, maxWaitDur)
	defer cancel()

	// The logger is shared across attempts; only its Attempt field changes.
	logger := smithywaiter.Logger{}
	budget := maxWaitDur

	for attempt := int64(1); ; attempt++ {
		stackOpts := options.APIOptions
		began := time.Now()

		if options.LogWaitAttempts {
			logger.Attempt = attempt
			// Copy before appending so the caller's APIOptions slice is untouched.
			stackOpts = append([]func(*middleware.Stack) error{}, options.APIOptions...)
			stackOpts = append(stackOpts, logger.AddLogger)
		}

		out, callErr := w.client.DescribeTrainingJob(ctx, params, func(o *Options) {
			o.APIOptions = append(o.APIOptions, stackOpts...)
			addIsWaiterUserAgent(o)
			for _, clientOpt := range options.ClientOptions {
				clientOpt(o)
			}
		})

		again, err := options.Retryable(ctx, params, out, callErr)
		if err != nil {
			return nil, err
		}
		if !again {
			return out, nil
		}

		// Charge the time spent on this attempt against the remaining budget and
		// give up when there is not enough left for another minimum delay.
		budget -= time.Since(began)
		if budget <= 0 || budget < options.MinDelay {
			return nil, fmt.Errorf("exceeded max wait time for TrainingJobCompletedOrStopped waiter")
		}

		// compute exponential backoff between waiter retries
		delay, err := smithywaiter.ComputeDelay(attempt, options.MinDelay, options.MaxDelay, budget)
		if err != nil {
			return nil, fmt.Errorf("error computing waiter delay, %w", err)
		}
		budget -= delay

		// sleep for the delay amount before invoking the next request
		if err := smithytime.SleepWithContext(ctx, delay); err != nil {
			return nil, fmt.Errorf("request cancelled while waiting, %w", err)
		}
	}
}
// trainingJobCompletedOrStoppedStateRetryable is the service-modeled state
// function for the TrainingJobCompletedOrStopped waiter.
//
// It returns (false, nil) when the job status is Completed or Stopped, an error
// when the status is Failed or the call failed (a ValidationException is
// reported as a failure-state transition; any other error is returned as is, or
// wrapped if it is not a smithy.APIError), and (true, nil) for every other
// status so that the waiter keeps polling.
func trainingJobCompletedOrStoppedStateRetryable(ctx context.Context, input *DescribeTrainingJobInput, output *DescribeTrainingJobOutput, err error) (bool, error) {
	if err != nil {
		var apiErr smithy.APIError
		if !errors.As(err, &apiErr) {
			return false, fmt.Errorf("expected err to be of type smithy.APIError, got %w", err)
		}
		if apiErr.ErrorCode() == "ValidationException" {
			return false, fmt.Errorf("waiter state transitioned to Failure")
		}
		return false, err
	}

	switch string(output.TrainingJobStatus) {
	case "Completed", "Stopped":
		// Terminal success states.
		return false, nil
	case "Failed":
		return false, fmt.Errorf("waiter state transitioned to Failure")
	}

	// Any other status: keep waiting.
	return true, nil
}
// DescribeTrainingJobAPIClient is a client that implements the
// DescribeTrainingJob operation. The waiter depends on this interface rather
// than on *Client, so any implementation of the operation can be supplied.
type DescribeTrainingJobAPIClient interface {
DescribeTrainingJob(context.Context, *DescribeTrainingJobInput, ...func(*Options)) (*DescribeTrainingJobOutput, error)
}
// Compile-time check that *Client satisfies DescribeTrainingJobAPIClient.
var _ DescribeTrainingJobAPIClient = (*Client)(nil)
// newServiceMetadataMiddleware_opDescribeTrainingJob builds the
// RegisterServiceMetadata middleware for the DescribeTrainingJob operation,
// carrying the given region, the package's ServiceID and the operation name.
func newServiceMetadataMiddleware_opDescribeTrainingJob(region string) *awsmiddleware.RegisterServiceMetadata {
	md := new(awsmiddleware.RegisterServiceMetadata)
	md.Region = region
	md.ServiceID = ServiceID
	md.OperationName = "DescribeTrainingJob"
	return md
}