service/sagemaker/api_op_CreateTrainingJob.go (162 lines of code) (raw):
// Code generated by smithy-go-codegen DO NOT EDIT.
package sagemaker
import (
"context"
"fmt"
awsmiddleware "github.com/aws/aws-sdk-go-v2/aws/middleware"
"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
"github.com/aws/smithy-go/middleware"
smithyhttp "github.com/aws/smithy-go/transport/http"
)
// Starts a model training job. After training completes, SageMaker saves the
// resulting model artifacts to an Amazon S3 location that you specify.
//
// If you choose to host your model using SageMaker hosting services, you can use
// the resulting model artifacts as part of the model. You can also use the
// artifacts in a machine learning service other than SageMaker, provided that you
// know how to use them for inference.
//
// In the request body, you provide the following:
//
// - AlgorithmSpecification - Identifies the training algorithm to use.
//
// - HyperParameters - Specify these algorithm-specific parameters to enable the
// estimation of model parameters during training. Hyperparameters can be tuned to
// optimize this learning process. For a list of hyperparameters for each training
// algorithm provided by SageMaker, see [Algorithms].
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any hyperparameter fields. As part of the shared
// responsibility model, you are responsible for any potential exposure,
// unauthorized access, or compromise of your sensitive data if caused by
// security-sensitive information included in the request hyperparameter variable
// or plain text fields.
//
// - InputDataConfig - Describes the input required by the training job and the
// Amazon S3, EFS, or FSx location where it is stored.
//
// - OutputDataConfig - Identifies the Amazon S3 bucket where you want SageMaker
// to save the results of model training.
//
// - ResourceConfig - Identifies the resources, ML compute instances, and ML
// storage volumes to deploy for model training. In distributed training, you
// specify more than one instance.
//
// - EnableManagedSpotTraining - Optimize the cost of training machine learning
// models by up to 80% by using Amazon EC2 Spot instances. For more information,
// see [Managed Spot Training].
//
// - RoleArn - The Amazon Resource Name (ARN) that SageMaker assumes to perform
// tasks on your behalf during model training.
//
// You must grant this role the necessary permissions so that SageMaker can
// successfully complete model training.
//
// - StoppingCondition - To help cap training costs, use MaxRuntimeInSeconds to
// set a time limit for training. Use MaxWaitTimeInSeconds to specify how long a
// managed spot training job has to complete.
//
// - Environment - The environment variables to set in the Docker container.
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any environment fields. As part of the shared
// responsibility model, you are responsible for any potential exposure,
// unauthorized access, or compromise of your sensitive data if caused by
// security-sensitive information included in the request environment variable or
// plain text fields.
//
// - RetryStrategy - The number of times to retry the job when the job fails due
// to an InternalServerError .
//
// For more information about SageMaker, see [How It Works].
//
// [Algorithms]: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
// [How It Works]: https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works.html
//
// [Managed Spot Training]: https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html
func (c *Client) CreateTrainingJob(ctx context.Context, params *CreateTrainingJobInput, optFns ...func(*Options)) (*CreateTrainingJobOutput, error) {
	// Substitute an empty input for a nil params so invokeOperation is always
	// handed a non-nil *CreateTrainingJobInput.
	input := params
	if input == nil {
		input = &CreateTrainingJobInput{}
	}

	raw, md, err := c.invokeOperation(ctx, "CreateTrainingJob", input, optFns, c.addOperationCreateTrainingJobMiddlewares)
	if err != nil {
		return nil, err
	}

	// Attach the operation metadata to the typed output before handing it back.
	output := raw.(*CreateTrainingJobOutput)
	output.ResultMetadata = md
	return output, nil
}
// CreateTrainingJobInput holds the request parameters accepted by
// Client.CreateTrainingJob. Fields whose comment ends with "This member is
// required." must be set by the caller; all other fields are optional.
type CreateTrainingJobInput struct {
// The registry path of the Docker image that contains the training algorithm and
// algorithm-specific metadata, including the input mode. For more information
// about algorithms provided by SageMaker, see [Algorithms]. For information about providing
// your own algorithms, see [Using Your Own Algorithms with Amazon SageMaker].
//
// [Algorithms]: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
// [Using Your Own Algorithms with Amazon SageMaker]: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms.html
//
// This member is required.
AlgorithmSpecification *types.AlgorithmSpecification
// Specifies the path to the S3 location where you want to store model artifacts.
// SageMaker creates subfolders for the artifacts.
//
// This member is required.
OutputDataConfig *types.OutputDataConfig
// The resources, including the ML compute instances and ML storage volumes, to
// use for model training.
//
// ML storage volumes store model artifacts and incremental states. Training
// algorithms might also use ML storage volumes for scratch space. If you want
// SageMaker to use the ML storage volume to store the training data, choose File
// as the TrainingInputMode in the algorithm specification. For distributed
// training algorithms, specify an instance count greater than 1.
//
// This member is required.
ResourceConfig *types.ResourceConfig
// The Amazon Resource Name (ARN) of an IAM role that SageMaker can assume to
// perform tasks on your behalf.
//
// During model training, SageMaker needs your permission to read input data from
// an S3 bucket, download a Docker image that contains training code, write model
// artifacts to an S3 bucket, write logs to Amazon CloudWatch Logs, and publish
// metrics to Amazon CloudWatch. You grant permissions for all of these tasks to an
// IAM role. For more information, see [SageMaker Roles].
//
// To be able to pass this role to SageMaker, the caller of this API must have the
// iam:PassRole permission.
//
// [SageMaker Roles]: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html
//
// This member is required.
RoleArn *string
// Specifies a limit to how long a model training job can run. It also specifies
// how long a managed Spot training job has to complete. When the job reaches the
// time limit, SageMaker ends the training job. Use this API to cap model training
// costs.
//
// To stop a job, SageMaker sends the algorithm the SIGTERM signal, which delays
// job termination for 120 seconds. Algorithms can use this 120-second window to
// save the model artifacts, so the results of training are not lost.
//
// This member is required.
StoppingCondition *types.StoppingCondition
// The name of the training job. The name must be unique within an Amazon Web
// Services Region in an Amazon Web Services account.
//
// This member is required.
TrainingJobName *string
// Contains information about the output location for managed spot training
// checkpoint data.
CheckpointConfig *types.CheckpointConfig
// Configuration information for the Amazon SageMaker Debugger hook parameters,
// metric and tensor collections, and storage paths. To learn more about how to
// configure the DebugHookConfig parameter, see [Use the SageMaker and Debugger Configuration API Operations to Create, Update, and Debug Your Training Job].
//
// [Use the SageMaker and Debugger Configuration API Operations to Create, Update, and Debug Your Training Job]: https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html
DebugHookConfig *types.DebugHookConfig
// Configuration information for Amazon SageMaker Debugger rules for debugging
// output tensors.
DebugRuleConfigurations []types.DebugRuleConfiguration
// To encrypt all communications between ML compute instances in distributed
// training, choose True . Encryption provides greater security for distributed
// training, but training might take longer. How long it takes depends on the
// amount of communication between compute instances, especially if you use a deep
// learning algorithm in distributed training. For more information, see [Protect Communications Between ML Compute Instances in a Distributed Training Job].
//
// [Protect Communications Between ML Compute Instances in a Distributed Training Job]: https://docs.aws.amazon.com/sagemaker/latest/dg/train-encrypt.html
EnableInterContainerTrafficEncryption *bool
// To train models using managed spot training, choose True . Managed spot training
// provides a fully managed and scalable infrastructure for training machine
// learning models. This option is useful when training jobs can be interrupted and
// when there is flexibility when the training job is run.
//
// The complete and intermediate results of jobs are stored in an Amazon S3
// bucket, and can be used as a starting point to train models incrementally.
// Amazon SageMaker provides metrics and logs in CloudWatch. They can be used to
// see when managed spot training jobs are running, interrupted, resumed, or
// completed.
EnableManagedSpotTraining *bool
// Isolates the training container. No inbound or outbound network calls can be
// made, except for calls between peers within a training cluster for distributed
// training. If you enable network isolation for training jobs that are configured
// to use a VPC, SageMaker downloads and uploads customer data and model artifacts
// through the specified VPC, but the training container does not have network
// access.
EnableNetworkIsolation *bool
// The environment variables to set in the Docker container.
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any environment fields. As part of the shared
// responsibility model, you are responsible for any potential exposure,
// unauthorized access, or compromise of your sensitive data if caused by
// security-sensitive information included in the request environment variable or
// plain text fields.
Environment map[string]string
// Associates a SageMaker job as a trial component with an experiment and trial.
// Specified when you call the following APIs:
//
// [CreateProcessingJob]
//
// [CreateTrainingJob]
//
// [CreateTransformJob]
//
// [CreateTransformJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTransformJob.html
// [CreateTrainingJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html
// [CreateProcessingJob]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateProcessingJob.html
ExperimentConfig *types.ExperimentConfig
// Algorithm-specific parameters that influence the quality of the model. You set
// hyperparameters before you start the learning process. For a list of
// hyperparameters for each training algorithm provided by SageMaker, see [Algorithms].
//
// You can specify a maximum of 100 hyperparameters. Each hyperparameter is a
// key-value pair. Each key and value is limited to 256 characters, as specified by
// the Length Constraint .
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any hyperparameter fields. As part of the shared
// responsibility model, you are responsible for any potential exposure,
// unauthorized access, or compromise of your sensitive data if caused by any
// security-sensitive information included in the request hyperparameter variable
// or plain text fields.
//
// [Algorithms]: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
HyperParameters map[string]string
// Contains information about the infrastructure health check configuration for
// the training job.
InfraCheckConfig *types.InfraCheckConfig
// An array of Channel objects. Each channel is a named input source.
// InputDataConfig describes the input data and its location.
//
// Algorithms can accept input data from one or more channels. For example, an
// algorithm might have two channels of input data, training_data and
// validation_data . The configuration for each channel provides the S3, EFS, or
// FSx location where the input data is stored. It also provides information about
// the stored data: the MIME type, compression method, and whether the data is
// wrapped in RecordIO format.
//
// Depending on the input mode that the algorithm supports, SageMaker either
// copies input data files from an S3 bucket to a local directory in the Docker
// container, or makes it available as input streams. For example, if you specify
// an EFS location, input data files are available as input streams. They do not
// need to be downloaded.
//
// Your input must be in the same Amazon Web Services region as your training job.
InputDataConfig []types.Channel
// Configuration information for Amazon SageMaker Debugger system monitoring,
// framework profiling, and storage paths.
ProfilerConfig *types.ProfilerConfig
// Configuration information for Amazon SageMaker Debugger rules for profiling
// system and framework metrics.
ProfilerRuleConfigurations []types.ProfilerRuleConfiguration
// Configuration for remote debugging. To learn more about the remote debugging
// functionality of SageMaker, see [Access a training container through Amazon Web Services Systems Manager (SSM) for remote debugging].
//
// [Access a training container through Amazon Web Services Systems Manager (SSM) for remote debugging]: https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-debugging.html
RemoteDebugConfig *types.RemoteDebugConfig
// The number of times to retry the job when the job fails due to an
// InternalServerError .
RetryStrategy *types.RetryStrategy
// Contains information about attribute-based access control (ABAC) for the
// training job.
SessionChainingConfig *types.SessionChainingConfig
// An array of key-value pairs. You can use tags to categorize your Amazon Web
// Services resources in different ways, for example, by purpose, owner, or
// environment. For more information, see [Tagging Amazon Web Services Resources].
//
// Do not include any security-sensitive information including account access IDs,
// secrets, or tokens in any tags. As part of the shared responsibility model, you
// are responsible for any potential exposure, unauthorized access, or compromise
// of your sensitive data if caused by any security-sensitive information included
// in the request tag variable or plain text fields.
//
// [Tagging Amazon Web Services Resources]: https://docs.aws.amazon.com/general/latest/gr/aws_tagging.html
Tags []types.Tag
// Configuration of storage locations for the Amazon SageMaker Debugger
// TensorBoard output data.
TensorBoardOutputConfig *types.TensorBoardOutputConfig
// A [VpcConfig] object that specifies the VPC that you want your training job to connect to.
// Control access to and from your training container by configuring the VPC. For
// more information, see [Protect Training Jobs by Using an Amazon Virtual Private Cloud].
//
// [VpcConfig]: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_VpcConfig.html
// [Protect Training Jobs by Using an Amazon Virtual Private Cloud]: https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html
VpcConfig *types.VpcConfig
noSmithyDocumentSerde
}
// CreateTrainingJobOutput is the result type returned by
// Client.CreateTrainingJob.
type CreateTrainingJobOutput struct {
// The Amazon Resource Name (ARN) of the training job.
//
// This member is required.
TrainingJobArn *string
// Metadata pertaining to the operation's result. Client.CreateTrainingJob
// assigns this field from the metadata returned by the operation invocation.
ResultMetadata middleware.Metadata
noSmithyDocumentSerde
}
// addOperationCreateTrainingJobMiddlewares registers every middleware used by
// the CreateTrainingJob operation on stack, in a fixed order. It stops at the
// first registration that fails and returns that error; it returns nil once
// all registrations succeed.
//
// A failure from addProtocolFinalizerMiddlewares is wrapped with %w so callers
// can still inspect the underlying cause with errors.Is / errors.As.
func (c *Client) addOperationCreateTrainingJobMiddlewares(stack *middleware.Stack, options Options) (err error) {
	// Serialization / deserialization steps for the awsJson1.1 protocol.
	if err = stack.Serialize.Add(&setOperationInputMiddleware{}, middleware.After); err != nil {
		return err
	}
	if err = stack.Serialize.Add(&awsAwsjson11_serializeOpCreateTrainingJob{}, middleware.After); err != nil {
		return err
	}
	if err = stack.Deserialize.Add(&awsAwsjson11_deserializeOpCreateTrainingJob{}, middleware.After); err != nil {
		return err
	}
	if err = addProtocolFinalizerMiddlewares(stack, options, "CreateTrainingJob"); err != nil {
		// %w (not %v) keeps the original error in the chain.
		return fmt.Errorf("add protocol finalizers: %w", err)
	}

	// Shared client middlewares; the order below is preserved as generated.
	if err = addlegacyEndpointContextSetter(stack, options); err != nil {
		return err
	}
	if err = addSetLoggerMiddleware(stack, options); err != nil {
		return err
	}
	if err = addClientRequestID(stack); err != nil {
		return err
	}
	if err = addComputeContentLength(stack); err != nil {
		return err
	}
	if err = addResolveEndpointMiddleware(stack, options); err != nil {
		return err
	}
	if err = addComputePayloadSHA256(stack); err != nil {
		return err
	}
	if err = addRetry(stack, options); err != nil {
		return err
	}
	if err = addRawResponseToMetadata(stack); err != nil {
		return err
	}
	if err = addRecordResponseTiming(stack); err != nil {
		return err
	}
	if err = addSpanRetryLoop(stack, options); err != nil {
		return err
	}
	if err = addClientUserAgent(stack, options); err != nil {
		return err
	}
	if err = smithyhttp.AddErrorCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = smithyhttp.AddCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = addSetLegacyContextSigningOptionsMiddleware(stack); err != nil {
		return err
	}
	if err = addTimeOffsetBuild(stack, c); err != nil {
		return err
	}
	if err = addUserAgentRetryMode(stack, options); err != nil {
		return err
	}
	if err = addCredentialSource(stack, options); err != nil {
		return err
	}

	// Operation-specific input validation and service metadata.
	if err = addOpCreateTrainingJobValidationMiddleware(stack); err != nil {
		return err
	}
	if err = stack.Initialize.Add(newServiceMetadataMiddleware_opCreateTrainingJob(options.Region), middleware.Before); err != nil {
		return err
	}

	if err = addRecursionDetection(stack); err != nil {
		return err
	}
	if err = addRequestIDRetrieverMiddleware(stack); err != nil {
		return err
	}
	if err = addResponseErrorMiddleware(stack); err != nil {
		return err
	}
	if err = addRequestResponseLogging(stack, options); err != nil {
		return err
	}
	if err = addDisableHTTPSMiddleware(stack, options); err != nil {
		return err
	}

	// Tracing span boundaries.
	if err = addSpanInitializeStart(stack); err != nil {
		return err
	}
	if err = addSpanInitializeEnd(stack); err != nil {
		return err
	}
	if err = addSpanBuildRequestStart(stack); err != nil {
		return err
	}
	if err = addSpanBuildRequestEnd(stack); err != nil {
		return err
	}
	return nil
}
// newServiceMetadataMiddleware_opCreateTrainingJob builds the
// RegisterServiceMetadata middleware for the CreateTrainingJob operation,
// carrying the given region, the package-level ServiceID, and the operation
// name.
func newServiceMetadataMiddleware_opCreateTrainingJob(region string) *awsmiddleware.RegisterServiceMetadata {
	md := new(awsmiddleware.RegisterServiceMetadata)
	md.Region = region
	md.ServiceID = ServiceID
	md.OperationName = "CreateTrainingJob"
	return md
}