in code/cross_validation_with_hpo.py [0:0]
def train(train=None,
test=None,
image_uri=None,
instance_type="ml.c4.xlarge",
instance_count=1,
output_path=None,
k = 5,
max_jobs=2,
max_parallel_jobs=2,
min_c = 0,
max_c = 1,
min_gamma=0.0001,
max_gamma=0.001,
gamma_scaling_type="Logarithmic",
region = "us-east-2"):
    """Triggers a SageMaker automatic hyperparameter tuning (HPO) job to train and evaluate a given algorithm.

    The HyperparameterTuner launches up to ``max_jobs`` training jobs, at most
    ``max_parallel_jobs`` at a time. Each training job launched by the tuner in
    turn triggers ``k`` cross-validation model training jobs.

    Side effects: writes ``evaluation.json`` (best job's test-score metric) and
    ``jobinfo.json`` (tuning-job name, best training job, best hyperparameters)
    under the module-level directories ``base_dir_evaluation`` and
    ``base_dir_jobinfo`` respectively (both assumed to be defined elsewhere in
    this module — TODO confirm).

    Args:
        train: S3 URI where the training dataset is located
        test: S3 URI where the test dataset is located
        image_uri: ECR repository URI for the training image
        instance_type: Instance type to be used for the SageMaker Training Jobs.
        instance_count: number of instances to be used for the SageMaker Training Jobs.
        output_path: S3 URI for the output artifacts generated in this script.
        k: number of folds (k) in k-fold cross validation
        max_jobs: Maximum number of jobs the HyperparameterTuner triggers
        max_parallel_jobs: Maximum number of parallel jobs the HyperparameterTuner triggers in one batch.
        min_c: minimum c value configured as a continuous parameter for hyperparameter tuning
        max_c: maximum c value configured as a continuous parameter for hyperparameter tuning
        min_gamma: minimum gamma value configured as a continuous parameter for hyperparameter tuning
        max_gamma: maximum gamma value configured as a continuous parameter for hyperparameter tuning
        gamma_scaling_type: scaling type used in the hyperparameter tuning process for gamma
        region: AWS region passed through to the training jobs as a hyperparameter
    """
    sagemaker_session = sagemaker.session.Session()
    role = sagemaker.get_execution_role()
    sm_client = boto3.client("sagemaker")
    # An Estimator object to be associated with the HyperparameterTuner job.
    cv_estimator = Estimator(
        image_uri=image_uri,
        instance_type=instance_type,
        instance_count=instance_count,
        role=role,
        sagemaker_session=sagemaker_session,
        output_path=output_path)
    # Static hyperparameters forwarded to every training job so each one can
    # run its own k-fold cross-validation against the given datasets.
    cv_estimator.set_hyperparameters(
        train_src = train,
        test_src = test,
        k = k,
        instance_type = instance_type,
        region = region)
    hyperparameter_ranges = {
        'c': ContinuousParameter(min_c, max_c),
        'kernel' : CategoricalParameter(['linear', 'poly', 'rbf', 'sigmoid']),
        'gamma' : ContinuousParameter(min_value=min_gamma,
                                      max_value=max_gamma,
                                      scaling_type=gamma_scaling_type)
    }
    objective_metric_name = "test:score"
    # The Regex tells SageMaker how to scrape the objective metric from the
    # training job's log output.
    tuner = HyperparameterTuner(cv_estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                objective_type="Maximize",
                                max_jobs=max_jobs,
                                max_parallel_jobs=max_parallel_jobs,
                                metric_definitions=[{"Name": objective_metric_name,
                                                     "Regex": "model test score:(.*?);"}])
    tuner.fit({"train": train, "test": test}, include_cls_metadata=True)
    best_training_job_name = tuner.best_training_job()
    tuner_job_name = tuner.latest_tuning_job.name
    best_performing_job = sm_client.describe_training_job(TrainingJobName=best_training_job_name)
    hyper_params = best_performing_job['HyperParameters']
    # Drop SageMaker-internal entries; keep only the user-facing hyperparameters.
    # (Loop variable renamed so it does not shadow the `k` parameter.)
    best_hyperparams = {name: value for name, value in hyper_params.items()
                        if not name.startswith("sagemaker_")}
    jobinfo = {}
    jobinfo['name'] = tuner_job_name
    jobinfo['best_training_job'] = best_training_job_name
    jobinfo['hyperparams'] = best_hyperparams
    # Extract the objective metric's final value from the best training job.
    metric_value = next(x['Value'] for x in best_performing_job['FinalMetricDataList']
                        if x['MetricName'] == objective_metric_name)
    evaluation_metrics = { "multiclass_classification_metrics" : {
            "accuracy" : {
                "value" : metric_value,
                "standard_deviation" : "NaN"
            },
        }
    }
    os.makedirs(base_dir_evaluation, exist_ok=True)
    with open(f'{base_dir_evaluation}/evaluation.json', 'w') as f:
        f.write(json.dumps(evaluation_metrics))
    # Ensure the jobinfo directory exists too (original only created the
    # evaluation directory, risking FileNotFoundError here).
    os.makedirs(base_dir_jobinfo, exist_ok=True)
    with open(f'{base_dir_jobinfo}/jobinfo.json', 'w') as f:
        f.write(json.dumps(jobinfo))