in backend/training-pipeline/functions/api/start_job.py [0:0]
def post(event, context):
    """Start a training Step Functions execution from an API POST request.

    Builds the state-machine input from a deep copy of ``template_input``,
    fills in hyperparameters from the camelCase JSON request body, and
    starts the execution on ``TRAINING_STATE_MACHINE_ARN``.  Behaviour
    depends on the module-level ``MODE``: ``Mode.MODEL`` trains and deploys
    a named model, ``Mode.HPO`` runs a hyperparameter-search job only.

    Args:
        event: API Gateway Lambda proxy event; ``event['body']`` is a JSON
            string with the training parameters.
        context: Lambda context object (unused).

    Returns:
        API Gateway response object — 400 when the body is missing or a
        required field is absent, otherwise 200 wrapping the raw
        ``start_execution`` response.
    """
    print('===Starting start_job function ===')
    # Reject requests with no body up front.
    if event['body'] is None:
        print('No parameter passed. Returning error.')
        return create_response_obj(400, {
            'errorMessage': 'No parameter passed in the POST body'
        })
    body = json.loads(event['body'])

    # Deep-copy the shared template so warm Lambda invocations never see
    # leftover mutations from a previous request.
    sf_input = copy.deepcopy(template_input)
    sf_input['Mode'] = MODE.value

    try:
        # Point the training job at the uploaded dataset.
        s3_data_source = sf_input['Training']['InputDataConfig'][0]['DataSource']['S3DataSource']
        s3_filename = str(body['trainingDataS3Name'])
        s3_data_source['S3Uri'] = f"{s3_path}public/{s3_filename}"
        print(f"Training dataset is from: {s3_data_source['S3Uri']}")

        if MODE == Mode.MODEL:
            print('Training new model mode inited')
            name = MODE.value.lower() + '-' + body['modelName']
            # Hyper parameters
            sf_input['Training']['HyperParameters'] = {
                'final_training': 'True',
                **_common_hyperparameters(body),
                # Train new model specific parameters
                'nb_epochs_f': str(body['nbEpochsF']),
                'batch_size_f': str(body['batchSizeF']),
                'optimizer_f': str(body['optimizerF']),
                'last_activation_f': str(body['lastActivationF']),
                'num_layers_f': str(len(body['nodes'])),
                # NOTE(review): drops the last entry of `nodes` — presumably
                # the output layer is configured elsewhere; confirm upstream.
                'nodes': str(body['nodes'][:-1])
            }
            # Use the same name for all component for ease of tracing
            sf_input['Training']['TrainingJobName'] = name
            sf_input['Create Model']['ModelName'] = name
            sf_input['Configure Endpoint']['EndpointConfigName'] = name
            sf_input['Configure Endpoint']['ProductionVariants'][0]['ModelName'] = name
            sf_input['Deploy']['EndpointConfigName'] = name
            sf_input['Deploy']['EndpointName'] = name
        elif MODE == Mode.HPO:
            print('HPO mode inited')
            # HPO jobs are transient: name them by timestamp and strip the
            # model-creation/deployment states from the template.
            now = datetime.datetime.now()
            name = MODE.value.lower() + '-' + now.strftime("%Y-%m-%d-%H-%M-%S")
            del sf_input['Create Model']
            del sf_input['Configure Endpoint']
            del sf_input['Deploy']
            sf_input['Training']['TrainingJobName'] = name
            sf_input['Training']['HyperParameters'] = {
                'final_training': 'False',
                **_common_hyperparameters(body),
                # HPO-specific parameters
                'dropout': str(body['dropout']),
                'train_validation_split': str(body['trainValidationSplit']),
                'used_data_percentage': str(body['usedDataPercentage']),
                'choice_of_node_numbers': str(body['choiceOfNodeNumbers']),
                'batch_size': str(body['batchSize']),
                'MAX_EVALS': str(body['maxEval']),
                'randstate': str(body['randomState']),
                'num_layers_low': str(body['numLayersLow']),
                'num_layers_high': str(body['numLayersHigh']),
                'nb_epochs': str(body['nbEpochs']),
                'optimizer': str(body['optimizers']),
                'last_activation': str(body['activationFunctions']),
            }
    except KeyError as err:
        # Robustness fix: a request missing a required field previously
        # raised an unhandled KeyError (Lambda crash / 502).  Mirror the
        # missing-body guard above and return a 400 client error instead.
        print(f'Missing parameter {err}. Returning error.')
        return create_response_obj(400, {
            'errorMessage': f'Missing required parameter: {err.args[0]}'
        })

    print('====sf_input===')
    print(json.dumps(sf_input, indent=1))
    print(f"TRAINING_STATE_MACHINE_ARN = {TRAINING_STATE_MACHINE_ARN}")
    sf_response = sf_client.start_execution(
        stateMachineArn=TRAINING_STATE_MACHINE_ARN,
        input=json.dumps(sf_input)
    )
    print(f"sf_response = {sf_response}")
    return create_response_obj(200, sf_response)


def _common_hyperparameters(body):
    """Return the hyperparameters shared by MODEL and HPO modes, stringified.

    Raises KeyError (handled by the caller) when a field is absent.
    """
    return {
        # Common parameters
        'target': str(body['target']),
        'batch_normalization': str(body['batchNormalization']),
        'include_dropout': str(body['includeDropout']),
        'loss_metric': str(body['lossMetric']),
        'monitor_metric': str(body['monitorMetric']),
        'lr_update_patience': str(body['lrUpdatePatience']),
        'early_stopping_patience': str(body['earlyStoppingPatience']),
    }