# dags/xgboost-ml-pipeline/1.10/mwaa-customer-churn-dag.py
# `boto3`, the pipeline `config` module, and `glue_job_name` are expected to be
# imported/defined at the top of the DAG file (not shown in this excerpt).
def preprocess_glue():
    """Preprocess the data with an AWS Glue ETL job that splits it into training and validation sets."""
    # Build the Glue script location from config values (hard-coding the S3 path is not best practice).
    glue_script_location = 's3://{}/{}'.format(config.GLUE_JOB_SCRIPT_S3_BUCKET, config.GLUE_JOB_SCRIPT_S3_KEY)
    glue_client = boto3.client('glue')

    # Define the Glue ETL job.
    response = glue_client.create_job(
        Name=glue_job_name,
        Description='PySpark job to extract the data and split it into training and validation data sets',
        Role=config.GLUE_ROLE_NAME,
        ExecutionProperty={
            'MaxConcurrentRuns': 2
        },
        Command={
            'Name': 'glueetl',
            'ScriptLocation': glue_script_location,
            'PythonVersion': '3'
        },
        DefaultArguments={
            '--job-language': 'python'
        },
        GlueVersion='1.0',
        WorkerType='Standard',
        NumberOfWorkers=2,
        Timeout=60
    )
    # Start a run of the Glue ETL job created above, passing the S3 source/destination
    # locations and output prefixes as job arguments.
    response = glue_client.start_job_run(
        JobName=response['Name'],
        Arguments={
            '--S3_SOURCE': config.DATA_S3_SOURCE,
            '--S3_DEST': config.DATA_S3_DEST,
            '--TRAIN_KEY': 'train/',
            '--VAL_KEY': 'validation/'
        }
    )
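
# Sketch (not part of the original DAG): start_job_run() returns as soon as the run is
# queued, so a task that must not proceed until the train/validation splits exist could
# poll get_job_run() for a terminal state. The helper name and the 60-second poll
# interval below are illustrative assumptions, not taken from the original file.
import time

def wait_for_glue_job_run(glue_client, job_name, run_id, poll_seconds=60):
    """Poll a Glue job run until it reaches a terminal state and return that state."""
    while True:
        run = glue_client.get_job_run(JobName=job_name, RunId=run_id)
        state = run['JobRun']['JobRunState']
        if state in ('SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT'):
            return state
        time.sleep(poll_seconds)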