in lib/lambda-handler/hello_emr.py [0:0]
import json
import time

import boto3

emr = boto3.client('emr')

# dest_bucket_name (output S3 bucket) and job_file_path (S3 path of the Spark
# application jar) are referenced below but not defined in this snippet; they
# are assumed to be module-level configuration, e.g. read from environment variables.


def lambda_handler(event, context):
    print(event)

    # Resolve the uploaded object's S3 path from the triggering event and build
    # a timestamped output prefix in the destination bucket.
    input_file_path = get_upload_file_path(event)
    dest_file_path = f's3://{dest_bucket_name}/{int(time.time())}'

    # Launch a transient EMR cluster that runs a single Spark step and shuts
    # down when the step completes.
    response = emr.run_job_flow(
        Name='SampleCluster',
        ReleaseLabel='emr-5.23.0',
        Applications=[
            {'Name': 'Hadoop'},
            {'Name': 'Hive'},
            {'Name': 'Spark'},
        ],
        Instances={
            'InstanceGroups': [
                {
                    'Name': 'Master nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'r3.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Core nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'r3.xlarge',
                    'InstanceCount': 4,
                    'EbsConfiguration': {
                        'EbsBlockDeviceConfigs': [
                            {
                                'VolumeSpecification': {
                                    'VolumeType': 'gp2',
                                    'SizeInGB': 100
                                },
                                'VolumesPerInstance': 1
                            },
                        ],
                        'EbsOptimized': False
                    },
                },
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
            # 'Ec2SubnetId': ec2SubnetId,
            # 'EmrManagedMasterSecurityGroup': master_sg,
            # 'EmrManagedSlaveSecurityGroup': slave_sg,
            # 'ServiceAccessSecurityGroup': service_sg,
        },
        Steps=[
            {
                'Name': 'Spark application',
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': [
                        'spark-submit',
                        '--class', 'com.aws.sample.SparkSimpleJob',
                        job_file_path,
                        input_file_path,
                        dest_file_path
                    ]
                }
            }
        ],
        VisibleToAllUsers=True,
        JobFlowRole='EMR_EC2_DefaultRole',
        ServiceRole='EMR_DefaultRole',
    )
    # print(response)

    # default=str keeps the body JSON-serializable even if the boto3 response
    # contains values (such as datetimes) that json.dumps cannot handle natively.
    return {
        'statusCode': 200,
        'body': json.dumps(response, default=str)
    }
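

# A minimal sketch of the get_upload_file_path helper referenced above; its real
# implementation is not shown in this snippet. It assumes the Lambda is invoked by
# an S3 ObjectCreated notification and rebuilds the uploaded object's s3:// URI
# from the first event record.
def get_upload_file_path(event):
    record = event['Records'][0]
    bucket = record['s3']['bucket']['name']
    # Keys containing spaces or special characters arrive URL-encoded in S3
    # event notifications; urllib.parse.unquote_plus can decode them if needed.
    key = record['s3']['object']['key']
    return f's3://{bucket}/{key}'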