in lib/constructs/data-set-enrollment.ts [218:320]
/**
 * Wires up the Glue resources for one data set: a source and a data-lake
 * Glue database, an optional source connection, the IAM role assumed by
 * Glue, a crawler for each database, the ETL job that runs the supplied
 * script, and the workflow that is handed both crawlers and the job.
 *
 * @param scope Parent construct.
 * @param id Construct id of this enrollment.
 * @param props Data set configuration. `props.dataSetName` is used as the
 *   prefix for every child construct id and resource name created here;
 *   `props.GlueScriptArguments` is passed to the job as its default
 *   arguments and must carry `--DL_BUCKET` and `--DL_PREFIX`.
 */
constructor(scope: Construct, id: string, props: DataSetEnrollmentProps) {
  super(scope, id);

  this.DataLakeTargets = props.DataLakeTargets;
  this.DataLakeBucketName = props.GlueScriptArguments['--DL_BUCKET'];
  this.DataLakePrefix = props.GlueScriptArguments['--DL_PREFIX'];
  this.DataSetName = props.dataSetName;
  this.Dataset_SourceDatabaseName = `${props.dataSetName}_src`;
  this.Dataset_DatalakeDatabaseName = `${props.dataSetName}_dl`;

  // Both catalog databases point at the same data set prefix in the data lake bucket.
  this.Dataset_Source = new glue.CfnDatabase(this, `${props.dataSetName}_src`, {
    catalogId: Aws.ACCOUNT_ID,
    databaseInput: {
      name: this.Dataset_SourceDatabaseName,
      locationUri: `s3://${props.dataLakeBucket.bucketName}/${props.dataSetName}/`
    }
  });
  this.Dataset_Datalake = new glue.CfnDatabase(this, `${props.dataSetName}_dl`, {
    catalogId: Aws.ACCOUNT_ID,
    databaseInput: {
      name: this.Dataset_DatalakeDatabaseName,
      locationUri: `s3://${props.dataLakeBucket.bucketName}/${props.dataSetName}/`
    }
  });

  // Names of the Glue connections the ETL job must be attached to.
  const connectionNames: string[] = [];
  if (props.SourceConnectionInput) {
    const sourceConnection = new glue.CfnConnection(this, `${props.dataSetName}-src-connection`, {
      catalogId: this.Dataset_Source.catalogId,
      connectionInput: props.SourceConnectionInput
    });
    this.SourceConnection = sourceConnection;
    // Use the resource's Ref (which resolves to the connection name) rather
    // than the literal name from the props: this makes CloudFormation create
    // the connection before the job that uses it, and it still yields a name
    // when the caller did not specify one.
    connectionNames.push(sourceConnection.ref);
  }

  this.DataSetGlueRole = new iam.Role(this, `${props.dataSetName}-GlueRole`, {
    assumedBy: new iam.ServicePrincipal('glue.amazonaws.com')
  });
  this.DataSetGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole'));
  this.DataSetGlueRole.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('CloudWatchAgentServerPolicy'));
  props.dataLakeBucket.grantReadWrite(this.DataSetGlueRole);
  if (typeof props.SourceAccessPolicy !== 'undefined') {
    props.SourceAccessPolicy.attachToRole(this.DataSetGlueRole);
  }

  const sourceCrawler = this.setupCrawler(this.Dataset_Source, props.SourceTargets, true, this.Dataset_SourceDatabaseName);

  // The job script is uploaded as an S3 asset that the Glue role is allowed to read.
  const glueScript = new s3assets.Asset(this, `${props.dataSetName}-GlueScript`, {
    path: props.GlueScriptPath
  });
  glueScript.grantRead(this.DataSetGlueRole);

  const jobParams = {
    executionProperty: {
      maxConcurrentRuns: 1
    },
    name: `${props.dataSetName}_src_to_dl_etl`,
    timeout: 2880,
    glueVersion: "2.0",
    maxCapacity: props.MaxDPUs,
    command: {
      scriptLocation: `s3://${glueScript.s3BucketName}/${glueScript.s3ObjectKey}`,
      name: "glueetl",
      pythonVersion: "3"
    },
    role: this.DataSetGlueRole.roleArn,
    maxRetries: 0,
    defaultArguments: props.GlueScriptArguments,
    // The spread makes the `connections` property conditional: it is only
    // emitted when a source connection exists (JDBC sources at the moment).
    ...(connectionNames.length > 0
      ? { connections: { connections: connectionNames } }
      : {})
  };
  const etlJob = new glue.CfnJob(this, `${props.dataSetName}-EtlJob`, jobParams);

  const datalakeCrawler = this.setupCrawler(this.Dataset_Datalake, this.DataLakeTargets, false, this.Dataset_DatalakeDatabaseName);

  // Instantiated for its side effect of registering the workflow in the
  // construct tree; no reference to it is needed afterwards.
  new DataLakeEnrollmentWorkflow(this, `${props.dataSetName}DataLakeWorkflow`, {
    workfowName: `${props.dataSetName}_DataLakeEnrollmentWorkflow`,
    srcCrawler: sourceCrawler,
    etlJob: etlJob,
    datalakeCrawler: datalakeCrawler,
    WorkflowCronScheduleExpression: props.WorkflowCronScheduleExpression
  });
}