packages/constructs/L3/dataops/dataops-databrew-l3-construct/lib/dataops-databrew-l3-construct.ts (232 lines of code) (raw):

/*!
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

import { CfnJob, CfnRecipe, CfnDataset, CfnProject } from 'aws-cdk-lib/aws-databrew';
import { IResolvable } from 'aws-cdk-lib';
import { MdaaRoleRef } from '@aws-mdaa/iam-role-helper';
import { MdaaL3Construct, MdaaL3ConstructProps } from '@aws-mdaa/l3-construct';
import { MdaaDataBrewJob, MdaaDataBrewJobProps } from '@aws-mdaa/databrew-constructs';
import { Construct } from 'constructs';
import { MdaaDataBrewDataset, MdaaDataBrewRecipe, MdaaDataBrewSchedule, MdaaDataBrewProject } from '../lib';

/**
 * Properties accepted by {@link DataBrewL3Construct}.
 */
export interface DataBrewL3ConstructProps extends MdaaL3ConstructProps {
  // Name of the Data-Ops project.
  readonly projectName: string;
  // Recipes to be created, keyed by recipe name.
  readonly recipes?: Record<string, RecipeProps>;
  // Datasets to be created, keyed by dataset name.
  readonly datasets?: Record<string, DatasetProps>;
  // Jobs to be created, keyed by job name.
  readonly jobs?: Record<string, DataBrewJobProps>;
}

/**
 * Configuration of a single DataBrew job.
 */
export interface DataBrewJobProps {
  // DataBrew job type. The value 'RECIPE' selects the recipe-job properties;
  // any other value is treated as a profile job (unless `projectName` is set).
  readonly type: string;
  // KMS Key for the job
  readonly kmsKeyArn: string;
  // DataBrew project name for a recipe job. When set, a project is created and the job is bound to it.
  readonly projectName?: string;
  // Output locations for the recipe job.
  readonly outputs?: IResolvable | (CfnJob.OutputProperty | IResolvable)[];
  // Dataset for the job (generated by this construct, or an existing one).
  readonly dataset: ConfigOptions;
  // Recipe for the job (generated by this construct, or an existing one).
  readonly recipe?: ConfigOptions;
  // Execution role for the job
  readonly executionRole: MdaaRoleRef;
  // A sample configuration for profile jobs only, which determines the number of rows on which the profile job is run.
  readonly jobSample?: CfnJob.JobSampleProperty | IResolvable;
  // The current status of Amazon CloudWatch logging for the job.
  readonly logSubscription?: string;
  // The maximum number of nodes that can be consumed when the job processes data.
  readonly maxCapacity?: number;
  // The maximum number of times to retry the job after a job run fails.
  readonly maxRetries?: number;
  // Schedule for the job
  readonly schedule?: ConfigSchedule;
  // Job output location for a profile job
  readonly outputLocation?: CfnJob.OutputLocationProperty | IResolvable;
  // One or more artifacts that represent the AWS Glue Data Catalog output from running the job.
  readonly dataCatalogOutputs?: IResolvable | (IResolvable | CfnJob.DataCatalogOutputProperty)[];
  // Represents a list of JDBC database output objects which defines the output destination for a DataBrew recipe job to write into.
  readonly databaseOutputs?: IResolvable | (IResolvable | CfnJob.DatabaseOutputProperty)[];
  // Configuration for profile jobs.
  readonly profileConfiguration?: CfnJob.ProfileConfigurationProperty | IResolvable;
  // The job's timeout in minutes.
  readonly timeout?: number;
  // List of validation configurations that are applied to the profile job.
  readonly validationConfigurations?: IResolvable | (CfnJob.ValidationConfigurationProperty | IResolvable)[];
}

/**
 * Schedule attached to a job.
 */
export interface ConfigSchedule {
  // The name of the schedule.
  readonly name: string;
  // The dates and times when the job is to run.
  readonly cronExpression: string;
  // A list of jobs to be run, according to the schedule.
  // NOTE(review): this construct never reads this field; the schedule is always
  // created with only the name of the job it is declared on. Confirm whether
  // that is intended.
  readonly jobNames?: string[];
}

/**
 * Reference to a dataset or recipe used by a job.
 */
export interface ConfigOptions {
  // Reference to a resource that already exists. Only `name` (and, for recipes, `version`) is read.
  readonly existing?: CfnJob.RecipeProperty;
  // Key of a dataset/recipe declared in this construct's `datasets`/`recipes` configuration.
  readonly generated?: string;
}

/**
 * Configuration of a DataBrew dataset.
 */
export interface DatasetProps {
  // Information on how DataBrew can find the dataset, in either the AWS Glue Data Catalog or Amazon S3.
  readonly input: CfnDataset.InputProperty | IResolvable;
  // The file format of a dataset that is created from an Amazon S3 file or folder.
  readonly format?: string;
  // A set of options that define how DataBrew interprets the data in the dataset.
  readonly formatOptions?: CfnDataset.FormatOptionsProperty | IResolvable;
  // A set of options that defines how DataBrew interprets an Amazon S3 path of the dataset.
  readonly pathOptions?: CfnDataset.PathOptionsProperty | IResolvable;
}

/**
 * Configuration of a DataBrew recipe.
 */
export interface RecipeProps {
  // The recipe steps, serialized as a JSON string.
  readonly steps: string;
  // The description of the recipe.
  readonly description?: string;
}

/**
 * Configuration of a DataBrew project.
 */
export interface DatabrewProjectConfig {
  // The unique name of a project.
  readonly name: string;
  // The dataset that the project is to act upon.
  readonly datasetName: string;
  // The name of a recipe that will be developed during a project session.
  readonly recipeName: string;
  // The Amazon Resource Name (ARN) of the role that will be assumed for this project.
  readonly roleArn: string;
  // The sample size and sampling type to apply to the data.
  readonly sample?: CfnProject.SampleProperty | IResolvable;
}

/**
 * Creates and wires together the DataBrew datasets, recipes, projects, jobs
 * and schedules described by {@link DataBrewL3ConstructProps}.
 */
export class DataBrewL3Construct extends MdaaL3Construct {
  protected readonly props: DataBrewL3ConstructProps;
  // Datasets created by this construct, keyed by their trimmed configuration key.
  private readonly datasets = new Map<string, MdaaDataBrewDataset>();
  // Recipes created by this construct, keyed by their trimmed configuration key.
  private readonly recipes = new Map<string, MdaaDataBrewRecipe>();

  /**
   * Creates datasets and recipes first so that jobs can reference them, then
   * creates each job and, when configured, its schedule.
   *
   * @param scope Parent construct.
   * @param id Construct id.
   * @param props Construct configuration.
   */
  constructor(scope: Construct, id: string, props: DataBrewL3ConstructProps) {
    super(scope, id, props);
    this.props = props;

    // create datasets
    // The configuration is read through the entry value (not re-indexed with
    // the trimmed name), so keys with surrounding whitespace still resolve.
    for (const [key, datasetProps] of Object.entries(this.props.datasets ?? {})) {
      const name = key.trim();
      this.datasets.set(name, this.createDataset(name, datasetProps));
    }

    // create recipes
    for (const [key, recipeProps] of Object.entries(this.props.recipes ?? {})) {
      const name = key.trim();
      this.recipes.set(name, this.createRecipe(name, recipeProps));
    }

    // create list of databrew jobs and schedule them
    for (const [jobName, jobProps] of Object.entries(this.props.jobs ?? {})) {
      // NOTE(review): the resolved role *name* is forwarded as `roleArn` to both
      // the project and the job. Confirm the downstream constructs expect a
      // name here rather than an ARN.
      const roleName = this.props.roleHelper
        .resolveRoleRefsWithOrdinals([jobProps.executionRole], 'executionRole')
        .map(x => x.name())[0];
      const job = this.createJob(jobName, roleName, jobProps);
      if (jobProps.schedule) {
        this.createSchedule([job.name], jobProps.schedule).addDependency(job);
      }
    }
  }

  /**
   * Creates a DataBrew job and, when `params.projectName` is set, the project
   * the job belongs to.
   *
   * @param jobName Name (and construct id) of the job.
   * @param roleName Resolved execution role identifier, forwarded as `roleArn`.
   * @param params Job configuration.
   * @returns The created job.
   */
  private createJob(jobName: string, roleName: string, params: DataBrewJobProps): MdaaDataBrewJob {
    // create project (only for project based jobs)
    const project = params.projectName
      ? this.createProject({
          name: params.projectName,
          datasetName: this.getDatasetName(params),
          recipeName: this.getRecipe(params).name,
          roleArn: roleName,
        })
      : undefined;

    // make sure default dataset and recipe are provisioned before project is created
    if (project) this.addDependency(params, project);

    // get job props
    const props = this.getDataBrewJobProps(jobName, roleName, params, project);

    // create databrew job
    const job = new MdaaDataBrewJob(this, jobName, props);

    if (project) {
      // project based job: depend on the project
      job.addDependency(project);
    } else {
      // not project based: depend directly on the generated dataset and recipe
      this.addDependency(params, job);
    }
    return job;
  }

  /**
   * Makes `dependent` depend on the generated dataset and/or recipe referenced
   * by `params`. References to existing resources add no dependency.
   *
   * @param params Job configuration holding the dataset/recipe references.
   * @param dependent Job or project that must be created after them.
   * @throws Error when a referenced generated dataset or recipe is not defined.
   */
  private addDependency(params: DataBrewJobProps, dependent: CfnJob | CfnProject) {
    if (params.dataset?.generated) {
      dependent.addDependency(this.getGeneratedDataset(params.dataset.generated));
    }
    if (params.recipe?.generated) {
      dependent.addDependency(this.getGeneratedRecipe(params.recipe.generated));
    }
  }

  /**
   * Builds the job construct properties. The shape depends on the job flavour:
   * project based job, recipe job (`type === 'RECIPE'`), or otherwise profile job.
   *
   * @param jobName Name of the job.
   * @param roleName Resolved execution role identifier, forwarded as `roleArn`.
   * @param params Job configuration.
   * @param project Project the job belongs to, if any.
   * @returns Properties for the job construct.
   */
  private getDataBrewJobProps(
    jobName: string,
    roleName: string,
    params: DataBrewJobProps,
    project?: CfnProject,
  ): MdaaDataBrewJobProps {
    // set basic props for the job
    const props = {
      naming: this.props.naming,
      name: jobName,
      roleArn: roleName,
      type: params.type,
      encryptionKeyArn: params.kmsKeyArn,
      dataCatalogOutputs: params.dataCatalogOutputs,
      databaseOutputs: params.databaseOutputs,
      jobSample: params.jobSample,
      logSubscription: params.logSubscription,
      maxCapacity: params.maxCapacity,
      maxRetries: params.maxRetries,
      timeout: params.timeout,
    };

    // parameters for recipe job with project
    if (project) {
      return {
        ...props,
        projectName: project.name,
        outputs: params.outputs,
      };
    }

    // parameters for recipe job with dataset and recipe
    if (params.type === 'RECIPE') {
      return {
        ...props,
        datasetName: this.getDatasetName(params),
        recipe: this.getRecipe(params),
        outputs: params.outputs,
      };
    }

    // properties for profile job
    return {
      ...props,
      datasetName: this.getDatasetName(params),
      outputLocation: params.outputLocation,
      profileConfiguration: params.profileConfiguration,
      validationConfigurations: params.validationConfigurations,
    };
  }

  /**
   * Resolves the dataset name for a job. A generated dataset takes precedence
   * over an existing one.
   *
   * @param params Job configuration.
   * @returns The dataset name, or an empty string when neither is configured.
   * @throws Error when the referenced generated dataset is not defined.
   */
  private getDatasetName(params: DataBrewJobProps): string {
    if (params.dataset?.generated) {
      return this.getGeneratedDataset(params.dataset.generated).name;
    }
    if (params.dataset?.existing) {
      return params.dataset.existing.name;
    }
    return '';
  }

  /**
   * Resolves the recipe reference for a job. A generated recipe takes
   * precedence over an existing one and is referenced with an empty version.
   *
   * @param params Job configuration.
   * @returns The recipe reference; name and version are empty strings when no recipe is configured.
   * @throws Error when the referenced generated recipe is not defined.
   */
  private getRecipe(params: DataBrewJobProps): CfnJob.RecipeProperty {
    if (params.recipe?.generated) {
      return { name: this.getGeneratedRecipe(params.recipe.generated).name, version: '' };
    }
    if (params.recipe?.existing) {
      return { name: params.recipe.existing.name, version: params.recipe.existing.version };
    }
    return { name: '', version: '' };
  }

  /**
   * Looks up a dataset created by this construct.
   *
   * @param key Configuration key of the dataset; trimmed to match how keys are stored.
   * @throws Error when no such dataset was created.
   */
  private getGeneratedDataset(key: string): MdaaDataBrewDataset {
    const dataset = this.datasets.get(key.trim());
    if (!dataset) {
      throw new Error(`Dataset '${key}' referenced as 'generated' is not defined in the 'datasets' configuration`);
    }
    return dataset;
  }

  /**
   * Looks up a recipe created by this construct.
   *
   * @param key Configuration key of the recipe; trimmed to match how keys are stored.
   * @throws Error when no such recipe was created.
   */
  private getGeneratedRecipe(key: string): MdaaDataBrewRecipe {
    const recipe = this.recipes.get(key.trim());
    if (!recipe) {
      throw new Error(`Recipe '${key}' referenced as 'generated' is not defined in the 'recipes' configuration`);
    }
    return recipe;
  }

  /**
   * Creates a DataBrew project.
   *
   * @param params Project configuration; `params.name` is also used as construct id.
   * @returns The created project.
   */
  private createProject(params: DatabrewProjectConfig): MdaaDataBrewProject {
    const props = {
      naming: this.props.naming,
      name: params.name,
      datasetName: params.datasetName,
      recipeName: params.recipeName,
      roleArn: params.roleArn,
      sample: params.sample,
    };
    return new MdaaDataBrewProject(this, params.name, props);
  }

  /**
   * Creates a DataBrew dataset.
   *
   * @param name Dataset name; also used as construct id.
   * @param params Dataset configuration.
   * @returns The created dataset.
   */
  private createDataset(name: string, params: DatasetProps): MdaaDataBrewDataset {
    const props = {
      naming: this.props.naming,
      name: name,
      input: params.input,
      format: params.format,
      formatOptions: params.formatOptions,
      pathOptions: params.pathOptions,
    };
    return new MdaaDataBrewDataset(this, name, props);
  }

  /**
   * Creates a DataBrew recipe from steps supplied as a JSON string. While
   * parsing, the first character of every object key is lower-cased
   * (e.g. `Action` becomes `action`).
   *
   * @param name Recipe name; also used as construct id.
   * @param recipeConfig Recipe configuration.
   * @returns The created recipe.
   * @throws SyntaxError when `recipeConfig.steps` is not valid JSON.
   */
  private createRecipe(name: string, recipeConfig: RecipeProps): MdaaDataBrewRecipe {
    let transformedSteps: unknown;
    try {
      transformedSteps = JSON.parse(
        recipeConfig.steps,
        function (this: Record<string, unknown>, key: string, value: unknown): unknown {
          const renamed = key.charAt(0).toLowerCase() + key.slice(1);
          // Rename only when lower-casing actually changes the key. Array
          // indices, the root key ('') and keys starting with a non-letter are
          // left alone: returning undefined for those would make JSON.parse
          // delete the property that was just assigned.
          if (renamed !== key) {
            this[renamed] = value;
            // Returning undefined removes the original, upper-cased key.
            return undefined;
          }
          return value;
        },
      );
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new SyntaxError(`Unable to parse the steps of DataBrew recipe '${name}' as JSON: ${reason}`);
    }

    const props = {
      naming: this.props.naming,
      name: name,
      steps: transformedSteps as CfnRecipe.RecipeStepProperty[],
      description: recipeConfig.description,
    };
    return new MdaaDataBrewRecipe(this, name, props);
  }

  /**
   * Creates a DataBrew schedule.
   *
   * @param jobNames Names of the jobs the schedule runs.
   * @param params Schedule configuration; `params.name` is also used as construct id.
   * @returns The created schedule.
   */
  private createSchedule(jobNames: string[], params: ConfigSchedule): MdaaDataBrewSchedule {
    const props = {
      naming: this.props.naming,
      name: params.name,
      cronExpression: params.cronExpression,
      jobNames: jobNames,
    };
    return new MdaaDataBrewSchedule(this, params.name, props);
  }
}