packages/@aws-cdk/aws-glue-alpha/lib/jobs/spark-job.ts (96 lines of code) (raw):
import * as iam from 'aws-cdk-lib/aws-iam';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { addConstructMetadata } from 'aws-cdk-lib/core/lib/metadata-resource';
import * as constructs from 'constructs';
import { Code } from '../code';
import { Job, JobProps } from './job';
import { Token } from 'aws-cdk-lib';
import { EOL } from 'os';
/**
* Code props for different {@link Code} assets used by different types of Spark jobs.
*
* Every non-empty list below is turned into a single comma-separated job argument;
* empty or omitted lists produce no argument at all.
*/
export interface SparkExtraCodeProps {
/**
* Extra Python Files S3 URL (optional)
* S3 URL where additional python dependencies are located.
* Passed to the job as the `--extra-py-files` argument.
*
* @default - no extra files
*/
readonly extraPythonFiles?: Code[];
/**
* Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it.
* Passed to the job as the `--extra-files` argument.
*
* @default - no extra files specified.
*
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
*/
readonly extraFiles?: Code[];
/**
* Extra Jars S3 URL (optional)
* S3 URL where additional jar dependencies are located.
* Passed to the job as the `--extra-jars` argument.
*
* @default - no extra jar files
*/
readonly extraJars?: Code[];
/**
* Setting this value to true prioritizes the customer's extra JAR files in the classpath.
* When true, the `--user-jars-first` argument is set to `'true'`; otherwise the argument is omitted.
*
* @default false - priority is not given to user-provided jars
*
* @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
*/
readonly extraJarsFirst?: boolean;
}
/**
* Properties for enabling Spark UI monitoring feature for Spark-based Glue jobs.
*
* @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
*/
export interface SparkUIProps {
/**
* The bucket where the Glue job stores the logs.
* The job is granted read/write access to it, limited to the prefix when one is given.
*
* @default - a new bucket will be created, with SSL enforced and S3-managed encryption.
*/
readonly bucket?: s3.IBucket;
/**
* The path inside the bucket (objects prefix) where the Glue job stores the logs.
* Use format `'/foo/bar'`: the value must begin with `'/'` and must not end with `'/'`,
* otherwise construction fails with an error. The format is not checked when the
* value is an unresolved token.
*
* @default - the logs will be written at the root of the bucket
*/
readonly prefix?: string;
}
/**
* The Spark UI logging location.
*
* This is the resolved counterpart of {@link SparkUIProps}: the bucket is always present.
*
* @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
*/
export interface SparkUILoggingLocation {
/**
* The bucket where the Glue job stores the logs.
*/
readonly bucket: s3.IBucket;
/**
* The path inside the bucket (objects prefix) where the Glue job stores the logs.
* This is the prefix exactly as it was configured, e.g. `'/foo/bar'`.
*
* @default - no prefix, the logs will be written at the root of the bucket
*/
readonly prefix?: string;
}
/**
* Common properties for different types of Spark jobs.
*/
export interface SparkJobProps extends JobProps {
/**
* Enables the Spark UI debugging and monitoring with the specified props.
* When set, the job arguments include `--enable-spark-ui` and `--spark-event-logs-path`.
*
* @default - Spark UI debugging and monitoring is disabled.
*
* @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
* @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
*/
readonly sparkUI?: SparkUIProps;
}
/**
 * Base class for different types of Spark Jobs.
 *
 * Keeps the job role, optionally sets up the Spark UI logging location, and
 * provides helpers for assembling the job arguments shared by Spark jobs.
 */
export abstract class SparkJob extends Job {
  /**
   * The IAM role of the job, taken from the props.
   */
  public readonly role: iam.IRole;

  /**
   * The principal to grant permissions to. This is the job role.
   */
  public readonly grantPrincipal: iam.IPrincipal;

  /**
   * The Spark UI logs location if Spark UI monitoring and debugging is enabled.
   *
   * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html
   * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
   */
  public readonly sparkUILoggingLocation?: SparkUILoggingLocation;

  constructor(scope: constructs.Construct, id: string, props: SparkJobProps) {
    super(scope, id, { physicalName: props.jobName });

    // Enhanced CDK Analytics Telemetry
    addConstructMetadata(this, props);

    const jobRole = props.role;
    this.role = jobRole;
    this.grantPrincipal = jobRole;

    // The role must be in place before the logging location is set up,
    // because the setup grants this job access to the bucket.
    if (props.sparkUI) {
      this.sparkUILoggingLocation = this.setupSparkUILoggingLocation(props.sparkUI);
    } else {
      this.sparkUILoggingLocation = undefined;
    }
  }

  /**
   * Builds the arguments that are common to Spark jobs and do not depend on the executable.
   *
   * User-supplied default arguments are applied last, so they take precedence
   * over the values generated here.
   *
   * @param props the job props
   * @returns the merged argument map
   */
  protected nonExecutableCommonArguments(props: SparkJobProps): {[key: string]: string} {
    // Enable CloudWatch metrics and continuous logging by default as a best practice
    const commonArgs: { [key: string]: string } = {
      ...this.setupContinuousLogging(this.role, props.continuousLogging),
      '--enable-metrics': '',
      '--enable-observability-metrics': 'true',
    };

    // Spark UI arguments are only added when a logging location has been set up
    const loggingLocation = this.sparkUILoggingLocation;
    if (loggingLocation) {
      const eventLogsUrl = loggingLocation.bucket.s3UrlForObject(loggingLocation.prefix);
      commonArgs['--enable-spark-ui'] = 'true';
      // path will always end with a slash
      commonArgs['--spark-event-logs-path'] = eventLogsUrl.replace(/\/?$/, '/');
    }

    return {
      ...commonArgs,
      ...this.checkNoReservedArgs(props.defaultArguments),
    };
  }

  /**
   * Set the arguments for extra {@link Code}-related properties
   *
   * Mutates `args` in place. A list that is missing or empty adds nothing.
   *
   * @param args the argument map to extend
   * @param props the extra code assets to reference
   */
  protected setupExtraCodeArguments(args: { [key: string]: string }, props: SparkExtraCodeProps) {
    // Writes a comma-separated list of object URLs under `flag`, skipping empty lists
    const assignUrlList = (flag: string, assets: typeof props.extraJars): void => {
      if (!assets || assets.length === 0) {
        return;
      }
      args[flag] = assets.map(asset => this.codeS3ObjectUrl(asset)).join(',');
    };

    assignUrlList('--extra-jars', props.extraJars);
    if (props.extraJarsFirst) {
      args['--user-jars-first'] = 'true';
    }
    assignUrlList('--extra-py-files', props.extraPythonFiles);
    assignUrlList('--extra-files', props.extraFiles);
  }

  /**
   * Validates the prefix, picks (or creates) the logs bucket and grants this job
   * read/write access to it.
   *
   * @param props the Spark UI props
   * @returns the bucket together with the prefix as configured
   */
  private setupSparkUILoggingLocation(props: SparkUIProps): SparkUILoggingLocation {
    const { prefix } = props;
    validateSparkUiPrefix(prefix);

    const logsBucket = props.bucket ?? new s3.Bucket(this, 'SparkUIBucket', {
      enforceSSL: true,
      encryption: s3.BucketEncryption.S3_MANAGED,
    });

    const grantKeyPattern = cleanSparkUiPrefixForGrant(prefix);
    logsBucket.grantReadWrite(this, grantKeyPattern);

    return { prefix, bucket: logsBucket };
  }
}
/**
 * Checks that a Spark UI prefix follows the `'/foo/bar'` format.
 *
 * Nothing is checked when the prefix is missing, empty, or an unresolved token.
 *
 * @param prefix the prefix to validate
 * @throws Error naming every violated rule, one per line
 */
function validateSparkUiPrefix(prefix?: string): void {
  if (prefix && !Token.isUnresolved(prefix)) {
    // Each entry pairs "is the rule satisfied?" with the message reported when it is not
    const rules: Array<[boolean, string]> = [
      [prefix.startsWith('/'), 'Prefix must begin with \'/\''],
      [!prefix.endsWith('/'), 'Prefix must not end with \'/\''],
    ];
    const violations = rules.filter(([satisfied]) => !satisfied).map(([, message]) => message);
    if (violations.length === 0) {
      return;
    }
    throw new Error([`Invalid prefix format (value: ${prefix})`, ...violations].join(EOL));
  }
}
/**
 * Converts a Spark UI prefix into the object key pattern used to scope the bucket grant.
 *
 * The first character of the prefix (the leading `'/'` that validation requires) is
 * dropped and `'/*'` is appended, so `'/foo/bar'` becomes `'foo/bar/*'`.
 *
 * An empty prefix is handled like a missing one. Validation lets it through, and
 * slicing it would produce the pattern `'/*'`, which does not describe "no prefix".
 *
 * @param prefix the Spark UI prefix, if any
 * @returns the key pattern for the grant, or undefined when there is no prefix
 */
function cleanSparkUiPrefixForGrant(prefix?: string): string | undefined {
  if (!prefix) {
    return undefined;
  }
  return `${prefix.slice(1)}/*`;
}