in src/backend/orchestration/index.ts [154:356]
public constructor(scope: Construct, id: string, props: OrchestrationProps) {
super(scope, id);
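// Dead-letter queue where inputs of failed workflow steps are sent once retries are exhausted, so they can be inspected and redriven by an operator.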
this.deadLetterQueue = new Queue(this, 'DLQ', {
encryption: QueueEncryption.KMS_MANAGED,
retentionPeriod: Duration.days(14),
visibilityTimeout: Duration.minutes(15),
});
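// Alarm as soon as the DLQ contains any message, whether visible or currently in flight.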
props.monitoring.addHighSeverityAlarm(
'Backend Orchestration Dead-Letter Queue is not empty',
new MathExpression({
expression: 'm1 + m2',
label: 'Dead-Letter Queue not empty',
usingMetrics: {
m1: this.deadLetterQueue.metricApproximateNumberOfMessagesVisible({ period: Duration.minutes(1) }),
m2: this.deadLetterQueue.metricApproximateNumberOfMessagesNotVisible({ period: Duration.minutes(1) }),
},
}).createAlarm(this, 'DLQAlarm', {
alarmName: `${this.deadLetterQueue.node.path}/NotEmpty`,
alarmDescription: [
'Backend orchestration dead-letter queue is not empty.',
'',
`RunBook: ${RUNBOOK_URL}`,
'',
`Direct link to queue: ${sqsQueueUrl(this.deadLetterQueue)}`,
'Warning: State Machine executions that sent messages to the DLQ will not show as "failed".',
].join('\n'),
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
evaluationPeriods: 1,
threshold: 1,
}),
);
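// Forwards the full task input to the DLQ, then ends the execution as successful: the DLQ alarm above, not a failed execution, is what surfaces the problem.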
const sendToDeadLetterQueue = new tasks.SqsSendMessage(this, 'Send to Dead Letter Queue', {
messageBody: TaskInput.fromJsonPathAt('$'),
queue: this.deadLetterQueue,
resultPath: JsonPath.DISCARD,
}).next(new Succeed(this, 'Sent to DLQ'));
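// No-op end state used to silently drop packages that cannot be processed.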
const ignore = new Pass(this, 'Ignore');
this.catalogBuilder = new CatalogBuilder(this, 'CatalogBuilder', props);
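// Invokes the catalog builder to add this package version to catalog.json, keeping only the updated object's ETag and VersionId in the state.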
const addToCatalog = new tasks.LambdaInvoke(this, 'Add to catalog.json', {
lambdaFunction: this.catalogBuilder.function,
resultPath: '$.catalogBuilderOutput',
resultSelector: {
'ETag.$': '$.Payload.ETag',
'VersionId.$': '$.Payload.VersionId',
},
})
// The catalog builder has a concurrency of 1, so we want to aggressively retry being throttled here.
.addRetry({ errors: ['Lambda.TooManyRequestsException'], ...THROTTLE_RETRY_POLICY })
.addCatch(sendToDeadLetterQueue, { errors: ['Lambda.TooManyRequestsException'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.TaskFailed'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.ALL'], resultPath: '$.error' });
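// Lambda that determines whether this package version actually requires a catalog.json update.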
const needsCatalogUpdateFunction = new NeedsCatalogUpdate(this, 'NeedsCatalogUpdate', {
architecture: gravitonLambdaIfAvailable(this),
description: '[ConstructHub/Orchestration/NeedsCatalogUpdate] Determines whether a package version requires a catalog update',
environment: { CATALOG_BUCKET_NAME: props.bucket.bucketName, CATALOG_OBJECT_KEY: CATALOG_KEY },
memorySize: 1_024,
timeout: Duration.minutes(1),
});
props.bucket.grantRead(needsCatalogUpdateFunction);
// Check whether the catalog needs updating. If so, trigger addToCatalog.
const addToCatalogIfNeeded = new tasks.LambdaInvoke(this, 'Check whether catalog needs updating', {
lambdaFunction: needsCatalogUpdateFunction,
payloadResponseOnly: true,
resultPath: '$.catalogNeedsUpdating',
})
.addRetry({
errors: [
'Lambda.TooManyRequestsException',
'Lambda.Unknown', // happens when a lambda times out.
],
...THROTTLE_RETRY_POLICY,
})
.addCatch(sendToDeadLetterQueue, { errors: ['Lambda.TooManyRequestsException', 'Lambda.Unknown'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.TaskFailed'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.ALL'], resultPath: '$.error' })
.next(new Choice(this, 'Is catalog update needed?')
.when(Condition.booleanEquals('$.catalogNeedsUpdating', true), addToCatalog)
.otherwise(new Succeed(this, 'Done')),
);
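// ECS cluster (with Fargate capacity providers) on which the documentation generation tasks run.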
this.ecsCluster = new Cluster(this, 'Cluster', {
containerInsights: true,
enableFargateCapacityProviders: true,
vpc: props.vpc,
});
this.transliterator = new Transliterator(this, 'Transliterator', props);
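// State machine definition: record execution metadata, generate documentation on ECS, then update the catalog if needed.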
const definition = new Pass(this, 'Track Execution Infos', {
inputPath: '$$.Execution',
parameters: {
'Id.$': '$.Id',
'Name.$': '$.Name',
'RoleArn.$': '$.RoleArn',
'StartTime.$': '$.StartTime',
},
resultPath: '$.$TaskExecution',
})
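// Serialize the entire state (including the execution info captured above) into a single JSON string, used as the one-element command array for the doc-gen container.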
.next(
new Pass(this, 'Prepare doc-gen ECS Command', {
parameters: { 'command.$': 'States.Array(States.JsonToString($))' },
resultPath: '$.docGen',
}),
)
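// Run the transliterator task on ECS to generate documentation for this package version.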
.next(
this.transliterator.createEcsRunTask(this, 'Generate docs', {
cluster: this.ecsCluster,
inputPath: '$.docGen.command',
resultPath: '$.docGenOutput',
// aws-cdk-lib succeeds in roughly 1 hour, so this should give us
// enough of a buffer and probably account for all other libraries out there.
timeout: Duration.hours(2),
vpcSubnets: props.vpcSubnets,
securityGroups: props.vpcSecurityGroups,
})
// Do not retry NoSpaceLeftOnDevice errors, as these are typically not transient.
.addRetry({ errors: ['jsii-docgen.NoSpaceLeftOnDevice'], maxAttempts: 0 })
.addRetry({
errors: [
'ECS.AmazonECSException', // Task failed to start, usually due to throttling or lack of capacity
'ECS.InvalidParameterException', // Returned when ECS gets throttled while trying to access VPC/SGs.
'jsii-docgen.NpmError.E429', // HTTP 429 ("Too Many Requests") from CodeArtifact's S3 bucket
'jsii-docgen.NpmError.EPROTO', // Sporadic TLS negotiation failures we see in logs, transient
],
...DOCGEN_THROTTLE_RETRY_POLICY,
})
.addRetry({
errors: ['jsii-docgen.NpmError.ETARGET'], // Seen when dependencies aren't available yet
// We'll wait longer between retries. This is to account for CodeArtifact's lag behind npm
backoffRate: 2,
interval: Duration.minutes(5),
maxAttempts: 3,
})
.addRetry({ maxAttempts: 3 }) // Fall-back retry for any other error.
.addCatch(ignore, { errors: [UNPROCESSABLE_PACKAGE_ERROR_NAME] })
.addCatch(sendToDeadLetterQueue, { errors: ['States.Timeout'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['ECS.AmazonECSException', 'ECS.InvalidParameterException'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.TaskFailed'], resultPath: '$.error' })
.addCatch(sendToDeadLetterQueue, { errors: ['States.ALL'], resultPath: '$.error' })
.next(addToCatalogIfNeeded),
);
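// The orchestration state machine itself, with X-Ray tracing enabled and a name derived from the construct path.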
this.stateMachine = new StateMachine(this, 'Resource', {
definition,
stateMachineName: stateMachineNameFrom(this.node.path),
timeout: Duration.days(1), // Ample time for retries, etc...
tracingEnabled: true,
});
if (props.vpc) {
// Ensure the State Machine does not get to run before the VPC can be used.
this.stateMachine.node.addDependency(props.vpc.internetConnectivityEstablished);
}
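// Alarm whenever an execution of the orchestration state machine fails.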
props.monitoring.addHighSeverityAlarm(
'Backend Orchestration Failed',
this.stateMachine.metricFailed()
.createAlarm(this, 'OrchestrationFailed', {
alarmName: `${this.stateMachine.node.path}/${this.stateMachine.metricFailed().metricName}`,
alarmDescription: [
'Backend orchestration failed!',
'',
`RunBook: ${RUNBOOK_URL}`,
'',
`Direct link to state machine: ${stateMachineUrl(this.stateMachine)}`,
'Warning: messages that resulted in a failed execution will NOT be in the DLQ!',
].join('\n'),
comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
evaluationPeriods: 1,
threshold: 1,
}));
// This function is intended to be manually triggered by an operator to
// attempt redriving messages from the DLQ.
this.redriveFunction = new RedriveStateMachine(this, 'Redrive', {
description: '[ConstructHub/Redrive] Manually redrives all messages from the backend dead letter queue',
environment: {
STATE_MACHINE_ARN: this.stateMachine.stateMachineArn,
QUEUE_URL: this.deadLetterQueue.queueUrl,
},
memorySize: 1_024,
timeout: Duration.minutes(15),
tracing: Tracing.ACTIVE,
});
this.stateMachine.grantStartExecution(this.redriveFunction);
this.deadLetterQueue.grantConsumeMessages(this.redriveFunction);
// The workflow is intended to be manually triggered by an operator to
// reprocess all package versions currently in store through the orchestrator.
this.regenerateAllDocumentation = new RegenerateAllDocumentation(this, 'RegenerateAllDocumentation', {
bucket: props.bucket,
stateMachine: this.stateMachine,
}).stateMachine;
}