Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GENP-3110 Batch job monitor #100

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

141 changes: 141 additions & 0 deletions src/batch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import * as batch from '@aws-cdk/aws-batch';
import * as events from '@aws-cdk/aws-events';
import * as events_targets from '@aws-cdk/aws-events-targets';
import * as sns from '@aws-cdk/aws-sns';
import { Construct, Duration } from '@aws-cdk/core';
import { IWatchful } from './api';

// Presumably the fallback for `WatchBatchJobsOptions.durationThresholdPercent`
// (which documents `@default 80`).
// NOTE(review): not referenced anywhere in the visible part of this file — confirm
// it is still needed, otherwise `noUnusedLocals` builds will reject it.
const DEFAULT_DURATION_THRESHOLD_PERCENT = 80;

/**
* Optional tuning knobs for `WatchBatchJobs`.
*
* NOTE(review): the `WatchBatchJobs` constructor visible in this file reads only
* `batchQueue` and `alarmTopic`; none of the options below are consumed yet.
* Several descriptions refer to "function" / "lambdas" — confirm the intended
* meaning for Batch jobs before relying on them.
*/
export interface WatchBatchJobsOptions {
/**
* Flag to disable alerting on errors.
*
* @default false
*/
readonly errorsDisableAlerts?: boolean;

/**
* Number of allowed errors per minute. If there are more errors than that, an alarm will trigger.
*
* @default 0
*/
readonly errorsPerMinuteThreshold?: number;

/**
* Flag to enable alerting on the invocations metric.
*
* @default false
*/
readonly invocationsEnableAlerts?: boolean;

/**
* Threshold for alerting on invocations.
*
* NOTE(review): typed as `Duration` although it is described as an invocation
* threshold, and no default is documented — confirm the intended type.
*/
readonly invocationsThreshold?: Duration;

/**
* Number of allowed throttles per minute.
*
* @default 0
*/
readonly throttlesPerMinuteThreshold?: number;

/**
* Threshold for the duration alarm as percentage of the function's timeout
* value.
*
* If this is set to 50%, the alarm will be set when p99 latency of the
* function exceeds 50% of the function's timeout setting.
*
* @default 80
*/
readonly durationThresholdPercent?: number;

/**
* Override duration timeout threshold, in seconds.
* Necessary for lambdas that aren't created via the CDK.
* This value is still adjusted by durationThresholdPercent.
*
* @default 3
*/
readonly durationTimeoutSec?: number;

/**
* Send notifications to resolve alerts.
*
* @default false
*/
readonly autoResolveEvents?: boolean;
}

/**
* Construction properties for `WatchBatchJobs`: the optional tuning options plus
* the required wiring (queue to observe and topic to notify).
*/
export interface WatchBatchJobsProps extends WatchBatchJobsOptions {
/**
* Display title for this monitor.
* NOTE(review): only referenced from commented-out dashboard code in the
* visible constructor.
*/
readonly title: string;
/**
* Owning Watchful instance.
* NOTE(review): not read by the visible `WatchBatchJobs` constructor.
*/
readonly watchful: IWatchful;
/** Batch job queue whose events are matched by the EventBridge rules. */
readonly batchQueue: batch.IJobQueue;
/** SNS topic used as the target of the EventBridge rules. */
readonly alarmTopic: sns.ITopic;
}

/**
 * Forwards AWS Batch events for a single job queue to an SNS topic.
 *
 * Constructing this creates an EventBridge rule that publishes every `FAILED`
 * job state change on the queue to `alarmTopic`.
 *
 * The rule for blocked (stuck in `RUNNABLE`) jobs is NOT created automatically;
 * call `createStuckJobsMonitor()` once on the instance to opt in.
 */
export class WatchBatchJobs extends Construct {

  private readonly batchQueue: batch.IJobQueue;
  private readonly alarmTopic: sns.ITopic;

  /**
   * @param scope parent construct
   * @param id construct id, unique within `scope`
   * @param props queue to observe and topic to notify; the remaining
   *              `WatchBatchJobsOptions` fields are currently not read
   */
  constructor(scope: Construct, id: string, props: WatchBatchJobsProps) {
    super(scope, id);

    this.batchQueue = props.batchQueue;
    this.alarmTopic = props.alarmTopic;

    // TODO: add a dashboard section for the queue; the draft below still
    // points at the Lambda console helpers and must be adapted first.
    // this.watchful.addSection(props.title, {
    //   links: [
    //     { title: 'AWS Lambda Console', url: linkForLambdaFunction(this.fn) },
    //     { title: 'CloudWatch Logs', url: linkForLambdaLogs(this.fn) },
    //   ],
    // });

    this.createFailedJobsMonitor();
  }

  /**
   * Creates the EventBridge rule `FailedJobsRule`, which sends the raw
   * "Batch Job State Change" event to the alarm topic whenever a job on the
   * watched queue reaches the `FAILED` status.
   *
   * Already invoked by the constructor; calling it a second time on the same
   * instance would try to register a duplicate construct id.
   */
  createFailedJobsMonitor(): void {
    new events.Rule(this, 'FailedJobsRule', {
      eventPattern: {
        source: ['aws.batch'],
        detailType: ['Batch Job State Change'],
        detail: {
          status: ['FAILED'],
          // AWS Batch puts the queue ARN (not its name) in `detail.jobQueue`,
          // so the pattern has to match on the ARN to ever fire.
          jobQueue: [this.batchQueue.jobQueueArn],
        },
      },
      targets: [new events_targets.SnsTopic(this.alarmTopic)],
    });
  }

  /**
   * Creates the EventBridge rule `StuckJobsRule`, which publishes a fixed
   * notification to the alarm topic when AWS Batch emits a
   * "Batch Job Queue Blocked" event for the watched queue.
   *
   * Not invoked by the constructor; call at most once per instance.
   */
  createStuckJobsMonitor(): void {
    const queue = this.batchQueue;
    new events.Rule(this, 'StuckJobsRule', {
      eventPattern: {
        source: ['aws.batch'],
        detailType: ['Batch Job Queue Blocked'],
        detail: {
          // `detail.jobQueue` carries the queue ARN in Batch events.
          jobQueue: [queue.jobQueueArn],
        },
      },
      targets: [new events_targets.SnsTopic(this.alarmTopic, {
        message: events.RuleTargetInput.fromObject({
          message: 'Batch job stuck in RUNNABLE state',
          // The human-readable name is kept in the notification payload.
          jobQueue: queue.jobQueueName,
        }),
      })],
    });
  }
}

// function linkForLambdaFunction(fn: lambda.IFunction, tab = 'graph') {
// return `https://console.aws.amazon.com/lambda/home?region=${fn.stack.region}#/functions/${fn.functionName}?tab=${tab}`;
// }

// function linkForLambdaLogs(fn: lambda.IFunction) {
// return `https://console.aws.amazon.com/cloudwatch/home?region=${fn.stack.region}#logEventViewer:group=/aws/lambda/${fn.functionName}`;
// }
8 changes: 8 additions & 0 deletions src/watchful.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import * as sns from '@aws-cdk/aws-sns';
import * as sns_subscriptions from '@aws-cdk/aws-sns-subscriptions';
import * as sqs from '@aws-cdk/aws-sqs';
import * as stepfunctions from '@aws-cdk/aws-stepfunctions';
import * as batch from '@aws-cdk/aws-batch';
import { Construct, CfnOutput, Aspects } from '@aws-cdk/core';
import { IWatchful, SectionOptions } from './api';
import { WatchApiGatewayOptions, WatchApiGateway } from './api-gateway';
Expand All @@ -22,6 +23,7 @@ import { WatchLambdaFunctionOptions, WatchLambdaFunction } from './lambda';
import { WatchRdsAuroraOptions, WatchRdsAurora } from './rds-aurora';
import { WatchSqsOptions, WatchSqsService } from './sqs';
import { WatchStateMachineOptions, WatchStateMachine } from './state-machine';
import { WatchBatchJobs } from './batch';

export interface WatchfulProps {
readonly alarmEmail?: string;
Expand Down Expand Up @@ -148,6 +150,12 @@ export class Watchful extends Construct implements IWatchful {
title, watchful: this, sqs, ...options,
});
}

/**
 * Attaches a `WatchBatchJobs` monitor for the given Batch job queue.
 *
 * The child construct id is taken from the queue's node address.
 *
 * @param title title passed through to the monitor
 * @param batchQueue job queue to observe
 * @param alarmTopic SNS topic handed to the monitor as its notification target
 * @returns the newly created `WatchBatchJobs` construct
 */
public watchBatchJobs(title: string, batchQueue: batch.IJobQueue, alarmTopic: sns.ITopic) {
  const constructId = batchQueue.node.addr;
  const monitorProps = {
    title,
    watchful: this,
    batchQueue,
    alarmTopic,
  };
  return new WatchBatchJobs(this, constructId, monitorProps);
}
}

function linkForDashboard(dashboard: cloudwatch.Dashboard) {
Expand Down
Loading
Loading