Skip to content

Commit

Permalink
fix(orchestration): tweak config for faster failures and less wasted …
Browse files Browse the repository at this point in the history
…performance
  • Loading branch information
mrgrain committed Aug 29, 2024
1 parent b166dc4 commit 5fe0cd2
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 116 deletions.
3 changes: 2 additions & 1 deletion projenrc/magic-ecs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ function newEcsTask(project: TypeScriptProject, entrypoint: string) {
main.line();

main.open('async function main(): Promise<void> {');
main.line('const heartbeat = setInterval(sendHeartbeat, 180_000);'); // Heartbeat is only expected every 10min
main.line('// Heartbeat is expected every 5min');
main.line('const heartbeat = setInterval(sendHeartbeat, 90_000);');
main.line('try {');
// Deserialize the input, which ECS provides as a sequence of JSON objects. We skip the first 2 values (argv[0] is the
// node binary, and argv[1] is this JS file).
Expand Down
65 changes: 15 additions & 50 deletions src/__tests__/__snapshots__/construct-hub.test.ts.snap

Large diffs are not rendered by default.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 3 additions & 10 deletions src/__tests__/devapp/__snapshots__/snapshot.test.ts.snap

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions src/backend/orchestration/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,8 @@ export class Orchestration extends Construct {
.otherwise(new Succeed(this, 'Done'))
);

const transliteratorTimeout = Duration.hours(2);
// aws-cdk-lib is one of the biggest libs and runs in ~8min
const transliteratorTimeout = Duration.hours(1);

this.ecsCluster = new Cluster(this, 'Cluster', {
containerInsights: true,
Expand Down Expand Up @@ -380,7 +381,7 @@ export class Orchestration extends Construct {
// cases the first heartbeat may take a while to come back due to
// the time it takes to provision the task in the cluster, so we
// give a more generous buffer here.
heartbeat: Duration.minutes(10),
heartbeat: Duration.minutes(5),
})
// Do not retry NoSpaceLeftOnDevice errors, these are typically not transient.
.addRetry({
Expand All @@ -407,7 +408,9 @@ export class Orchestration extends Construct {
errors: ['States.Timeout'], // The task has stopped responding, or is just taking a long time to provision
// To compensate we'll give more retries and pause between them in
// case it's just a transient issue.
maxAttempts: 5,
backoffRate: 2,
interval: Duration.seconds(30),
maxAttempts: 3,
})
.addRetry({ maxAttempts: 3 })
.addCatch(ignore, { errors: [UNPROCESSABLE_PACKAGE_ERROR_NAME] })
Expand Down
31 changes: 20 additions & 11 deletions src/backend/transliterator/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import {
ICluster,
LogDrivers,
OperatingSystemFamily,
UlimitName,
} from 'aws-cdk-lib/aws-ecs';
import { Effect, PolicyStatement } from 'aws-cdk-lib/aws-iam';
import { ILogGroup, LogGroup, RetentionDays } from 'aws-cdk-lib/aws-logs';
Expand Down Expand Up @@ -174,16 +173,26 @@ export class Transliterator extends Construct {
},
}),
});
// Encountered an error of "EMFILE: too many open files" in ECS.
// Default nofile ulimit is 1024/4096.
//
// For ECS ulimit documentation see: https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Ulimit.html
// For construct hub tracking issue see: https://github.com/cdklabs/construct-hub/issues/982
this.containerDefinition.addUlimits({
name: UlimitName.NOFILE, // file descriptors
softLimit: 16_384,
hardLimit: 65_535,
});

/**
* Construct hub tracking issue: https://github.com/cdklabs/construct-hub/issues/982
*
* Encountered one of these errors in ECS:
* - EMFILE: too many open files
* - Error: getaddrinfo EBUSY states.us-east-1.amazonaws.com
*
* This issue occurs because the container is running out of file descriptors, to read files or make network requests.
* Normally this limit is configured via the `ulimit` setting. However on AWS Fargate a much lower limit applies.
* For ECS ulimit documentation see: https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Ulimit.html
*
* We currently don't believe a specific ulimit configuration is needed, the defaults (which are the max) should work.
* For references, here is the previously used config.
*/
// this.containerDefinition.addUlimits({
// name: UlimitName.NOFILE, // file descriptors
// softLimit: 16_384,
// hardLimit: 65_535,
// });

repository?.grantReadFromRepository(this.taskDefinition.taskRole);

Expand Down
3 changes: 2 additions & 1 deletion src/backend/transliterator/transliterator.ecs-entrypoint.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -1614,14 +1614,7 @@
}
}
},
"Name": "Resource",
"Ulimits": [
{
"HardLimit": 65535,
"Name": "nofile",
"SoftLimit": 16384
}
]
"Name": "Resource"
}
],
"Cpu": "4096",
Expand Down

0 comments on commit 5fe0cd2

Please sign in to comment.