Skip to content

Commit 4dbf11a

Browse files
authored
Run Engine 2: More robust attempt failing/retrying (inc. OOM retrying) (#1773)
* Added describe to tests that were missing it * Added a function to get the maxOldSpaceSize * Make it easy to take `NODE_OPTIONS` and set the old space flag * Added a zed task to rebuild the packages * Moved isOOMRunError and added SIBABRT condition * Deduplication flags function with tests * Export flags file * On TaskRunProcess, set max old space and deduplicate the flags with priority order * Move retrying logic to a separate function, it was getting very messy * Created new test file for attempt failures * Allow setting retry settings for tests * Some retrying tests, including OOM * More failure condition tests * Fix for OOM retrying * Complete the attempt span if it was an OOM error * Remove old broken import * Fixed order of exports * Fix for docker-provider checkpoints import
1 parent e297c7f commit 4dbf11a

File tree

22 files changed

+1534
-403
lines changed

22 files changed

+1534
-403
lines changed

.zed/tasks.json

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
[
2+
{
3+
"label": "Build packages",
4+
"command": "pnpm run build --filter \"@trigger.dev/*\" --filter trigger.dev",
5+
//"args": [],
6+
// Env overrides for the command, will be appended to the terminal's environment from the settings.
7+
"env": { "foo": "bar" },
8+
// Current working directory to spawn the command into, defaults to current project root.
9+
//"cwd": "/path/to/working/directory",
10+
// Whether to use a new terminal tab or reuse the existing one to spawn the process, defaults to `false`.
11+
"use_new_terminal": false,
12+
// Whether to allow multiple instances of the same task to be run, or rather wait for the existing ones to finish, defaults to `false`.
13+
"allow_concurrent_runs": false,
14+
// What to do with the terminal pane and tab, after the command was started:
15+
// * `always` — always show the task's pane, and focus the corresponding tab in it (default)
16+
// * `no_focus` — always show the task's pane, add the task's tab in it, but don't focus it
17+
// * `never` — do not alter focus, but still add/reuse the task's tab in its pane
18+
"reveal": "always",
19+
// What to do with the terminal pane and tab, after the command has finished:
20+
// * `never` — Do nothing when the command finishes (default)
21+
// * `always` — always hide the terminal tab, hide the pane also if it was the last tab in it
22+
// * `on_success` — hide the terminal tab on task success only, otherwise behaves similar to `always`
23+
"hide": "never",
24+
// Which shell to use when running a task inside the terminal.
25+
// May take 3 values:
26+
// 1. (default) Use the system's default terminal configuration in /etc/passwd
27+
// "shell": "system"
28+
// 2. A program:
29+
// "shell": {
30+
// "program": "sh"
31+
// }
32+
// 3. A program with arguments:
33+
// "shell": {
34+
// "with_arguments": {
35+
// "program": "/bin/bash",
36+
// "args": ["--login"]
37+
// }
38+
// }
39+
"shell": "system",
40+
// Whether to show the task line in the output of the spawned task, defaults to `true`.
41+
"show_summary": true,
42+
// Whether to show the command line in the output of the spawned task, defaults to `true`.
43+
"show_output": true
44+
}
45+
]

apps/docker-provider/src/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
} from "@trigger.dev/core/v3/apps";
99
import { SimpleLogger } from "@trigger.dev/core/v3/apps";
1010
import { isExecaChildProcess } from "@trigger.dev/core/v3/apps";
11-
import { testDockerCheckpoint } from "@trigger.dev/core/v3/checkpoints";
11+
import { testDockerCheckpoint } from "@trigger.dev/core/v3/serverOnly";
1212
import { setTimeout } from "node:timers/promises";
1313
import { PostStartCauses, PreStopCauses } from "@trigger.dev/core/v3";
1414

apps/webapp/app/v3/services/alerts/deliverAlert.server.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import {
1313
RunFailedWebhook,
1414
DeploymentFailedWebhook,
1515
DeploymentSuccessWebhook,
16+
isOOMRunError,
1617
} from "@trigger.dev/core/v3";
1718
import assertNever from "assert-never";
1819
import { subtle } from "crypto";
@@ -39,7 +40,6 @@ import { generateFriendlyId } from "~/v3/friendlyIdentifiers";
3940
import { ProjectAlertChannelType, ProjectAlertType } from "@trigger.dev/database";
4041
import { alertsRateLimiter } from "~/v3/alertsRateLimiter.server";
4142
import { v3RunPath } from "~/utils/pathBuilder";
42-
import { isOOMError } from "../completeAttempt.server";
4343
import { ApiRetrieveRunPresenter } from "~/presenters/v3/ApiRetrieveRunPresenter.server";
4444

4545
type FoundAlert = Prisma.Result<
@@ -375,7 +375,7 @@ export class DeliverAlertService extends BaseService {
375375
idempotencyKey: alert.taskRun.idempotencyKey ?? undefined,
376376
tags: alert.taskRun.runTags,
377377
error,
378-
isOutOfMemoryError: isOOMError(error),
378+
isOutOfMemoryError: isOOMRunError(error),
379379
machine: alert.taskRun.machinePreset ?? "Unknown",
380380
dashboardUrl: `${env.APP_ORIGIN}${v3RunPath(
381381
alert.project.organization,

apps/webapp/app/v3/services/completeAttempt.server.ts

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
TaskRunSuccessfulExecutionResult,
1212
flattenAttributes,
1313
isManualOutOfMemoryError,
14+
isOOMRunError,
1415
sanitizeError,
1516
shouldRetryError,
1617
taskRunErrorEnhancer,
@@ -255,7 +256,7 @@ export class CompleteAttemptService extends BaseService {
255256

256257
let retriableError = shouldRetryError(taskRunErrorEnhancer(completion.error));
257258
let isOOMRetry = false;
258-
let isOOMAttempt = isOOMError(completion.error);
259+
let isOOMAttempt = isOOMRunError(completion.error);
259260
let isOnMaxOOMMachine = false;
260261
let oomMachine: MachinePresetName | undefined;
261262

@@ -738,45 +739,6 @@ async function findAttempt(prismaClient: PrismaClientOrTransaction, friendlyId:
738739
});
739740
}
740741

741-
export function isOOMError(error: TaskRunError) {
742-
if (error.type === "INTERNAL_ERROR") {
743-
if (
744-
error.code === "TASK_PROCESS_OOM_KILLED" ||
745-
error.code === "TASK_PROCESS_MAYBE_OOM_KILLED"
746-
) {
747-
return true;
748-
}
749-
750-
// For the purposes of retrying on a larger machine, we're going to treat this is an OOM error.
751-
// This is what they look like if we're executing using k8s. They then get corrected later, but it's too late.
752-
// {"code": "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE", "type": "INTERNAL_ERROR", "message": "Process exited with code -1 after signal SIGKILL."}
753-
if (
754-
error.code === "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" &&
755-
error.message &&
756-
error.message.includes("SIGKILL") &&
757-
error.message.includes("-1")
758-
) {
759-
return true;
760-
}
761-
}
762-
763-
if (error.type === "BUILT_IN_ERROR") {
764-
// ffmpeg also does weird stuff
765-
// { "name": "Error", "type": "BUILT_IN_ERROR", "message": "ffmpeg was killed with signal SIGKILL" }
766-
if (error.message && error.message.includes("ffmpeg was killed with signal SIGKILL")) {
767-
return true;
768-
}
769-
}
770-
771-
// Special `OutOfMemoryError` for doing a manual OOM kill.
772-
// Useful if a native library does an OOM but doesn't actually crash the run and you want to manually
773-
if (isManualOutOfMemoryError(error)) {
774-
return true;
775-
}
776-
777-
return false;
778-
}
779-
780742
function exitRun(runId: string) {
781743
socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
782744
version: "v1",

0 commit comments

Comments
 (0)