From 8e84846cc2cadbf1b9839a32d0d6d463b8568604 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:08:37 +0100 Subject: [PATCH 01/35] update nypm to support text-based bun lockfiles --- packages/cli-v3/package.json | 2 +- pnpm-lock.yaml | 49 ++++++++++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/packages/cli-v3/package.json b/packages/cli-v3/package.json index b16cee9719..352fb43f88 100644 --- a/packages/cli-v3/package.json +++ b/packages/cli-v3/package.json @@ -109,7 +109,7 @@ "magicast": "^0.3.4", "minimatch": "^10.0.1", "mlly": "^1.7.1", - "nypm": "^0.3.9", + "nypm": "^0.5.4", "object-hash": "^3.0.0", "open": "^10.0.3", "p-limit": "^6.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 04abb649d5..ae295a72e1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1216,8 +1216,8 @@ importers: specifier: ^1.7.1 version: 1.7.1 nypm: - specifier: ^0.3.9 - version: 0.3.9 + specifier: ^0.5.4 + version: 0.5.4 object-hash: specifier: ^3.0.0 version: 3.0.0 @@ -19483,6 +19483,12 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + /acorn@8.14.1: + resolution: {integrity: sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==} + engines: {node: '>=0.4.0'} + hasBin: true + dev: false + /acorn@8.8.1: resolution: {integrity: sha512-7zFpHzhnqYKrkYdUjF1HI1bzd0VygEGX8lFk4k5zVMqHEoES+P+7TKI+EvLO9WVMJ8eekdO0aDEK044xTXwPPA==} engines: {node: '>=0.4.0'} @@ -21004,7 +21010,7 @@ packages: /citty@0.1.6: resolution: {integrity: sha512-tskPPKEs8D2KPafUypv2gxwJP8h/OaJmC82QQGGDQcHvXX43xF2VDACcJVmZ0EuSxkpO9Kc4MlrA3q0+FG58AQ==} dependencies: - consola: 3.2.3 + consola: 3.4.2 dev: false /cjs-module-lexer@1.2.3: @@ -21334,6 +21340,10 @@ packages: /confbox@0.1.7: resolution: {integrity: sha512-uJcB/FKZtBMCJpK8MQji6bJHgu1tixKPxRLeGkNzBoOZzpnZUJm0jm2/sBDWcuBx1dYgxV4JU+g5hmNxCyAmdA==} + /confbox@0.1.8: + resolution: {integrity: 
sha512-RMtmw0iFkeR4YV+fUOSucriAQNb9g8zFR52MWCtl+cCZOFRNL6zeB395vPzFhEjjn4fMxXudmELnl/KF/WrK6w==} + dev: false + /config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} dependencies: @@ -27925,6 +27935,15 @@ packages: pkg-types: 1.1.3 ufo: 1.5.4 + /mlly@1.7.4: + resolution: {integrity: sha512-qmdSIPC4bDJXgZTCR7XosJiNKySV7O215tsPtDN9iEO/7q/76b/ijtgRu/+epFXSJhijtTCCGp3DWS549P3xKw==} + dependencies: + acorn: 8.14.1 + pathe: 2.0.3 + pkg-types: 1.3.1 + ufo: 1.5.4 + dev: false + /module-details-from-path@1.0.3: resolution: {integrity: sha512-ySViT69/76t8VhE1xXHK6Ch4NcDd26gx0MzKXLO+F7NOtnqH68d9zF94nT8ZWSxXh8ELOERsnJO/sWt1xZYw5A==} @@ -28528,13 +28547,26 @@ packages: hasBin: true dependencies: citty: 0.1.6 - consola: 3.2.3 + consola: 3.4.2 execa: 8.0.1 pathe: 1.1.2 pkg-types: 1.1.3 ufo: 1.5.4 dev: false + /nypm@0.5.4: + resolution: {integrity: sha512-X0SNNrZiGU8/e/zAB7sCTtdxWTMSIO73q+xuKgglm2Yvzwlo8UoC5FNySQFCvl84uPaeADkqHUZUkWy4aH4xOA==} + engines: {node: ^14.16.0 || >=16.10.0} + hasBin: true + dependencies: + citty: 0.1.6 + consola: 3.4.2 + pathe: 2.0.3 + pkg-types: 1.3.1 + tinyexec: 0.3.2 + ufo: 1.5.4 + dev: false + /oauth-sign@0.9.0: resolution: {integrity: sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==} dev: false @@ -29260,7 +29292,6 @@ packages: /pathe@2.0.3: resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} - dev: true /pathval@1.1.1: resolution: {integrity: sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==} @@ -29456,6 +29487,14 @@ packages: mlly: 1.7.1 pathe: 1.1.2 + /pkg-types@1.3.1: + resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==} + dependencies: + confbox: 0.1.8 + mlly: 1.7.4 + pathe: 2.0.3 + dev: false + /platform@1.3.6: resolution: 
{integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} dev: false From a3c94fda85102143397ff1b776d10c858da3d10a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:55:51 +0100 Subject: [PATCH 02/35] fix retry spans --- apps/webapp/app/v3/runEngineHandlers.server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 6f236cf3ed..1a71157289 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -333,6 +333,7 @@ export function registerRunEngineEventBusHandlers() { } await eventRepository.recordEvent(retryMessage, { + startTime: BigInt(time.getTime() * 1000000), taskSlug: run.taskIdentifier, environment, attributes: { @@ -347,7 +348,6 @@ export function registerRunEngineEventBusHandlers() { queueName: run.queue, }, context: run.traceContext as Record, - spanIdSeed: `retry-${run.attemptNumber + 1}`, endTime: retryAt, }); } catch (error) { From 74e7c270feb160bd4ff9b608ef8d230e06d74846 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 10 Apr 2025 16:01:00 +0100 Subject: [PATCH 03/35] only download debug logs if admin --- .../resources.runs.$runParam.logs.download.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts index bbc58d3de3..9bfe5dc9e2 100644 --- a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts +++ b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts @@ -1,16 +1,17 @@ import { LoaderFunctionArgs } from "@remix-run/server-runtime"; import { prisma } from "~/db.server"; -import { requireUserId } from "~/services/session.server"; +import { 
requireUser } from "~/services/session.server"; import { v3RunParamsSchema } from "~/utils/pathBuilder"; -import { PreparedEvent, RunPreparedEvent, eventRepository } from "~/v3/eventRepository.server"; +import { RunPreparedEvent, eventRepository } from "~/v3/eventRepository.server"; import { createGzip } from "zlib"; import { Readable } from "stream"; import { formatDurationMilliseconds } from "@trigger.dev/core/v3/utils/durations"; import { getDateFromNanoseconds } from "~/utils/taskEvent"; import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; +import { TaskEventKind } from "@trigger.dev/database"; export async function loader({ params, request }: LoaderFunctionArgs) { - const userId = await requireUserId(request); + const user = await requireUser(request); const parsedParams = v3RunParamsSchema.pick({ runParam: true }).parse(params); const run = await prisma.taskRun.findFirst({ @@ -20,7 +21,7 @@ export async function loader({ params, request }: LoaderFunctionArgs) { organization: { members: { some: { - userId, + userId: user.id, }, }, }, @@ -44,6 +45,10 @@ export async function loader({ params, request }: LoaderFunctionArgs) { read() { runEvents.forEach((event) => { try { + if (!user.admin && event.kind === TaskEventKind.LOG) { + // Only return debug logs for admins + return; + } this.push(formatRunEvent(event) + "\n"); } catch {} }); From 392b96f32b8dcce1e3973a05ffda74748f00a394 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 10 Apr 2025 16:06:01 +0100 Subject: [PATCH 04/35] add nypm changeset --- .changeset/polite-lies-fix.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/polite-lies-fix.md diff --git a/.changeset/polite-lies-fix.md b/.changeset/polite-lies-fix.md new file mode 100644 index 0000000000..6e60a77604 --- /dev/null +++ b/.changeset/polite-lies-fix.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Update nypm package to support text-based bun.lock files
From 060bbc12cd380a07280aaf3898915ddb75956859 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 11:03:05 +0100 Subject: [PATCH 05/35] pull out env override logic --- .../src/entryPoints/managed-run-controller.ts | 166 +++---------- .../cli-v3/src/entryPoints/managed/env.ts | 222 ++++++++++++++++++ .../src/entryPoints/managed/overrides.ts | 29 +++ 3 files changed, 290 insertions(+), 127 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/env.ts create mode 100644 packages/cli-v3/src/entryPoints/managed/overrides.ts diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index c41b50ad27..d509721de1 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -1,8 +1,6 @@ import { logger } from "../utilities/logger.js"; import { TaskRunProcess } from "../executions/taskRunProcess.js"; import { env as stdEnv } from "std-env"; -import { z } from "zod"; -import { randomUUID } from "crypto"; import { readJSONFile } from "../utilities/fileSystem.js"; import { type CompleteRunAttemptResult, @@ -26,57 +24,13 @@ import { import { assertExhaustive } from "../utilities/assertExhaustive.js"; import { setTimeout as sleep } from "timers/promises"; import { io, type Socket } from "socket.io-client"; - -const DateEnv = z - .string() - .transform((val) => new Date(parseInt(val, 10))) - .pipe(z.date()); - -// All IDs are friendly IDs -const Env = z.object({ - // Set at build time - TRIGGER_CONTENT_HASH: z.string(), - TRIGGER_DEPLOYMENT_ID: z.string(), - TRIGGER_DEPLOYMENT_VERSION: z.string(), - TRIGGER_PROJECT_ID: z.string(), - TRIGGER_PROJECT_REF: z.string(), - NODE_ENV: z.string().default("production"), - NODE_EXTRA_CA_CERTS: z.string().optional(), - - // Set at runtime - TRIGGER_WORKLOAD_CONTROLLER_ID: z.string().default(`controller_${randomUUID()}`), - 
TRIGGER_ENV_ID: z.string(), - TRIGGER_RUN_ID: z.string().optional(), // This is only useful for cold starts - TRIGGER_SNAPSHOT_ID: z.string().optional(), // This is only useful for cold starts - OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), - TRIGGER_WARM_START_URL: z.string().optional(), - TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS: z.coerce.number().default(30_000), - TRIGGER_WARM_START_KEEPALIVE_MS: z.coerce.number().default(300_000), - TRIGGER_MACHINE_CPU: z.string().default("0"), - TRIGGER_MACHINE_MEMORY: z.string().default("0"), - TRIGGER_RUNNER_ID: z.string(), - TRIGGER_METADATA_URL: z.string().optional(), - TRIGGER_PRE_SUSPEND_WAIT_MS: z.coerce.number().default(200), - - // Timeline metrics - TRIGGER_POD_SCHEDULED_AT_MS: DateEnv, - TRIGGER_DEQUEUED_AT_MS: DateEnv, - - // May be overridden - TRIGGER_SUPERVISOR_API_PROTOCOL: z.enum(["http", "https"]), - TRIGGER_SUPERVISOR_API_DOMAIN: z.string(), - TRIGGER_SUPERVISOR_API_PORT: z.coerce.number(), - TRIGGER_WORKER_INSTANCE_NAME: z.string(), - TRIGGER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), - TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().default(5), - TRIGGER_SUCCESS_EXIT_CODE: z.coerce.number().default(0), - TRIGGER_FAILURE_EXIT_CODE: z.coerce.number().default(1), -}); - -const env = Env.parse(stdEnv); +import { RunnerEnv } from "./managed/env.js"; +import { MetadataClient } from "./managed/overrides.js"; logger.loggerLevel = "debug"; +const env = new RunnerEnv(stdEnv); + type ManagedRunControllerOptions = { workerManifest: WorkerManifest; }; @@ -90,40 +44,10 @@ type Snapshot = { friendlyId: string; }; -type Metadata = { - TRIGGER_SUPERVISOR_API_PROTOCOL: string | undefined; - TRIGGER_SUPERVISOR_API_DOMAIN: string | undefined; - TRIGGER_SUPERVISOR_API_PORT: number | undefined; - TRIGGER_WORKER_INSTANCE_NAME: string | undefined; - TRIGGER_HEARTBEAT_INTERVAL_SECONDS: number | undefined; - TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS: number | undefined; - TRIGGER_SUCCESS_EXIT_CODE: number | 
undefined; - TRIGGER_FAILURE_EXIT_CODE: number | undefined; - TRIGGER_RUNNER_ID: string | undefined; -}; - -class MetadataClient { - private readonly url: URL; - - constructor(url: string) { - this.url = new URL(url); - } - - async getEnvOverrides(): Promise { - try { - const response = await fetch(new URL("/env", this.url)); - return response.json(); - } catch (error) { - console.error("Failed to fetch metadata", { error }); - return null; - } - } -} - class ManagedRunController { private taskRunProcess?: TaskRunProcess; - private workerManifest: WorkerManifest; + private readonly workerManifest: WorkerManifest; private readonly httpClient: WorkloadHttpClient; private readonly warmStartClient: WarmStartClient | undefined; @@ -132,18 +56,35 @@ class ManagedRunController { private socket: Socket; private readonly runHeartbeat: HeartbeatService; - private heartbeatIntervalSeconds: number; - private readonly snapshotPoller: HeartbeatService; - private snapshotPollIntervalSeconds: number; - private workerApiUrl: string; - private workerInstanceName: string; + get heartbeatIntervalSeconds() { + return env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; + } + + get snapshotPollIntervalSeconds() { + return env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; + } + + get runnerId() { + return env.TRIGGER_RUNNER_ID; + } + + get successExitCode() { + return env.TRIGGER_SUCCESS_EXIT_CODE; + } - private runnerId: string; + get failureExitCode() { + return env.TRIGGER_FAILURE_EXIT_CODE; + } + + get workerApiUrl() { + return env.TRIGGER_SUPERVISOR_API_URL; + } - private successExitCode = env.TRIGGER_SUCCESS_EXIT_CODE; - private failureExitCode = env.TRIGGER_FAILURE_EXIT_CODE; + get workerInstanceName() { + return env.TRIGGER_WORKER_INSTANCE_NAME; + } private state: | { @@ -158,11 +99,6 @@ class ManagedRunController { constructor(opts: ManagedRunControllerOptions) { this.workerManifest = opts.workerManifest; - this.runnerId = env.TRIGGER_RUNNER_ID; - - this.workerApiUrl = 
`${env.TRIGGER_SUPERVISOR_API_PROTOCOL}://${env.TRIGGER_SUPERVISOR_API_DOMAIN}:${env.TRIGGER_SUPERVISOR_API_PORT}`; - this.workerInstanceName = env.TRIGGER_WORKER_INSTANCE_NAME; - this.httpClient = new WorkloadHttpClient({ workerApiUrl: this.workerApiUrl, runnerId: this.runnerId, @@ -172,7 +108,7 @@ class ManagedRunController { }); const properties = { - ...env, + ...env.raw, TRIGGER_POD_SCHEDULED_AT_MS: env.TRIGGER_POD_SCHEDULED_AT_MS.toISOString(), TRIGGER_DEQUEUED_AT_MS: env.TRIGGER_DEQUEUED_AT_MS.toISOString(), }; @@ -183,9 +119,6 @@ class ManagedRunController { properties, }); - this.heartbeatIntervalSeconds = env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; - this.snapshotPollIntervalSeconds = env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; - if (env.TRIGGER_METADATA_URL) { this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL); } @@ -834,45 +767,24 @@ class ManagedRunController { properties: { ...overrides }, }); - if (overrides.TRIGGER_SUCCESS_EXIT_CODE) { - this.successExitCode = overrides.TRIGGER_SUCCESS_EXIT_CODE; - } - - if (overrides.TRIGGER_FAILURE_EXIT_CODE) { - this.failureExitCode = overrides.TRIGGER_FAILURE_EXIT_CODE; - } + // Override the env with the new values + env.override(overrides); + // Update services and clients with the new values if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { - this.heartbeatIntervalSeconds = overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; - this.runHeartbeat.updateInterval(this.heartbeatIntervalSeconds * 1000); + this.runHeartbeat.updateInterval(env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); } - if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { - this.snapshotPollIntervalSeconds = overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; - this.snapshotPoller.updateInterval(this.snapshotPollIntervalSeconds * 1000); - } - - if (overrides.TRIGGER_WORKER_INSTANCE_NAME) { - this.workerInstanceName = overrides.TRIGGER_WORKER_INSTANCE_NAME; + this.snapshotPoller.updateInterval(env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 
1000); } - if ( overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || overrides.TRIGGER_SUPERVISOR_API_DOMAIN || overrides.TRIGGER_SUPERVISOR_API_PORT ) { - const protocol = - overrides.TRIGGER_SUPERVISOR_API_PROTOCOL ?? env.TRIGGER_SUPERVISOR_API_PROTOCOL; - const domain = overrides.TRIGGER_SUPERVISOR_API_DOMAIN ?? env.TRIGGER_SUPERVISOR_API_DOMAIN; - const port = overrides.TRIGGER_SUPERVISOR_API_PORT ?? env.TRIGGER_SUPERVISOR_API_PORT; - - this.workerApiUrl = `${protocol}://${domain}:${port}`; - this.httpClient.updateApiUrl(this.workerApiUrl); } - if (overrides.TRIGGER_RUNNER_ID) { - this.runnerId = overrides.TRIGGER_RUNNER_ID; this.httpClient.updateRunnerId(this.runnerId); } } @@ -1670,9 +1582,9 @@ await prodWorker.start(); function gatherProcessEnv(): Record { const $env = { - NODE_ENV: env.NODE_ENV, - NODE_EXTRA_CA_CERTS: env.NODE_EXTRA_CA_CERTS, - OTEL_EXPORTER_OTLP_ENDPOINT: env.OTEL_EXPORTER_OTLP_ENDPOINT, + NODE_ENV: stdEnv.NODE_ENV, + NODE_EXTRA_CA_CERTS: stdEnv.NODE_EXTRA_CA_CERTS, + OTEL_EXPORTER_OTLP_ENDPOINT: stdEnv.OTEL_EXPORTER_OTLP_ENDPOINT, }; // Filter out undefined values diff --git a/packages/cli-v3/src/entryPoints/managed/env.ts b/packages/cli-v3/src/entryPoints/managed/env.ts new file mode 100644 index 0000000000..2e15276971 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/env.ts @@ -0,0 +1,222 @@ +import { randomUUID } from "node:crypto"; +import { Metadata } from "./overrides.js"; +import { z } from "zod"; + +const DateEnv = z + .string() + .transform((val) => new Date(parseInt(val, 10))) + .pipe(z.date()); + +// All IDs are friendly IDs +const Env = z.object({ + // Set at build time + TRIGGER_CONTENT_HASH: z.string(), + TRIGGER_DEPLOYMENT_ID: z.string(), + TRIGGER_DEPLOYMENT_VERSION: z.string(), + TRIGGER_PROJECT_ID: z.string(), + TRIGGER_PROJECT_REF: z.string(), + NODE_ENV: z.string().default("production"), + NODE_EXTRA_CA_CERTS: z.string().optional(), + + // Set at runtime + TRIGGER_WORKLOAD_CONTROLLER_ID: 
z.string().default(`controller_${randomUUID()}`), + TRIGGER_ENV_ID: z.string(), + TRIGGER_RUN_ID: z.string().optional(), // This is only useful for cold starts + TRIGGER_SNAPSHOT_ID: z.string().optional(), // This is only useful for cold starts + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS: z.coerce.number().default(30_000), + TRIGGER_WARM_START_KEEPALIVE_MS: z.coerce.number().default(300_000), + TRIGGER_MACHINE_CPU: z.string().default("0"), + TRIGGER_MACHINE_MEMORY: z.string().default("0"), + TRIGGER_RUNNER_ID: z.string(), + TRIGGER_METADATA_URL: z.string().optional(), + TRIGGER_PRE_SUSPEND_WAIT_MS: z.coerce.number().default(200), + + // Timeline metrics + TRIGGER_POD_SCHEDULED_AT_MS: DateEnv, + TRIGGER_DEQUEUED_AT_MS: DateEnv, + + // May be overridden + TRIGGER_SUPERVISOR_API_PROTOCOL: z.enum(["http", "https"]), + TRIGGER_SUPERVISOR_API_DOMAIN: z.string(), + TRIGGER_SUPERVISOR_API_PORT: z.coerce.number(), + TRIGGER_WORKER_INSTANCE_NAME: z.string(), + TRIGGER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().default(5), + TRIGGER_SUCCESS_EXIT_CODE: z.coerce.number().default(0), + TRIGGER_FAILURE_EXIT_CODE: z.coerce.number().default(1), +}); + +type Env = z.infer; + +export class RunnerEnv { + private env: Env; + public readonly initial: Env; + + constructor(env: Record) { + this.env = Env.parse(env); + this.initial = { ...this.env }; + } + + get raw() { + return this.env; + } + + // Base environment variables + get NODE_ENV() { + return this.env.NODE_ENV; + } + get NODE_EXTRA_CA_CERTS() { + return this.env.NODE_EXTRA_CA_CERTS; + } + get OTEL_EXPORTER_OTLP_ENDPOINT() { + return this.env.OTEL_EXPORTER_OTLP_ENDPOINT; + } + get TRIGGER_CONTENT_HASH() { + return this.env.TRIGGER_CONTENT_HASH; + } + get TRIGGER_DEPLOYMENT_ID() { + return this.env.TRIGGER_DEPLOYMENT_ID; + } + get TRIGGER_DEPLOYMENT_VERSION() { 
+ return this.env.TRIGGER_DEPLOYMENT_VERSION; + } + get TRIGGER_PROJECT_ID() { + return this.env.TRIGGER_PROJECT_ID; + } + get TRIGGER_PROJECT_REF() { + return this.env.TRIGGER_PROJECT_REF; + } + get TRIGGER_WORKLOAD_CONTROLLER_ID() { + return this.env.TRIGGER_WORKLOAD_CONTROLLER_ID; + } + get TRIGGER_ENV_ID() { + return this.env.TRIGGER_ENV_ID; + } + get TRIGGER_RUN_ID() { + return this.env.TRIGGER_RUN_ID; + } + get TRIGGER_SNAPSHOT_ID() { + return this.env.TRIGGER_SNAPSHOT_ID; + } + get TRIGGER_WARM_START_URL() { + return this.env.TRIGGER_WARM_START_URL; + } + get TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS() { + return this.env.TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS; + } + get TRIGGER_WARM_START_KEEPALIVE_MS() { + return this.env.TRIGGER_WARM_START_KEEPALIVE_MS; + } + get TRIGGER_MACHINE_CPU() { + return this.env.TRIGGER_MACHINE_CPU; + } + get TRIGGER_MACHINE_MEMORY() { + return this.env.TRIGGER_MACHINE_MEMORY; + } + get TRIGGER_METADATA_URL() { + return this.env.TRIGGER_METADATA_URL; + } + get TRIGGER_PRE_SUSPEND_WAIT_MS() { + return this.env.TRIGGER_PRE_SUSPEND_WAIT_MS; + } + get TRIGGER_POD_SCHEDULED_AT_MS() { + return this.env.TRIGGER_POD_SCHEDULED_AT_MS; + } + get TRIGGER_DEQUEUED_AT_MS() { + return this.env.TRIGGER_DEQUEUED_AT_MS; + } + + // Overridable values + get TRIGGER_SUCCESS_EXIT_CODE() { + return this.env.TRIGGER_SUCCESS_EXIT_CODE; + } + get TRIGGER_FAILURE_EXIT_CODE() { + return this.env.TRIGGER_FAILURE_EXIT_CODE; + } + get TRIGGER_HEARTBEAT_INTERVAL_SECONDS() { + return this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; + } + get TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS() { + return this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; + } + get TRIGGER_WORKER_INSTANCE_NAME() { + return this.env.TRIGGER_WORKER_INSTANCE_NAME; + } + get TRIGGER_RUNNER_ID() { + return this.env.TRIGGER_RUNNER_ID; + } + + get TRIGGER_SUPERVISOR_API_PROTOCOL() { + return this.env.TRIGGER_SUPERVISOR_API_PROTOCOL; + } + + get TRIGGER_SUPERVISOR_API_DOMAIN() { + return 
this.env.TRIGGER_SUPERVISOR_API_DOMAIN; + } + + get TRIGGER_SUPERVISOR_API_PORT() { + return this.env.TRIGGER_SUPERVISOR_API_PORT; + } + + get TRIGGER_SUPERVISOR_API_URL() { + return `${this.TRIGGER_SUPERVISOR_API_PROTOCOL}://${this.TRIGGER_SUPERVISOR_API_DOMAIN}:${this.TRIGGER_SUPERVISOR_API_PORT}`; + } + + /** Overrides existing env vars with new values */ + override(overrides: Metadata) { + if (overrides.TRIGGER_SUCCESS_EXIT_CODE) { + this.env.TRIGGER_SUCCESS_EXIT_CODE = overrides.TRIGGER_SUCCESS_EXIT_CODE; + } + + if (overrides.TRIGGER_FAILURE_EXIT_CODE) { + this.env.TRIGGER_FAILURE_EXIT_CODE = overrides.TRIGGER_FAILURE_EXIT_CODE; + } + + if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { + this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; + } + + if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { + this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = + overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; + } + + if (overrides.TRIGGER_WORKER_INSTANCE_NAME) { + this.env.TRIGGER_WORKER_INSTANCE_NAME = overrides.TRIGGER_WORKER_INSTANCE_NAME; + } + + if (overrides.TRIGGER_SUPERVISOR_API_PROTOCOL) { + this.env.TRIGGER_SUPERVISOR_API_PROTOCOL = overrides.TRIGGER_SUPERVISOR_API_PROTOCOL as + | "http" + | "https"; + } + + if (overrides.TRIGGER_SUPERVISOR_API_DOMAIN) { + this.env.TRIGGER_SUPERVISOR_API_DOMAIN = overrides.TRIGGER_SUPERVISOR_API_DOMAIN; + } + + if (overrides.TRIGGER_SUPERVISOR_API_PORT) { + this.env.TRIGGER_SUPERVISOR_API_PORT = overrides.TRIGGER_SUPERVISOR_API_PORT; + } + + if (overrides.TRIGGER_RUNNER_ID) { + this.env.TRIGGER_RUNNER_ID = overrides.TRIGGER_RUNNER_ID; + } + } + + // Helper method to get process env for task runs + gatherProcessEnv(): Record { + const $env = { + NODE_ENV: this.NODE_ENV, + NODE_EXTRA_CA_CERTS: this.NODE_EXTRA_CA_CERTS, + OTEL_EXPORTER_OTLP_ENDPOINT: this.OTEL_EXPORTER_OTLP_ENDPOINT, + }; + + // Filter out undefined values + return Object.fromEntries( + 
Object.entries($env).filter(([key, value]) => value !== undefined) + ) as Record; + } +} diff --git a/packages/cli-v3/src/entryPoints/managed/overrides.ts b/packages/cli-v3/src/entryPoints/managed/overrides.ts new file mode 100644 index 0000000000..872b5ad0b3 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/overrides.ts @@ -0,0 +1,29 @@ +export type Metadata = { + TRIGGER_SUPERVISOR_API_PROTOCOL: string | undefined; + TRIGGER_SUPERVISOR_API_DOMAIN: string | undefined; + TRIGGER_SUPERVISOR_API_PORT: number | undefined; + TRIGGER_WORKER_INSTANCE_NAME: string | undefined; + TRIGGER_HEARTBEAT_INTERVAL_SECONDS: number | undefined; + TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS: number | undefined; + TRIGGER_SUCCESS_EXIT_CODE: number | undefined; + TRIGGER_FAILURE_EXIT_CODE: number | undefined; + TRIGGER_RUNNER_ID: string | undefined; +}; + +export class MetadataClient { + private readonly url: URL; + + constructor(url: string) { + this.url = new URL(url); + } + + async getEnvOverrides(): Promise { + try { + const response = await fetch(new URL("/env", this.url)); + return response.json(); + } catch (error) { + console.error("Failed to fetch metadata", { error }); + return null; + } + } +} From 40b4dfb9ef7d7befd28d35fd496f3d5493478f62 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 11:09:51 +0100 Subject: [PATCH 06/35] use runner env gather helper --- .../src/entryPoints/managed-run-controller.ts | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index d509721de1..7e408c0e3a 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -898,7 +898,7 @@ class ManagedRunController { ) satisfies TaskRunExecutionMetrics; const taskRunEnv = { - ...gatherProcessEnv(), + 
...env.gatherProcessEnv(), ...envVars, }; @@ -1580,19 +1580,6 @@ const workerManifest = await loadWorkerManifest(); const prodWorker = new ManagedRunController({ workerManifest }); await prodWorker.start(); -function gatherProcessEnv(): Record { - const $env = { - NODE_ENV: stdEnv.NODE_ENV, - NODE_EXTRA_CA_CERTS: stdEnv.NODE_EXTRA_CA_CERTS, - OTEL_EXPORTER_OTLP_ENDPOINT: stdEnv.OTEL_EXPORTER_OTLP_ENDPOINT, - }; - - // Filter out undefined values - return Object.fromEntries( - Object.entries($env).filter(([key, value]) => value !== undefined) - ) as Record; -} - async function loadWorkerManifest() { const manifest = await readJSONFile("./index.json"); return WorkerManifest.parse(manifest); From ce2558e7b100a9a9ac3a2700928ef82ad8a396fe Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 11:57:52 +0100 Subject: [PATCH 07/35] handle dev flushing failures gracefully --- .../cli-v3/src/entryPoints/dev-run-worker.ts | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/packages/cli-v3/src/entryPoints/dev-run-worker.ts b/packages/cli-v3/src/entryPoints/dev-run-worker.ts index 76c892720c..d821621ebd 100644 --- a/packages/cli-v3/src/entryPoints/dev-run-worker.ts +++ b/packages/cli-v3/src/entryPoints/dev-run-worker.ts @@ -472,7 +472,34 @@ const zodIpc = new ZodIpcConnection({ async function flushAll(timeoutInMs: number = 10_000) { const now = performance.now(); - await Promise.all([flushTracingSDK(timeoutInMs), flushMetadata(timeoutInMs)]); + const results = await Promise.allSettled([ + flushTracingSDK(timeoutInMs), + flushMetadata(timeoutInMs), + ]); + + const successfulFlushes = results + .filter((result) => result.status === "fulfilled") + .map((result) => result.value.flushed); + + const failedFlushes = ["tracingSDK", "runMetadata"].filter( + (flushed) => !successfulFlushes.includes(flushed) + ); + + if (failedFlushes.length > 0) { + logError(`Failed to flush ${failedFlushes.join(", 
")}`); + } + + const errorMessages = results + .filter((result) => result.status === "rejected") + .map((result) => result.reason); + + if (errorMessages.length > 0) { + logError(errorMessages.join("\n")); + } + + for (const flushed of successfulFlushes) { + log(`Flushed ${flushed} successfully`); + } const duration = performance.now() - now; @@ -487,6 +514,11 @@ async function flushTracingSDK(timeoutInMs: number = 10_000) { const duration = performance.now() - now; log(`Flushed tracingSDK in ${duration}ms`); + + return { + flushed: "tracingSDK", + durationMs: duration, + }; } async function flushMetadata(timeoutInMs: number = 10_000) { @@ -497,6 +529,11 @@ async function flushMetadata(timeoutInMs: number = 10_000) { const duration = performance.now() - now; log(`Flushed runMetadata in ${duration}ms`); + + return { + flushed: "runMetadata", + durationMs: duration, + }; } const managedWorkerRuntime = new ManagedRuntimeManager(zodIpc, showInternalLogs); From 9bc9a157ab0a243457548aeaa5851e13a582fa2a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:26:56 +0100 Subject: [PATCH 08/35] fix path normalization for init.ts --- packages/cli-v3/src/build/bundle.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/packages/cli-v3/src/build/bundle.ts b/packages/cli-v3/src/build/bundle.ts index 6206289ce8..90b488a4dc 100644 --- a/packages/cli-v3/src/build/bundle.ts +++ b/packages/cli-v3/src/build/bundle.ts @@ -3,7 +3,7 @@ import { DEFAULT_RUNTIME, ResolvedConfig } from "@trigger.dev/core/v3/build"; import { BuildManifest, BuildTarget, TaskFile } from "@trigger.dev/core/v3/schemas"; import * as esbuild from "esbuild"; import { createHash } from "node:crypto"; -import { join, relative, resolve } from "node:path"; +import { basename, dirname, join, relative, resolve } from "node:path"; import { createFile } from "../utilities/fileSystem.js"; import { logger } from "../utilities/logger.js"; 
import { resolveFileSources } from "../utilities/sourceFiles.js"; @@ -239,15 +239,18 @@ export async function getBundleResultFromBuild( // Check if the entry point is an init.ts file at the root of a trigger directory function isInitEntryPoint(entryPoint: string): boolean { - const normalizedEntryPoint = entryPoint.replace(/\\/g, "/"); // Normalize path separators const initFileNames = ["init.ts", "init.mts", "init.cts", "init.js", "init.mjs", "init.cjs"]; // Check if it's directly in one of the trigger directories return resolvedConfig.dirs.some((dir) => { - const normalizedDir = dir.replace(/\\/g, "/"); - return initFileNames.some( - (fileName) => normalizedEntryPoint === `${normalizedDir}/${fileName}` - ); + const normalizedDir = resolve(dir); + const normalizedEntryDir = resolve(dirname(entryPoint)); + + if (normalizedDir !== normalizedEntryDir) { + return false; + } + + return initFileNames.includes(basename(entryPoint)); }); } From 81b8c4b882d771dd4625b34d766f2d9e1aa0126b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:48:15 +0100 Subject: [PATCH 09/35] add logger --- .../src/entryPoints/managed-run-controller.ts | 50 ++++-------------- .../cli-v3/src/entryPoints/managed/logger.ts | 52 +++++++++++++++++++ 2 files changed, 61 insertions(+), 41 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/logger.ts diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 7e408c0e3a..4f6cbe1c40 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -1,4 +1,3 @@ -import { logger } from "../utilities/logger.js"; import { TaskRunProcess } from "../executions/taskRunProcess.js"; import { env as stdEnv } from "std-env"; import { readJSONFile } from "../utilities/fileSystem.js"; @@ -16,7 +15,6 @@ import { WarmStartClient, 
WORKLOAD_HEADERS, type WorkloadClientToServerEvents, - type WorkloadDebugLogRequestBody, WorkloadHttpClient, type WorkloadServerToClientEvents, type WorkloadRunAttemptStartResponseBody, @@ -26,8 +24,7 @@ import { setTimeout as sleep } from "timers/promises"; import { io, type Socket } from "socket.io-client"; import { RunnerEnv } from "./managed/env.js"; import { MetadataClient } from "./managed/overrides.js"; - -logger.loggerLevel = "debug"; +import { RunLogger, SendDebugLogOptions } from "./managed/logger.js"; const env = new RunnerEnv(stdEnv); @@ -54,6 +51,7 @@ class ManagedRunController { private readonly metadataClient?: MetadataClient; private socket: Socket; + private readonly logger: RunLogger; private readonly runHeartbeat: HeartbeatService; private readonly snapshotPoller: HeartbeatService; @@ -107,6 +105,11 @@ class ManagedRunController { projectRef: env.TRIGGER_PROJECT_REF, }); + this.logger = new RunLogger({ + httpClient: this.httpClient, + env, + }); + const properties = { ...env.raw, TRIGGER_POD_SCHEDULED_AT_MS: env.TRIGGER_POD_SCHEDULED_AT_MS.toISOString(), @@ -1484,43 +1487,8 @@ class ManagedRunController { assertExhaustive(attemptStatus); } - sendDebugLog({ - runId, - message, - date, - properties, - }: { - runId?: string; - message: string; - date?: Date; - properties?: WorkloadDebugLogRequestBody["properties"]; - }) { - if (!runId) { - runId = this.runFriendlyId; - } - - if (!runId) { - runId = env.TRIGGER_RUN_ID; - } - - if (!runId) { - return; - } - - const mergedProperties = { - ...properties, - runId, - runnerId: this.runnerId, - workerName: this.workerInstanceName, - }; - - console.log(message, mergedProperties); - - this.httpClient.sendDebugLog(runId, { - message, - time: date ?? 
new Date(), - properties: mergedProperties, - }); + sendDebugLog(opts: SendDebugLogOptions) { + this.logger.sendDebugLog(opts); } async cancelAttempt(runId: string) { diff --git a/packages/cli-v3/src/entryPoints/managed/logger.ts b/packages/cli-v3/src/entryPoints/managed/logger.ts new file mode 100644 index 0000000000..3a7a045476 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/logger.ts @@ -0,0 +1,52 @@ +import { + WorkloadDebugLogRequestBody, + WorkloadHttpClient, +} from "@trigger.dev/core/v3/runEngineWorker"; +import { RunnerEnv } from "./env.js"; + +export type SendDebugLogOptions = { + runId?: string; + message: string; + date?: Date; + properties?: WorkloadDebugLogRequestBody["properties"]; +}; + +export type RunLoggerOptions = { + httpClient: WorkloadHttpClient; + env: RunnerEnv; +}; + +export class RunLogger { + private readonly httpClient: WorkloadHttpClient; + private readonly env: RunnerEnv; + + constructor(private readonly opts: RunLoggerOptions) { + this.httpClient = opts.httpClient; + this.env = opts.env; + } + + sendDebugLog({ runId, message, date, properties }: SendDebugLogOptions) { + if (!runId) { + runId = this.env.TRIGGER_RUN_ID; + } + + if (!runId) { + return; + } + + const mergedProperties = { + ...properties, + runId, + runnerId: this.env.TRIGGER_RUNNER_ID, + workerName: this.env.TRIGGER_WORKER_INSTANCE_NAME, + }; + + console.log(message, mergedProperties); + + this.httpClient.sendDebugLog(runId, { + message, + time: date ?? 
new Date(), + properties: mergedProperties, + }); + } +} From 0fd10c92a8ad640aa05540617fe2f98f2a16c632 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:03:34 +0100 Subject: [PATCH 10/35] add execution heartbeat service --- .../src/entryPoints/managed-run-controller.ts | 145 +++++--------- .../src/entryPoints/managed/heartbeat.ts | 183 ++++++++++++++++++ 2 files changed, 232 insertions(+), 96 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/heartbeat.ts diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 4f6cbe1c40..b727ebbab3 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -25,6 +25,7 @@ import { io, type Socket } from "socket.io-client"; import { RunnerEnv } from "./managed/env.js"; import { MetadataClient } from "./managed/overrides.js"; import { RunLogger, SendDebugLogOptions } from "./managed/logger.js"; +import { RunExecutionHeartbeat } from "./managed/heartbeat.js"; const env = new RunnerEnv(stdEnv); @@ -54,45 +55,7 @@ class ManagedRunController { private readonly logger: RunLogger; private readonly runHeartbeat: HeartbeatService; - private readonly snapshotPoller: HeartbeatService; - - get heartbeatIntervalSeconds() { - return env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; - } - - get snapshotPollIntervalSeconds() { - return env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; - } - - get runnerId() { - return env.TRIGGER_RUNNER_ID; - } - - get successExitCode() { - return env.TRIGGER_SUCCESS_EXIT_CODE; - } - - get failureExitCode() { - return env.TRIGGER_FAILURE_EXIT_CODE; - } - - get workerApiUrl() { - return env.TRIGGER_SUPERVISOR_API_URL; - } - - get workerInstanceName() { - return env.TRIGGER_WORKER_INSTANCE_NAME; - } - - private state: - | { - phase: "RUN"; - run: Run; - snapshot: Snapshot; - } - | { - phase: 
"IDLE" | "WARM_START"; - } = { phase: "IDLE" }; + private readonly snapshotPoller: RunExecutionHeartbeat; constructor(opts: ManagedRunControllerOptions) { this.workerManifest = opts.workerManifest; @@ -137,63 +100,14 @@ class ManagedRunController { }); } - this.snapshotPoller = new HeartbeatService({ - heartbeat: async () => { - if (!this.runFriendlyId) { - this.sendDebugLog({ - runId: env.TRIGGER_RUN_ID, - message: "Skipping snapshot poll, no run ID", - }); - return; - } - - this.sendDebugLog({ - runId: env.TRIGGER_RUN_ID, - message: "Polling for latest snapshot", - }); - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: `snapshot poll: started`, - properties: { - snapshotId: this.snapshotFriendlyId, - }, - }); - - const response = await this.httpClient.getRunExecutionData(this.runFriendlyId); - - if (!response.success) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Snapshot poll failed", - properties: { - error: response.error, - }, - }); - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: `snapshot poll: failed`, - properties: { - snapshotId: this.snapshotFriendlyId, - error: response.error, - }, - }); - - return; - } - - await this.handleSnapshotChange(response.data.execution); - }, - intervalMs: this.snapshotPollIntervalSeconds * 1000, - leadingEdge: false, - onError: async (error) => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to poll for snapshot", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - }, + this.snapshotPoller = new RunExecutionHeartbeat({ + // @ts-expect-error + runFriendlyId: env.TRIGGER_RUN_ID, + // @ts-expect-error + snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, + httpClient: this.httpClient, + logger: this.logger, + heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, }); this.runHeartbeat = new HeartbeatService({ @@ -246,6 +160,44 @@ class ManagedRunController { }); } + get heartbeatIntervalSeconds() { + return env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; + } + + get snapshotPollIntervalSeconds() { + return env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; + } + + get runnerId() { + return env.TRIGGER_RUNNER_ID; + } + + get successExitCode() { + return env.TRIGGER_SUCCESS_EXIT_CODE; + } + + get failureExitCode() { + return env.TRIGGER_FAILURE_EXIT_CODE; + } + + get workerApiUrl() { + return env.TRIGGER_SUPERVISOR_API_URL; + } + + get workerInstanceName() { + return env.TRIGGER_WORKER_INSTANCE_NAME; + } + + private state: + | { + phase: "RUN"; + run: Run; + snapshot: Snapshot; + } + | { + phase: "IDLE" | "WARM_START"; + } = { phase: "IDLE" }; + private enterRunPhase(run: Run, snapshot: Snapshot) { this.onExitRunPhase(run); this.state = { phase: "RUN", run, snapshot }; @@ -477,6 +429,7 @@ class ManagedRunController { try { this.updateRunPhase(run, snapshot); + this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); } catch (error) { this.sendDebugLog({ runId: run.friendlyId, diff --git a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts new file mode 100644 index 0000000000..0a725505d0 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts @@ -0,0 +1,183 @@ +import { HeartbeatService, RunExecutionData } from "@trigger.dev/core/v3"; +import { WorkloadHttpClient } from "@trigger.dev/core/v3/runEngineWorker"; +import { RunLogger } from "./logger.js"; + +export type RunExecutionHeartbeatOptions = { + runFriendlyId: string; + 
snapshotFriendlyId: string; + httpClient: WorkloadHttpClient; + logger: RunLogger; + heartbeatIntervalSeconds: number; +}; + +export class RunExecutionHeartbeat { + private readonly logger: RunLogger; + private readonly heartbeat: HeartbeatService; + private readonly httpClient: WorkloadHttpClient; + + private readonly runFriendlyId: string; + private snapshotFriendlyId: string; + + constructor(opts: RunExecutionHeartbeatOptions) { + this.logger = opts.logger; + this.httpClient = opts.httpClient; + + this.runFriendlyId = opts.runFriendlyId; + this.snapshotFriendlyId = opts.snapshotFriendlyId; + + this.heartbeat = new HeartbeatService({ + heartbeat: async () => { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "heartbeat: started", + }); + + const response = await this.httpClient.heartbeatRun( + this.runFriendlyId, + this.snapshotFriendlyId + ); + + if (!response.success) { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "heartbeat: failed", + properties: { + error: response.error, + }, + }); + } + }, + intervalMs: opts.heartbeatIntervalSeconds * 1000, + leadingEdge: false, + onError: async (error) => { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to send heartbeat", + properties: { error: error instanceof Error ? 
error.message : String(error) }, + }); + }, + }); + } + + resetCurrentInterval() { + this.heartbeat.resetCurrentInterval(); + } + + updateSnapshotId(snapshotId: string) { + this.snapshotFriendlyId = snapshotId; + } + + updateInterval(intervalMs: number) { + this.heartbeat.updateInterval(intervalMs); + } + + start() { + this.heartbeat.start(); + } + + stop() { + this.heartbeat.stop(); + } +} + +type RunExecutionSnapshotPollerOptions = { + runFriendlyId: string; + snapshotFriendlyId: string; + httpClient: WorkloadHttpClient; + logger: RunLogger; + snapshotPollIntervalSeconds: number; + handleSnapshotChange: (execution: RunExecutionData) => Promise; +}; + +class RunExecutionSnapshotPoller { + private readonly logger: RunLogger; + private readonly poller: HeartbeatService; + private readonly httpClient: WorkloadHttpClient; + + private readonly runFriendlyId: string; + private readonly snapshotFriendlyId: string; + private readonly snapshotPollIntervalSeconds: number; + + private readonly handleSnapshotChange: (execution: RunExecutionData) => Promise; + + constructor(opts: RunExecutionSnapshotPollerOptions) { + this.logger = opts.logger; + this.httpClient = opts.httpClient; + + this.runFriendlyId = opts.runFriendlyId; + this.snapshotFriendlyId = opts.snapshotFriendlyId; + + this.handleSnapshotChange = opts.handleSnapshotChange; + + this.poller = new HeartbeatService({ + heartbeat: async () => { + if (!this.runFriendlyId) { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Skipping snapshot poll, no run ID", + }); + return; + } + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Polling for latest snapshot", + }); + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: `snapshot poll: started`, + properties: { + snapshotId: this.snapshotFriendlyId, + }, + }); + + const response = await this.httpClient.getRunExecutionData(this.runFriendlyId); + + if (!response.success) { + this.logger.sendDebugLog({ + runId: 
this.runFriendlyId, + message: "Snapshot poll failed", + properties: { + error: response.error, + }, + }); + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: `snapshot poll: failed`, + properties: { + snapshotId: this.snapshotFriendlyId, + error: response.error, + }, + }); + + return; + } + + await this.handleSnapshotChange(response.data.execution); + }, + intervalMs: this.snapshotPollIntervalSeconds * 1000, + leadingEdge: false, + onError: async (error) => { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to poll for snapshot", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + }, + }); + } + + updateInterval(intervalMs: number) { + this.poller.updateInterval(intervalMs); + } + + start() { + this.poller.start(); + } + + stop() { + this.poller.stop(); + } +} From 990ac466af8d6c46fbd80b24bb0916cc83aa3d0e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:10:25 +0100 Subject: [PATCH 11/35] add snapshot poller service --- .../src/entryPoints/managed-run-controller.ts | 60 +++------ .../cli-v3/src/entryPoints/managed/poller.ts | 114 ++++++++++++++++++ 2 files changed, 130 insertions(+), 44 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/poller.ts diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index b727ebbab3..8a83f6b8a5 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -3,7 +3,6 @@ import { env as stdEnv } from "std-env"; import { readJSONFile } from "../utilities/fileSystem.js"; import { type CompleteRunAttemptResult, - HeartbeatService, type RunExecutionData, SuspendedProcessError, type TaskRunExecutionMetrics, @@ -26,6 +25,7 @@ import { RunnerEnv } from "./managed/env.js"; import { MetadataClient } from "./managed/overrides.js"; 
import { RunLogger, SendDebugLogOptions } from "./managed/logger.js"; import { RunExecutionHeartbeat } from "./managed/heartbeat.js"; +import { RunExecutionSnapshotPoller } from "./managed/poller.js"; const env = new RunnerEnv(stdEnv); @@ -54,8 +54,8 @@ class ManagedRunController { private socket: Socket; private readonly logger: RunLogger; - private readonly runHeartbeat: HeartbeatService; - private readonly snapshotPoller: RunExecutionHeartbeat; + private readonly runHeartbeat: RunExecutionHeartbeat; + private readonly snapshotPoller: RunExecutionSnapshotPoller; constructor(opts: ManagedRunControllerOptions) { this.workerManifest = opts.workerManifest; @@ -100,55 +100,25 @@ class ManagedRunController { }); } - this.snapshotPoller = new RunExecutionHeartbeat({ + this.snapshotPoller = new RunExecutionSnapshotPoller({ // @ts-expect-error runFriendlyId: env.TRIGGER_RUN_ID, // @ts-expect-error snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, httpClient: this.httpClient, logger: this.logger, - heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, + snapshotPollIntervalSeconds: this.snapshotPollIntervalSeconds, + handleSnapshotChange: this.handleSnapshotChange, }); - this.runHeartbeat = new HeartbeatService({ - heartbeat: async () => { - if (!this.runFriendlyId || !this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Skipping heartbeat, no run ID or snapshot ID", - }); - return; - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "heartbeat: started", - }); - - const response = await this.httpClient.heartbeatRun( - this.runFriendlyId, - this.snapshotFriendlyId - ); - - if (!response.success) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "heartbeat: failed", - properties: { - error: response.error, - }, - }); - } - }, - intervalMs: this.heartbeatIntervalSeconds * 1000, - leadingEdge: false, - onError: async (error) => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to send 
heartbeat", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - }, + this.runHeartbeat = new RunExecutionHeartbeat({ + // @ts-expect-error + runFriendlyId: env.TRIGGER_RUN_ID, + // @ts-expect-error + snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, + httpClient: this.httpClient, + logger: this.logger, + heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, }); process.on("SIGTERM", async () => { @@ -429,6 +399,8 @@ class ManagedRunController { try { this.updateRunPhase(run, snapshot); + + this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); } catch (error) { this.sendDebugLog({ diff --git a/packages/cli-v3/src/entryPoints/managed/poller.ts b/packages/cli-v3/src/entryPoints/managed/poller.ts new file mode 100644 index 0000000000..72ede94c29 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/poller.ts @@ -0,0 +1,114 @@ +import { WorkloadHttpClient } from "@trigger.dev/core/v3/runEngineWorker"; +import { RunLogger } from "./logger.js"; +import { HeartbeatService, RunExecutionData } from "@trigger.dev/core/v3"; + +export type RunExecutionSnapshotPollerOptions = { + runFriendlyId: string; + snapshotFriendlyId: string; + httpClient: WorkloadHttpClient; + logger: RunLogger; + snapshotPollIntervalSeconds: number; + handleSnapshotChange: (execution: RunExecutionData) => Promise; +}; + +export class RunExecutionSnapshotPoller { + private readonly logger: RunLogger; + private readonly poller: HeartbeatService; + private readonly httpClient: WorkloadHttpClient; + + private readonly runFriendlyId: string; + private snapshotFriendlyId: string; + + private readonly snapshotPollIntervalSeconds: number; + + private readonly handleSnapshotChange: (execution: RunExecutionData) => Promise; + + constructor(opts: RunExecutionSnapshotPollerOptions) { + this.logger = opts.logger; + this.httpClient = opts.httpClient; + + this.runFriendlyId = opts.runFriendlyId; + 
this.snapshotFriendlyId = opts.snapshotFriendlyId; + + this.handleSnapshotChange = opts.handleSnapshotChange; + + this.poller = new HeartbeatService({ + heartbeat: async () => { + if (!this.runFriendlyId) { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Skipping snapshot poll, no run ID", + }); + return; + } + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Polling for latest snapshot", + }); + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: `snapshot poll: started`, + properties: { + snapshotId: this.snapshotFriendlyId, + }, + }); + + const response = await this.httpClient.getRunExecutionData(this.runFriendlyId); + + if (!response.success) { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Snapshot poll failed", + properties: { + error: response.error, + }, + }); + + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: `snapshot poll: failed`, + properties: { + snapshotId: this.snapshotFriendlyId, + error: response.error, + }, + }); + + return; + } + + await this.handleSnapshotChange(response.data.execution); + }, + intervalMs: this.snapshotPollIntervalSeconds * 1000, + leadingEdge: false, + onError: async (error) => { + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to poll for snapshot", + properties: { error: error instanceof Error ? 
error.message : String(error) }, + }); + }, + }); + } + + resetCurrentInterval() { + this.poller.resetCurrentInterval(); + } + + updateSnapshotId(snapshotId: string) { + this.snapshotFriendlyId = snapshotId; + } + + updateInterval(intervalMs: number) { + this.poller.updateInterval(intervalMs); + } + + start() { + this.poller.start(); + } + + stop() { + this.poller.stop(); + } +} From 030d4f8c78e307c26e1f8e17df4429951df4eb8b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:33:13 +0100 Subject: [PATCH 12/35] fix poller --- .../src/entryPoints/managed-run-controller.ts | 2 +- .../src/entryPoints/managed/heartbeat.ts | 10 ++++++++++ .../cli-v3/src/entryPoints/managed/poller.ts | 18 +++++++++++------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 8a83f6b8a5..757c75a248 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -108,7 +108,7 @@ class ManagedRunController { httpClient: this.httpClient, logger: this.logger, snapshotPollIntervalSeconds: this.snapshotPollIntervalSeconds, - handleSnapshotChange: this.handleSnapshotChange, + handleSnapshotChange: this.handleSnapshotChange.bind(this), }); this.runHeartbeat = new RunExecutionHeartbeat({ diff --git a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts index 0a725505d0..4b1787eb51 100644 --- a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts +++ b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts @@ -25,6 +25,16 @@ export class RunExecutionHeartbeat { this.runFriendlyId = opts.runFriendlyId; this.snapshotFriendlyId = opts.snapshotFriendlyId; + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "RunExecutionHeartbeat", + properties: { + runFriendlyId: 
this.runFriendlyId, + snapshotFriendlyId: this.snapshotFriendlyId, + heartbeatIntervalSeconds: opts.heartbeatIntervalSeconds, + }, + }); + this.heartbeat = new HeartbeatService({ heartbeat: async () => { this.logger.sendDebugLog({ diff --git a/packages/cli-v3/src/entryPoints/managed/poller.ts b/packages/cli-v3/src/entryPoints/managed/poller.ts index 72ede94c29..65d225391a 100644 --- a/packages/cli-v3/src/entryPoints/managed/poller.ts +++ b/packages/cli-v3/src/entryPoints/managed/poller.ts @@ -19,10 +19,6 @@ export class RunExecutionSnapshotPoller { private readonly runFriendlyId: string; private snapshotFriendlyId: string; - private readonly snapshotPollIntervalSeconds: number; - - private readonly handleSnapshotChange: (execution: RunExecutionData) => Promise; - constructor(opts: RunExecutionSnapshotPollerOptions) { this.logger = opts.logger; this.httpClient = opts.httpClient; @@ -30,7 +26,15 @@ export class RunExecutionSnapshotPoller { this.runFriendlyId = opts.runFriendlyId; this.snapshotFriendlyId = opts.snapshotFriendlyId; - this.handleSnapshotChange = opts.handleSnapshotChange; + this.logger.sendDebugLog({ + runId: this.runFriendlyId, + message: "RunExecutionSnapshotPoller", + properties: { + runFriendlyId: this.runFriendlyId, + snapshotFriendlyId: this.snapshotFriendlyId, + snapshotPollIntervalSeconds: opts.snapshotPollIntervalSeconds, + }, + }); this.poller = new HeartbeatService({ heartbeat: async () => { @@ -78,9 +82,9 @@ export class RunExecutionSnapshotPoller { return; } - await this.handleSnapshotChange(response.data.execution); + await opts.handleSnapshotChange(response.data.execution); }, - intervalMs: this.snapshotPollIntervalSeconds * 1000, + intervalMs: opts.snapshotPollIntervalSeconds * 1000, leadingEdge: false, onError: async (error) => { this.logger.sendDebugLog({ From 21ad68c88e4525449035bc81f5e2e56847f24486 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:39:09 +0100 Subject: [PATCH 
13/35] add changesets --- .changeset/late-chairs-ring.md | 5 +++++ .changeset/shiny-kiwis-beam.md | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 .changeset/late-chairs-ring.md create mode 100644 .changeset/shiny-kiwis-beam.md diff --git a/.changeset/late-chairs-ring.md b/.changeset/late-chairs-ring.md new file mode 100644 index 0000000000..cd7c9f3620 --- /dev/null +++ b/.changeset/late-chairs-ring.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Fix init.ts in custom trigger dirs diff --git a/.changeset/shiny-kiwis-beam.md b/.changeset/shiny-kiwis-beam.md new file mode 100644 index 0000000000..c01b131162 --- /dev/null +++ b/.changeset/shiny-kiwis-beam.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Handle flush errors gracefully in dev From bbd7ea689b5ba0157701a58607fd97eb9d9e6617 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 14:52:52 +0100 Subject: [PATCH 14/35] create socket in constructor --- .../src/entryPoints/managed-run-controller.ts | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 757c75a248..90ab486bd0 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -42,6 +42,8 @@ type Snapshot = { friendlyId: string; }; +type SupervisorSocket = Socket; + class ManagedRunController { private taskRunProcess?: TaskRunProcess; @@ -51,7 +53,7 @@ class ManagedRunController { private readonly warmStartClient: WarmStartClient | undefined; private readonly metadataClient?: MetadataClient; - private socket: Socket; + private socket: SupervisorSocket; private readonly logger: RunLogger; private readonly runHeartbeat: RunExecutionHeartbeat; @@ -121,6 +123,9 @@ class ManagedRunController { heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, }); + 
// Websocket notifications are only an optimisation so we don't need to wait for a successful connection + this.socket = this.createSupervisorSocket(); + process.on("SIGTERM", async () => { this.sendDebugLog({ runId: this.runFriendlyId, @@ -130,6 +135,8 @@ class ManagedRunController { }); } + // These settings depend on env vars that may be overridden, e.g. after runs and restores + get heartbeatIntervalSeconds() { return env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; } @@ -1103,17 +1110,18 @@ class ManagedRunController { process.exit(code); } - createSocket() { + createSupervisorSocket(): SupervisorSocket { const wsUrl = new URL("/workload", this.workerApiUrl); - this.socket = io(wsUrl.href, { + const socket = io(wsUrl.href, { transports: ["websocket"], extraHeaders: { [WORKLOAD_HEADERS.DEPLOYMENT_ID]: env.TRIGGER_DEPLOYMENT_ID, [WORKLOAD_HEADERS.RUNNER_ID]: env.TRIGGER_RUNNER_ID, }, - }); - this.socket.on("run:notify", async ({ version, run }) => { + }) satisfies SupervisorSocket; + + socket.on("run:notify", async ({ version, run }) => { this.sendDebugLog({ runId: run.friendlyId, message: "run:notify received by runner", @@ -1165,7 +1173,8 @@ class ManagedRunController { await this.handleSnapshotChange(latestSnapshot.data.execution); }); - this.socket.on("connect", () => { + + socket.on("connect", () => { this.sendDebugLog({ runId: this.runFriendlyId, message: "Connected to supervisor", @@ -1177,20 +1186,24 @@ class ManagedRunController { this.subscribeToRunNotifications({ run, snapshot }); } }); - this.socket.on("connect_error", (error) => { + + socket.on("connect_error", (error) => { this.sendDebugLog({ runId: this.runFriendlyId, message: "Connection error", properties: { error: error instanceof Error ? 
error.message : String(error) }, }); }); - this.socket.on("disconnect", (reason, description) => { + + socket.on("disconnect", (reason, description) => { this.sendDebugLog({ runId: this.runFriendlyId, message: "Disconnected from supervisor", properties: { reason, description: description?.toString() }, }); }); + + return socket; } private async executeRun({ @@ -1432,9 +1445,6 @@ class ManagedRunController { message: "Starting up", }); - // Websocket notifications are only an optimisation so we don't need to wait for a successful connection - this.createSocket(); - // If we have run and snapshot IDs, we can start an attempt immediately if (env.TRIGGER_RUN_ID && env.TRIGGER_SNAPSHOT_ID) { this.startAndExecuteRunAttempt({ From 1afe3a78533c6d0da2b3be63b41f09c5cbbf0b0b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 15:14:05 +0100 Subject: [PATCH 15/35] enable strictPropertyInitialization --- .configs/tsconfig.base.json | 2 +- packages/cli-v3/src/dev/devSupervisor.ts | 34 ++++++++++++------- .../src/entryPoints/managed/heartbeat.ts | 7 ++-- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/.configs/tsconfig.base.json b/.configs/tsconfig.base.json index 3ce4c2db29..2d560d22d0 100644 --- a/.configs/tsconfig.base.json +++ b/.configs/tsconfig.base.json @@ -10,7 +10,7 @@ "strict": true, "alwaysStrict": true, - "strictPropertyInitialization": false, + "strictPropertyInitialization": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true, "noUnusedLocals": false, diff --git a/packages/cli-v3/src/dev/devSupervisor.ts b/packages/cli-v3/src/dev/devSupervisor.ts index e1445b4600..be00598471 100644 --- a/packages/cli-v3/src/dev/devSupervisor.ts +++ b/packages/cli-v3/src/dev/devSupervisor.ts @@ -49,13 +49,13 @@ export async function startWorkerRuntime(options: WorkerRuntimeOptions): Promise * - Receiving snapshot update pings (via socket) */ class DevSupervisor implements WorkerRuntime { - private 
config: DevConfigResponseBody; + private config?: DevConfigResponseBody; private disconnectPresence: (() => void) | undefined; private lastManifest?: BuildManifest; private latestWorkerId?: string; /** Receive notifications when runs change state */ - private socket: Socket; + private socket?: Socket; private socketIsReconnecting = false; /** Workers are versions of the code */ @@ -93,7 +93,7 @@ class DevSupervisor implements WorkerRuntime { this.runLimiter = pLimit(maxConcurrentRuns); - this.#createSocket(); + this.socket = this.#createSocket(); //start an SSE connection for presence this.disconnectPresence = await this.#startPresenceConnection(); @@ -105,7 +105,7 @@ class DevSupervisor implements WorkerRuntime { async shutdown(): Promise { this.disconnectPresence?.(); try { - this.socket.close(); + this.socket?.close(); } catch (error) { logger.debug("[DevSupervisor] shutdown, socket failed to close", { error }); } @@ -187,6 +187,10 @@ class DevSupervisor implements WorkerRuntime { * For the latest version we will pull from the main queue, so we don't specify that. */ async #dequeueRuns() { + if (!this.config) { + throw new Error("No config, can't dequeue runs"); + } + if (!this.latestWorkerId) { //try again later logger.debug(`[DevSupervisor] dequeueRuns. 
No latest worker ID, trying again later`); @@ -409,13 +413,14 @@ class DevSupervisor implements WorkerRuntime { const wsUrl = new URL(this.options.client.apiURL); wsUrl.pathname = "/dev-worker"; - this.socket = io(wsUrl.href, { + const socket = io(wsUrl.href, { transports: ["websocket"], extraHeaders: { Authorization: `Bearer ${this.options.client.accessToken}`, }, }); - this.socket.on("run:notify", async ({ version, run }) => { + + socket.on("run:notify", async ({ version, run }) => { logger.debug("[DevSupervisor] Received run notification", { version, run }); this.options.client.dev.sendDebugLog(run.friendlyId, { @@ -434,10 +439,11 @@ class DevSupervisor implements WorkerRuntime { await controller.getLatestSnapshot(); }); - this.socket.on("connect", () => { + + socket.on("connect", () => { logger.debug("[DevSupervisor] Connected to supervisor"); - if (this.socket.recovered || this.socketIsReconnecting) { + if (socket.recovered || this.socketIsReconnecting) { logger.debug("[DevSupervisor] Socket recovered"); eventBus.emit("socketConnectionReconnected", `Connection was recovered`); } @@ -448,19 +454,21 @@ class DevSupervisor implements WorkerRuntime { controller.resubscribeToRunNotifications(); } }); - this.socket.on("connect_error", (error) => { + + socket.on("connect_error", (error) => { logger.debug("[DevSupervisor] Connection error", { error }); }); - this.socket.on("disconnect", (reason, description) => { + + socket.on("disconnect", (reason, description) => { logger.debug("[DevSupervisor] socket was disconnected", { reason, description, - active: this.socket.active, + active: socket.active, }); if (reason === "io server disconnect") { // the disconnection was initiated by the server, you need to manually reconnect - this.socket.connect(); + socket.connect(); } else { this.socketIsReconnecting = true; eventBus.emit("socketConnectionDisconnected", reason); @@ -472,6 +480,8 @@ class DevSupervisor implements WorkerRuntime { connections: 
Array.from(this.socketConnections), }); }, 5000); + + return socket; } #subscribeToRunNotifications() { diff --git a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts index 4b1787eb51..3863a0828f 100644 --- a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts +++ b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts @@ -106,7 +106,6 @@ class RunExecutionSnapshotPoller { private readonly runFriendlyId: string; private readonly snapshotFriendlyId: string; - private readonly snapshotPollIntervalSeconds: number; private readonly handleSnapshotChange: (execution: RunExecutionData) => Promise; @@ -167,7 +166,7 @@ class RunExecutionSnapshotPoller { await this.handleSnapshotChange(response.data.execution); }, - intervalMs: this.snapshotPollIntervalSeconds * 1000, + intervalMs: opts.snapshotPollIntervalSeconds * 1000, leadingEdge: false, onError: async (error) => { this.logger.sendDebugLog({ @@ -179,6 +178,10 @@ class RunExecutionSnapshotPoller { }); } + resetCurrentInterval() { + this.poller.resetCurrentInterval(); + } + updateInterval(intervalMs: number) { this.poller.updateInterval(intervalMs); } From 9cf17ce83a372a12c8ec6af7f69e76652a0dc23f Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:16:59 +0100 Subject: [PATCH 16/35] deprecate dequeue from version --- packages/core/src/v3/runEngineWorker/supervisor/http.ts | 1 + packages/core/src/v3/runEngineWorker/workload/http.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/packages/core/src/v3/runEngineWorker/supervisor/http.ts b/packages/core/src/v3/runEngineWorker/supervisor/http.ts index 8814c84c35..4f899e4f22 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/http.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/http.ts @@ -81,6 +81,7 @@ export class SupervisorHttpClient { ); } + /** @deprecated Not currently used */ async dequeueFromVersion(deploymentId: string, maxRunCount = 1, 
runnerId?: string) { return wrapZodFetch( WorkerApiDequeueResponseBody, diff --git a/packages/core/src/v3/runEngineWorker/workload/http.ts b/packages/core/src/v3/runEngineWorker/workload/http.ts index 9d97896f09..9dde07d35d 100644 --- a/packages/core/src/v3/runEngineWorker/workload/http.ts +++ b/packages/core/src/v3/runEngineWorker/workload/http.ts @@ -165,6 +165,7 @@ export class WorkloadHttpClient { } } + /** @deprecated Not currently used */ async dequeue() { return wrapZodFetch( WorkloadDequeueFromVersionResponseBody, From 6fff053e236227eefab8db2100d5707cc10df689 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:17:16 +0100 Subject: [PATCH 17/35] start is not async --- .../src/entryPoints/managed-run-controller.ts | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 90ab486bd0..f36d2bbf79 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -1439,7 +1439,7 @@ class ManagedRunController { await this.taskRunProcess?.cancel(); } - async start() { + start() { this.sendDebugLog({ runId: this.runFriendlyId, message: "Starting up", @@ -1478,12 +1478,7 @@ class ManagedRunController { } } -const workerManifest = await loadWorkerManifest(); +const manifest = await readJSONFile("./index.json"); +const workerManifest = WorkerManifest.parse(manifest); -const prodWorker = new ManagedRunController({ workerManifest }); -await prodWorker.start(); - -async function loadWorkerManifest() { - const manifest = await readJSONFile("./index.json"); - return WorkerManifest.parse(manifest); -} +new ManagedRunController({ workerManifest }).start(); From 76ac4a834f93dc0d48220f4a95156127a9c0f17a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 
16:26:21 +0100 Subject: [PATCH 18/35] dependency injection in prep for tests --- .../src/entryPoints/managed-run-controller.ts | 1484 +---------------- .../src/entryPoints/managed/controller.ts | 1482 ++++++++++++++++ .../cli-v3/src/entryPoints/managed/env.ts | 3 +- 3 files changed, 1490 insertions(+), 1479 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/controller.ts diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index f36d2bbf79..4baa701b05 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -1,1484 +1,12 @@ -import { TaskRunProcess } from "../executions/taskRunProcess.js"; import { env as stdEnv } from "std-env"; import { readJSONFile } from "../utilities/fileSystem.js"; -import { - type CompleteRunAttemptResult, - type RunExecutionData, - SuspendedProcessError, - type TaskRunExecutionMetrics, - type TaskRunExecutionResult, - type TaskRunFailedExecutionResult, - WorkerManifest, -} from "@trigger.dev/core/v3"; -import { - WarmStartClient, - WORKLOAD_HEADERS, - type WorkloadClientToServerEvents, - WorkloadHttpClient, - type WorkloadServerToClientEvents, - type WorkloadRunAttemptStartResponseBody, -} from "@trigger.dev/core/v3/workers"; -import { assertExhaustive } from "../utilities/assertExhaustive.js"; -import { setTimeout as sleep } from "timers/promises"; -import { io, type Socket } from "socket.io-client"; -import { RunnerEnv } from "./managed/env.js"; -import { MetadataClient } from "./managed/overrides.js"; -import { RunLogger, SendDebugLogOptions } from "./managed/logger.js"; -import { RunExecutionHeartbeat } from "./managed/heartbeat.js"; -import { RunExecutionSnapshotPoller } from "./managed/poller.js"; - -const env = new RunnerEnv(stdEnv); - -type ManagedRunControllerOptions = { - workerManifest: WorkerManifest; -}; - -type Run = { - friendlyId: string; - 
attemptNumber?: number | null; -}; - -type Snapshot = { - friendlyId: string; -}; - -type SupervisorSocket = Socket; - -class ManagedRunController { - private taskRunProcess?: TaskRunProcess; - - private readonly workerManifest: WorkerManifest; - - private readonly httpClient: WorkloadHttpClient; - private readonly warmStartClient: WarmStartClient | undefined; - private readonly metadataClient?: MetadataClient; - - private socket: SupervisorSocket; - private readonly logger: RunLogger; - - private readonly runHeartbeat: RunExecutionHeartbeat; - private readonly snapshotPoller: RunExecutionSnapshotPoller; - - constructor(opts: ManagedRunControllerOptions) { - this.workerManifest = opts.workerManifest; - - this.httpClient = new WorkloadHttpClient({ - workerApiUrl: this.workerApiUrl, - runnerId: this.runnerId, - deploymentId: env.TRIGGER_DEPLOYMENT_ID, - deploymentVersion: env.TRIGGER_DEPLOYMENT_VERSION, - projectRef: env.TRIGGER_PROJECT_REF, - }); - - this.logger = new RunLogger({ - httpClient: this.httpClient, - env, - }); - - const properties = { - ...env.raw, - TRIGGER_POD_SCHEDULED_AT_MS: env.TRIGGER_POD_SCHEDULED_AT_MS.toISOString(), - TRIGGER_DEQUEUED_AT_MS: env.TRIGGER_DEQUEUED_AT_MS.toISOString(), - }; - - this.sendDebugLog({ - runId: env.TRIGGER_RUN_ID, - message: "Creating run controller", - properties, - }); - - if (env.TRIGGER_METADATA_URL) { - this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL); - } - - if (env.TRIGGER_WARM_START_URL) { - this.warmStartClient = new WarmStartClient({ - apiUrl: new URL(env.TRIGGER_WARM_START_URL), - controllerId: env.TRIGGER_WORKLOAD_CONTROLLER_ID, - deploymentId: env.TRIGGER_DEPLOYMENT_ID, - deploymentVersion: env.TRIGGER_DEPLOYMENT_VERSION, - machineCpu: env.TRIGGER_MACHINE_CPU, - machineMemory: env.TRIGGER_MACHINE_MEMORY, - }); - } - - this.snapshotPoller = new RunExecutionSnapshotPoller({ - // @ts-expect-error - runFriendlyId: env.TRIGGER_RUN_ID, - // @ts-expect-error - snapshotFriendlyId: 
env.TRIGGER_SNAPSHOT_ID, - httpClient: this.httpClient, - logger: this.logger, - snapshotPollIntervalSeconds: this.snapshotPollIntervalSeconds, - handleSnapshotChange: this.handleSnapshotChange.bind(this), - }); - - this.runHeartbeat = new RunExecutionHeartbeat({ - // @ts-expect-error - runFriendlyId: env.TRIGGER_RUN_ID, - // @ts-expect-error - snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, - httpClient: this.httpClient, - logger: this.logger, - heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, - }); - - // Websocket notifications are only an optimisation so we don't need to wait for a successful connection - this.socket = this.createSupervisorSocket(); - - process.on("SIGTERM", async () => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Received SIGTERM, stopping worker", - }); - await this.stop(); - }); - } - - // These settings depend on env vars that may be overridden, e.g. after runs and restores - - get heartbeatIntervalSeconds() { - return env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; - } - - get snapshotPollIntervalSeconds() { - return env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; - } - - get runnerId() { - return env.TRIGGER_RUNNER_ID; - } - - get successExitCode() { - return env.TRIGGER_SUCCESS_EXIT_CODE; - } - - get failureExitCode() { - return env.TRIGGER_FAILURE_EXIT_CODE; - } - - get workerApiUrl() { - return env.TRIGGER_SUPERVISOR_API_URL; - } - - get workerInstanceName() { - return env.TRIGGER_WORKER_INSTANCE_NAME; - } - - private state: - | { - phase: "RUN"; - run: Run; - snapshot: Snapshot; - } - | { - phase: "IDLE" | "WARM_START"; - } = { phase: "IDLE" }; - - private enterRunPhase(run: Run, snapshot: Snapshot) { - this.onExitRunPhase(run); - this.state = { phase: "RUN", run, snapshot }; - - this.runHeartbeat.start(); - this.snapshotPoller.start(); - } - - private enterWarmStartPhase() { - this.onExitRunPhase(); - this.state = { phase: "WARM_START" }; - } - - // This should only be used when we're already executing a run. 
Attempt number changes are not allowed. - private updateRunPhase(run: Run, snapshot: Snapshot) { - if (this.state.phase !== "RUN") { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Invalid phase for updating snapshot: ${this.state.phase}`, - properties: { - currentPhase: this.state.phase, - snapshotId: snapshot.friendlyId, - }, - }); - - throw new Error(`Invalid phase for updating snapshot: ${this.state.phase}`); - } - - if (this.state.run.friendlyId !== run.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Mismatched run IDs`, - properties: { - currentRunId: this.state.run.friendlyId, - newRunId: run.friendlyId, - currentSnapshotId: this.state.snapshot.friendlyId, - newSnapshotId: snapshot.friendlyId, - }, - }); - - throw new Error("Mismatched run IDs"); - } - - if (this.state.snapshot.friendlyId === snapshot.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "updateRunPhase: Snapshot not changed", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Snapshot not changed`, - properties: { - snapshotId: snapshot.friendlyId, - }, - }); - - return; - } - - if (this.state.run.attemptNumber !== run.attemptNumber) { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Attempt number changed`, - properties: { - oldAttemptNumber: this.state.run.attemptNumber ?? undefined, - newAttemptNumber: run.attemptNumber ?? 
undefined, - }, - }); - throw new Error("Attempt number changed"); - } - - this.state = { - phase: "RUN", - run: { - friendlyId: run.friendlyId, - attemptNumber: run.attemptNumber, - }, - snapshot: { - friendlyId: snapshot.friendlyId, - }, - }; - } - - private onExitRunPhase(newRun: Run | undefined = undefined) { - // We're not in a run phase, nothing to do - if (this.state.phase !== "RUN") { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Not in run phase, skipping", - properties: { phase: this.state.phase }, - }); - return; - } - - // This is still the same run, so we're not exiting the phase - if (newRun?.friendlyId === this.state.run.friendlyId) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Same run, skipping", - properties: { newRun: newRun?.friendlyId }, - }); - return; - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Exiting run phase", - properties: { newRun: newRun?.friendlyId }, - }); - - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); - - const { run, snapshot } = this.state; - - this.unsubscribeFromRunNotifications({ run, snapshot }); - } - - private subscribeToRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { - this.socket.emit("run:start", { - version: "1", - run: { - friendlyId: run.friendlyId, - }, - snapshot: { - friendlyId: snapshot.friendlyId, - }, - }); - } - - private unsubscribeFromRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { - this.socket.emit("run:stop", { - version: "1", - run: { - friendlyId: run.friendlyId, - }, - snapshot: { - friendlyId: snapshot.friendlyId, - }, - }); - } - - private get runFriendlyId() { - if (this.state.phase !== "RUN") { - return undefined; - } - - return this.state.run.friendlyId; - } - - private get snapshotFriendlyId() { - if (this.state.phase !== "RUN") { - return; - } - - return this.state.snapshot.friendlyId; - } - - private handleSnapshotChangeLock = 
false; - - private async handleSnapshotChange({ - run, - snapshot, - completedWaitpoints, - }: Pick) { - if (this.handleSnapshotChangeLock) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: already in progress", - }); - return; - } - - this.handleSnapshotChangeLock = true; - - try { - if (!this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: Missing snapshot ID", - properties: { - newSnapshotId: snapshot.friendlyId, - newSnapshotStatus: snapshot.executionStatus, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: missing snapshot ID", - properties: { - newSnapshotId: snapshot.friendlyId, - newSnapshotStatus: snapshot.executionStatus, - }, - }); - - return; - } - - if (this.snapshotFriendlyId === snapshot.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: snapshot not changed, skipping", - properties: { snapshot: snapshot.friendlyId }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: skipping, no change", - properties: { - snapshotId: this.snapshotFriendlyId, - snapshotStatus: snapshot.executionStatus, - }, - }); - - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: `snapshot change: ${snapshot.executionStatus}`, - properties: { - oldSnapshotId: this.snapshotFriendlyId, - newSnapshotId: snapshot.friendlyId, - completedWaitpoints: completedWaitpoints.length, - }, - }); - - try { - this.updateRunPhase(run, snapshot); - - this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); - this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: failed to update run phase", - properties: { - currentPhase: this.state.phase, - error: error instanceof Error ? 
error.message : String(error), - }, - }); - - this.waitForNextRun(); - return; - } - - switch (snapshot.executionStatus) { - case "PENDING_CANCEL": { - try { - await this.cancelAttempt(run.friendlyId); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: failed to cancel attempt", - properties: { - error: error instanceof Error ? error.message : String(error), - }, - }); - - this.waitForNextRun(); - return; - } - - return; - } - case "FINISHED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is finished, will wait for next run", - }); - - if (this.activeRunExecution) { - // Let's pretend we've just suspended the run. This will kill the process and should automatically wait for the next run. - // We still explicitly call waitForNextRun() afterwards in case of race conditions. Locks will prevent this from causing issues. - await this.taskRunProcess?.suspend(); - } - - this.waitForNextRun(); - - return; - } - case "QUEUED_EXECUTING": - case "EXECUTING_WITH_WAITPOINTS": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is executing with waitpoints", - properties: { snapshot: snapshot.friendlyId }, - }); - - try { - // This should never throw. It should also never fail the run. - await this.taskRunProcess?.cleanup(false); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to cleanup task run process", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - } - - if (snapshot.friendlyId !== this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after cleanup, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - await sleep(env.TRIGGER_PRE_SUSPEND_WAIT_MS); - - if (snapshot.friendlyId !== this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after suspend threshold, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - if (!this.runFriendlyId || !this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: - "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", - properties: { - runId: this.runFriendlyId, - snapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - const suspendResult = await this.httpClient.suspendRun( - this.runFriendlyId, - this.snapshotFriendlyId - ); - - if (!suspendResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to suspend run, staying alive 🎶", - properties: { - error: suspendResult.error, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: suspend request failed", - properties: { - snapshotId: snapshot.friendlyId, - error: suspendResult.error, - }, - }); - - return; - } - - if (!suspendResult.data.ok) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: failed to suspend run", - properties: { - snapshotId: snapshot.friendlyId, - error: suspendResult.data.error, - }, - }); - - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Suspending, any day now 🚬", - properties: { ok: suspendResult.data.ok }, - }); - return; - } - case "SUSPENDED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended, kill the process and wait 
for more runs", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - // This will kill the process and fail the execution with a SuspendedProcessError - await this.taskRunProcess?.suspend(); - - return; - } - case "PENDING_EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is pending execution", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - if (completedWaitpoints.length === 0) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No waitpoints to complete, nothing to do", - }); - return; - } - - // There are waitpoints to complete so we've been restored after being suspended - - // Short delay to give websocket time to reconnect - await sleep(100); - - // Env may have changed after restore - await this.processEnvOverrides(); - - // We need to let the platform know we're ready to continue - const continuationResult = await this.httpClient.continueRunExecution( - run.friendlyId, - snapshot.friendlyId - ); - - if (!continuationResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "failed to continue execution", - properties: { - error: continuationResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - return; - } - case "EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is now executing", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - if (completedWaitpoints.length === 0) { - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Processing completed waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); - - if (!this.taskRunProcess) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No task run process, ignoring completed waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); - return; - } - - for (const waitpoint of completedWaitpoints) { - 
this.taskRunProcess.waitpointCompleted(waitpoint); - } - - return; - } - case "RUN_CREATED": - case "QUEUED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Status change not handled", - properties: { status: snapshot.executionStatus }, - }); - return; - } - default: { - assertExhaustive(snapshot.executionStatus); - } - } - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: unexpected error", - properties: { - snapshotId: snapshot.friendlyId, - error: error instanceof Error ? error.message : String(error), - }, - }); - } finally { - this.handleSnapshotChangeLock = false; - } - } - - private async processEnvOverrides() { - if (!this.metadataClient) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "No metadata client, skipping env overrides", - }); - return; - } - - const overrides = await this.metadataClient.getEnvOverrides(); - - if (!overrides) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "No env overrides, skipping", - }); - return; - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Processing env overrides", - properties: { ...overrides }, - }); - - // Override the env with the new values - env.override(overrides); - - // Update services and clients with the new values - if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { - this.runHeartbeat.updateInterval(env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); - } - if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { - this.snapshotPoller.updateInterval(env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); - } - if ( - overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || - overrides.TRIGGER_SUPERVISOR_API_DOMAIN || - overrides.TRIGGER_SUPERVISOR_API_PORT - ) { - this.httpClient.updateApiUrl(this.workerApiUrl); - } - if (overrides.TRIGGER_RUNNER_ID) { - this.httpClient.updateRunnerId(this.runnerId); - } - } - - private activeRunExecution: Promise | null = null; - - private async startAndExecuteRunAttempt({ - 
runFriendlyId, - snapshotFriendlyId, - dequeuedAt, - podScheduledAt, - isWarmStart, - skipLockCheckForImmediateRetry: skipLockCheck, - }: { - runFriendlyId: string; - snapshotFriendlyId: string; - dequeuedAt?: Date; - podScheduledAt?: Date; - isWarmStart?: boolean; - skipLockCheckForImmediateRetry?: boolean; - }) { - if (!skipLockCheck && this.activeRunExecution) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "startAndExecuteRunAttempt: already in progress", - }); - return; - } - - const execution = async () => { - if (!this.socket) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "Starting run without socket connection", - }); - } - - this.subscribeToRunNotifications({ - run: { friendlyId: runFriendlyId }, - snapshot: { friendlyId: snapshotFriendlyId }, - }); - - const attemptStartedAt = Date.now(); - - const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, { - isWarmStart, - }); - - if (!start.success) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "Failed to start run", - properties: { error: start.error }, - }); - - this.sendDebugLog({ - runId: runFriendlyId, - message: "failed to start run attempt", - properties: { - error: start.error, - }, - }); - - this.waitForNextRun(); - return; - } - - const attemptDuration = Date.now() - attemptStartedAt; - - const { run, snapshot, execution, envVars } = start.data; - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Started run", - properties: { snapshot: snapshot.friendlyId }, - }); - - this.enterRunPhase(run, snapshot); - - const metrics = [ - { - name: "start", - event: "create_attempt", - timestamp: attemptStartedAt, - duration: attemptDuration, - }, - ] - .concat( - dequeuedAt - ? [ - { - name: "start", - event: "dequeue", - timestamp: dequeuedAt.getTime(), - duration: 0, - }, - ] - : [] - ) - .concat( - podScheduledAt - ? 
[ - { - name: "start", - event: "pod_scheduled", - timestamp: podScheduledAt.getTime(), - duration: 0, - }, - ] - : [] - ) satisfies TaskRunExecutionMetrics; - - const taskRunEnv = { - ...env.gatherProcessEnv(), - ...envVars, - }; - - try { - return await this.executeRun({ - run, - snapshot, - envVars: taskRunEnv, - execution, - metrics, - isWarmStart, - }); - } catch (error) { - if (error instanceof SuspendedProcessError) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended and task run process was killed, waiting for next run", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Error while executing attempt", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Submitting attempt completion", - properties: { - snapshotId: snapshot.friendlyId, - updatedSnapshotId: this.snapshotFriendlyId, - }, - }); - - const completion = { - id: execution.run.id, - ok: false, - retry: undefined, - error: TaskRunProcess.parseExecuteError(error), - } satisfies TaskRunFailedExecutionResult; - - const completionResult = await this.httpClient.completeRunAttempt( - run.friendlyId, - // FIXME: if the snapshot has changed since starting the run, this won't be accurate - // ..but we probably shouldn't fetch the latest snapshot either because we may be in an "unhealthy" state while the next runner has already taken over - this.snapshotFriendlyId ?? 
snapshot.friendlyId, - { completion } - ); - - if (!completionResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to submit completion after error", - properties: { error: completionResult.error }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit after error", - properties: { - error: completionResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Attempt completion submitted after error", - properties: { - attemptStatus: completionResult.data.result.attemptStatus, - runId: completionResult.data.result.run.friendlyId, - snapshotId: completionResult.data.result.snapshot.friendlyId, - }, - }); - - try { - await this.handleCompletionResult(completion, completionResult.data.result); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to handle completion result after error", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - } - }; - - this.activeRunExecution = execution(); - - try { - await this.activeRunExecution; - } catch (error) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "startAndExecuteRunAttempt: unexpected error", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - } finally { - this.activeRunExecution = null; - } - } - - private waitForNextRunLock = false; - - /** This will kill the child process before spinning up a new one. It will never throw, - * but may exit the process on any errors or when no runs are available after the - * configured duration. 
*/ - private async waitForNextRun() { - if (this.waitForNextRunLock) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: already in progress", - }); - return; - } - - this.waitForNextRunLock = true; - const previousRunId = this.runFriendlyId; - - try { - // If there's a run execution in progress, we need to kill it and wait for it to finish - if (this.activeRunExecution) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: waiting for existing run execution to finish", - }); - await this.activeRunExecution; - } - - // Just for good measure - await this.taskRunProcess?.kill("SIGKILL"); - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: waiting for next run", - }); - - this.enterWarmStartPhase(); - - if (!this.warmStartClient) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: warm starts disabled, shutting down", - }); - this.exitProcess(this.successExitCode); - } - - if (this.taskRunProcess) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: eagerly recreating task run process with options", - }); - this.taskRunProcess = new TaskRunProcess({ - ...this.taskRunProcess.options, - isWarmStart: true, - }).initialize(); - } else { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: no existing task run process, so we can't eagerly recreate it", - }); - } - - // Check the service is up and get additional warm start config - const connect = await this.warmStartClient.connect(); - - if (!connect.success) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: failed to connect to warm start service", - properties: { - warmStartUrl: env.TRIGGER_WARM_START_URL, - error: connect.error, - }, - }); - this.exitProcess(this.successExitCode); - } - - const connectionTimeoutMs = - connect.data.connectionTimeoutMs ?? 
env.TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS; - const keepaliveMs = connect.data.keepaliveMs ?? env.TRIGGER_WARM_START_KEEPALIVE_MS; - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: connected to warm start service", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, - }); - - if (previousRunId) { - this.sendDebugLog({ - runId: previousRunId, - message: "warm start: received config", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, - }); - } - - if (!connectionTimeoutMs || !keepaliveMs) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: warm starts disabled after connect", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, - }); - this.exitProcess(this.successExitCode); - } - - const nextRun = await this.warmStartClient.warmStart({ - workerInstanceName: this.workerInstanceName, - connectionTimeoutMs, - keepaliveMs, - }); - - if (!nextRun) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: warm start failed, shutting down", - }); - this.exitProcess(this.successExitCode); - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: got next run", - properties: { nextRun: nextRun.run.friendlyId }, - }); - - this.startAndExecuteRunAttempt({ - runFriendlyId: nextRun.run.friendlyId, - snapshotFriendlyId: nextRun.snapshot.friendlyId, - dequeuedAt: nextRun.dequeuedAt, - isWarmStart: true, - }).finally(() => {}); - return; - } catch (error) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: unexpected error", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - this.exitProcess(this.failureExitCode); - } finally { - this.waitForNextRunLock = false; - } - } - - private exitProcess(code?: number): never { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Exiting process", - properties: { code }, - }); - if (this.taskRunProcess?.isPreparedForNextRun) { - this.taskRunProcess.forceExit(); - } - process.exit(code); - } - - createSupervisorSocket(): SupervisorSocket { - const wsUrl = new URL("/workload", this.workerApiUrl); - - const socket = io(wsUrl.href, { - transports: ["websocket"], - extraHeaders: { - [WORKLOAD_HEADERS.DEPLOYMENT_ID]: env.TRIGGER_DEPLOYMENT_ID, - [WORKLOAD_HEADERS.RUNNER_ID]: env.TRIGGER_RUNNER_ID, - }, - }) satisfies SupervisorSocket; - - socket.on("run:notify", async ({ version, run }) => { - this.sendDebugLog({ - runId: run.friendlyId, - message: "run:notify received by runner", - properties: { version, runId: run.friendlyId }, - }); - - if (!this.runFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "run:notify: ignoring notification, no local run ID", - properties: { - currentRunId: this.runFriendlyId, - currentSnapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - if (run.friendlyId !== this.runFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "run:notify: ignoring notification for different run", - properties: { - currentRunId: this.runFriendlyId, - currentSnapshotId: this.snapshotFriendlyId, - notificationRunId: run.friendlyId, - }, - }); - return; - } - - // Reset the (fallback) snapshot poll interval so we don't do unnecessary work - this.snapshotPoller.resetCurrentInterval(); - - const latestSnapshot = await this.httpClient.getRunExecutionData(this.runFriendlyId); - - if (!latestSnapshot.success) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "run:notify: failed to get latest snapshot data", - properties: { - currentRunId: this.runFriendlyId, - currentSnapshotId: 
this.snapshotFriendlyId, - error: latestSnapshot.error, - }, - }); - return; - } - - await this.handleSnapshotChange(latestSnapshot.data.execution); - }); - - socket.on("connect", () => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Connected to supervisor", - }); - - // This should handle the case where we reconnect after being restored - if (this.state.phase === "RUN") { - const { run, snapshot } = this.state; - this.subscribeToRunNotifications({ run, snapshot }); - } - }); - - socket.on("connect_error", (error) => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Connection error", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - }); - - socket.on("disconnect", (reason, description) => { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Disconnected from supervisor", - properties: { reason, description: description?.toString() }, - }); - }); - - return socket; - } - - private async executeRun({ - run, - snapshot, - envVars, - execution, - metrics, - isWarmStart, - }: WorkloadRunAttemptStartResponseBody & { - metrics?: TaskRunExecutionMetrics; - isWarmStart?: boolean; - }) { - this.snapshotPoller.start(); - - if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { - this.taskRunProcess = new TaskRunProcess({ - workerManifest: this.workerManifest, - env: envVars, - serverWorker: { - id: "unmanaged", - contentHash: env.TRIGGER_CONTENT_HASH, - version: env.TRIGGER_DEPLOYMENT_VERSION, - engine: "V2", - }, - machine: execution.machine, - isWarmStart, - }).initialize(); - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "executing task run process", - properties: { - attemptId: execution.attempt.id, - runId: execution.run.id, - }, - }); - - const completion = await this.taskRunProcess.execute( - { - payload: { - execution, - traceContext: execution.run.traceContext ?? 
{}, - metrics, - }, - messageId: run.friendlyId, - env: envVars, - }, - isWarmStart - ); - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Completed run", - properties: { completion: completion.ok }, - }); - - try { - // The execution has finished, so we can cleanup the task run process. Killing it should be safe. - await this.taskRunProcess.cleanup(true); - } catch (error) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to cleanup task run process, submitting completion anyway", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - } - - if (!this.runFriendlyId || !this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "executeRun: Missing run ID or snapshot ID after execution", - properties: { - runId: this.runFriendlyId, - snapshotId: this.snapshotFriendlyId, - }, - }); - - this.waitForNextRun(); - return; - } - - const completionResult = await this.httpClient.completeRunAttempt( - this.runFriendlyId, - this.snapshotFriendlyId, - { - completion, - } - ); - - if (!completionResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit", - properties: { - error: completionResult.error, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit", - properties: { - error: completionResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Attempt completion submitted", - properties: { - attemptStatus: completionResult.data.result.attemptStatus, - runId: completionResult.data.result.run.friendlyId, - snapshotId: completionResult.data.result.snapshot.friendlyId, - }, - }); - - try { - await this.handleCompletionResult(completion, completionResult.data.result); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to handle completion result", - properties: { error: error 
instanceof Error ? error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - } - - private async handleCompletionResult( - completion: TaskRunExecutionResult, - result: CompleteRunAttemptResult - ) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Handling completion result", - properties: { - completion: completion.ok, - attemptStatus: result.attemptStatus, - snapshotId: result.snapshot.friendlyId, - runId: result.run.friendlyId, - }, - }); - - const { attemptStatus, snapshot: completionSnapshot, run } = result; - - try { - this.updateRunPhase(run, completionSnapshot); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to update run phase after completion", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RUN_FINISHED") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run finished", - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RUN_PENDING_CANCEL") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run pending cancel", - }); - return; - } - - if (attemptStatus === "RETRY_QUEUED") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Retry queued", - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RETRY_IMMEDIATELY") { - if (completion.ok) { - throw new Error("Should retry but completion OK."); - } - - if (!completion.retry) { - throw new Error("Should retry but missing retry params."); - } - - await sleep(completion.retry.delay); - - if (!this.snapshotFriendlyId) { - throw new Error("Missing snapshot ID after retry"); - } - - this.startAndExecuteRunAttempt({ - runFriendlyId: run.friendlyId, - snapshotFriendlyId: this.snapshotFriendlyId, - skipLockCheckForImmediateRetry: true, - isWarmStart: true, - }).finally(() => {}); - return; - } - - assertExhaustive(attemptStatus); - } - - sendDebugLog(opts: 
SendDebugLogOptions) { - this.logger.sendDebugLog(opts); - } - - async cancelAttempt(runId: string) { - this.sendDebugLog({ - runId, - message: "cancelling attempt", - properties: { runId }, - }); - - await this.taskRunProcess?.cancel(); - } - - start() { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Starting up", - }); - - // If we have run and snapshot IDs, we can start an attempt immediately - if (env.TRIGGER_RUN_ID && env.TRIGGER_SNAPSHOT_ID) { - this.startAndExecuteRunAttempt({ - runFriendlyId: env.TRIGGER_RUN_ID, - snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, - dequeuedAt: env.TRIGGER_DEQUEUED_AT_MS, - podScheduledAt: env.TRIGGER_POD_SCHEDULED_AT_MS, - }).finally(() => {}); - return; - } - - // ..otherwise we need to wait for a run - this.waitForNextRun(); - return; - } - - async stop() { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Shutting down", - }); - - if (this.taskRunProcess) { - await this.taskRunProcess.cleanup(true); - } - - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); - - this.socket.close(); - } -} +import { WorkerManifest } from "@trigger.dev/core/v3"; +import { ManagedRunController } from "./managed/controller.js"; const manifest = await readJSONFile("./index.json"); const workerManifest = WorkerManifest.parse(manifest); -new ManagedRunController({ workerManifest }).start(); +new ManagedRunController({ + workerManifest, + env: stdEnv, +}).start(); diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts new file mode 100644 index 0000000000..fa1709e142 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -0,0 +1,1482 @@ +import { TaskRunProcess } from "../../executions/taskRunProcess.js"; +import { + type CompleteRunAttemptResult, + type RunExecutionData, + SuspendedProcessError, + type TaskRunExecutionMetrics, + type TaskRunExecutionResult, + type TaskRunFailedExecutionResult, + WorkerManifest, +} from 
"@trigger.dev/core/v3"; +import { + WarmStartClient, + WORKLOAD_HEADERS, + type WorkloadClientToServerEvents, + WorkloadHttpClient, + type WorkloadServerToClientEvents, + type WorkloadRunAttemptStartResponseBody, +} from "@trigger.dev/core/v3/workers"; +import { assertExhaustive } from "../../utilities/assertExhaustive.js"; +import { setTimeout as sleep } from "timers/promises"; +import { io, type Socket } from "socket.io-client"; +import { RunnerEnv } from "./env.js"; +import { MetadataClient } from "./overrides.js"; +import { RunLogger, SendDebugLogOptions } from "./logger.js"; +import { RunExecutionHeartbeat } from "./heartbeat.js"; +import { RunExecutionSnapshotPoller } from "./poller.js"; +import { EnvObject } from "std-env"; + +type ManagedRunControllerOptions = { + workerManifest: WorkerManifest; + env: EnvObject; +}; + +type Run = { + friendlyId: string; + attemptNumber?: number | null; +}; + +type Snapshot = { + friendlyId: string; +}; + +type SupervisorSocket = Socket; + +export class ManagedRunController { + private readonly env: RunnerEnv; + + private taskRunProcess?: TaskRunProcess; + + private readonly workerManifest: WorkerManifest; + + private readonly httpClient: WorkloadHttpClient; + private readonly warmStartClient: WarmStartClient | undefined; + private readonly metadataClient?: MetadataClient; + + private socket: SupervisorSocket; + private readonly logger: RunLogger; + + private readonly runHeartbeat: RunExecutionHeartbeat; + private readonly snapshotPoller: RunExecutionSnapshotPoller; + + constructor(opts: ManagedRunControllerOptions) { + const env = new RunnerEnv(opts.env); + this.env = env; + + this.workerManifest = opts.workerManifest; + + this.httpClient = new WorkloadHttpClient({ + workerApiUrl: this.workerApiUrl, + runnerId: this.runnerId, + deploymentId: env.TRIGGER_DEPLOYMENT_ID, + deploymentVersion: env.TRIGGER_DEPLOYMENT_VERSION, + projectRef: env.TRIGGER_PROJECT_REF, + }); + + this.logger = new RunLogger({ + httpClient: 
this.httpClient, + env, + }); + + const properties = { + ...env.raw, + TRIGGER_POD_SCHEDULED_AT_MS: env.TRIGGER_POD_SCHEDULED_AT_MS.toISOString(), + TRIGGER_DEQUEUED_AT_MS: env.TRIGGER_DEQUEUED_AT_MS.toISOString(), + }; + + this.sendDebugLog({ + runId: env.TRIGGER_RUN_ID, + message: "Creating run controller", + properties, + }); + + if (env.TRIGGER_METADATA_URL) { + this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL); + } + + if (env.TRIGGER_WARM_START_URL) { + this.warmStartClient = new WarmStartClient({ + apiUrl: new URL(env.TRIGGER_WARM_START_URL), + controllerId: env.TRIGGER_WORKLOAD_CONTROLLER_ID, + deploymentId: env.TRIGGER_DEPLOYMENT_ID, + deploymentVersion: env.TRIGGER_DEPLOYMENT_VERSION, + machineCpu: env.TRIGGER_MACHINE_CPU, + machineMemory: env.TRIGGER_MACHINE_MEMORY, + }); + } + + this.snapshotPoller = new RunExecutionSnapshotPoller({ + // @ts-expect-error + runFriendlyId: env.TRIGGER_RUN_ID, + // @ts-expect-error + snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, + httpClient: this.httpClient, + logger: this.logger, + snapshotPollIntervalSeconds: this.snapshotPollIntervalSeconds, + handleSnapshotChange: this.handleSnapshotChange.bind(this), + }); + + this.runHeartbeat = new RunExecutionHeartbeat({ + // @ts-expect-error + runFriendlyId: env.TRIGGER_RUN_ID, + // @ts-expect-error + snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, + httpClient: this.httpClient, + logger: this.logger, + heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, + }); + + // Websocket notifications are only an optimisation so we don't need to wait for a successful connection + this.socket = this.createSupervisorSocket(); + + process.on("SIGTERM", async () => { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Received SIGTERM, stopping worker", + }); + await this.stop(); + }); + } + + // These settings depend on env vars that may be overridden, e.g. 
after runs and restores + + get heartbeatIntervalSeconds() { + return this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; + } + + get snapshotPollIntervalSeconds() { + return this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; + } + + get runnerId() { + return this.env.TRIGGER_RUNNER_ID; + } + + get successExitCode() { + return this.env.TRIGGER_SUCCESS_EXIT_CODE; + } + + get failureExitCode() { + return this.env.TRIGGER_FAILURE_EXIT_CODE; + } + + get workerApiUrl() { + return this.env.TRIGGER_SUPERVISOR_API_URL; + } + + get workerInstanceName() { + return this.env.TRIGGER_WORKER_INSTANCE_NAME; + } + + private state: + | { + phase: "RUN"; + run: Run; + snapshot: Snapshot; + } + | { + phase: "IDLE" | "WARM_START"; + } = { phase: "IDLE" }; + + private enterRunPhase(run: Run, snapshot: Snapshot) { + this.onExitRunPhase(run); + this.state = { phase: "RUN", run, snapshot }; + + this.runHeartbeat.start(); + this.snapshotPoller.start(); + } + + private enterWarmStartPhase() { + this.onExitRunPhase(); + this.state = { phase: "WARM_START" }; + } + + // This should only be used when we're already executing a run. Attempt number changes are not allowed. 
+ private updateRunPhase(run: Run, snapshot: Snapshot) { + if (this.state.phase !== "RUN") { + this.sendDebugLog({ + runId: run.friendlyId, + message: `updateRunPhase: Invalid phase for updating snapshot: ${this.state.phase}`, + properties: { + currentPhase: this.state.phase, + snapshotId: snapshot.friendlyId, + }, + }); + + throw new Error(`Invalid phase for updating snapshot: ${this.state.phase}`); + } + + if (this.state.run.friendlyId !== run.friendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: `updateRunPhase: Mismatched run IDs`, + properties: { + currentRunId: this.state.run.friendlyId, + newRunId: run.friendlyId, + currentSnapshotId: this.state.snapshot.friendlyId, + newSnapshotId: snapshot.friendlyId, + }, + }); + + throw new Error("Mismatched run IDs"); + } + + if (this.state.snapshot.friendlyId === snapshot.friendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "updateRunPhase: Snapshot not changed", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: `updateRunPhase: Snapshot not changed`, + properties: { + snapshotId: snapshot.friendlyId, + }, + }); + + return; + } + + if (this.state.run.attemptNumber !== run.attemptNumber) { + this.sendDebugLog({ + runId: run.friendlyId, + message: `updateRunPhase: Attempt number changed`, + properties: { + oldAttemptNumber: this.state.run.attemptNumber ?? undefined, + newAttemptNumber: run.attemptNumber ?? 
undefined, + }, + }); + throw new Error("Attempt number changed"); + } + + this.state = { + phase: "RUN", + run: { + friendlyId: run.friendlyId, + attemptNumber: run.attemptNumber, + }, + snapshot: { + friendlyId: snapshot.friendlyId, + }, + }; + } + + private onExitRunPhase(newRun: Run | undefined = undefined) { + // We're not in a run phase, nothing to do + if (this.state.phase !== "RUN") { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "onExitRunPhase: Not in run phase, skipping", + properties: { phase: this.state.phase }, + }); + return; + } + + // This is still the same run, so we're not exiting the phase + if (newRun?.friendlyId === this.state.run.friendlyId) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "onExitRunPhase: Same run, skipping", + properties: { newRun: newRun?.friendlyId }, + }); + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "onExitRunPhase: Exiting run phase", + properties: { newRun: newRun?.friendlyId }, + }); + + this.runHeartbeat.stop(); + this.snapshotPoller.stop(); + + const { run, snapshot } = this.state; + + this.unsubscribeFromRunNotifications({ run, snapshot }); + } + + private subscribeToRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { + this.socket.emit("run:start", { + version: "1", + run: { + friendlyId: run.friendlyId, + }, + snapshot: { + friendlyId: snapshot.friendlyId, + }, + }); + } + + private unsubscribeFromRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { + this.socket.emit("run:stop", { + version: "1", + run: { + friendlyId: run.friendlyId, + }, + snapshot: { + friendlyId: snapshot.friendlyId, + }, + }); + } + + private get runFriendlyId() { + if (this.state.phase !== "RUN") { + return undefined; + } + + return this.state.run.friendlyId; + } + + private get snapshotFriendlyId() { + if (this.state.phase !== "RUN") { + return; + } + + return this.state.snapshot.friendlyId; + } + + private handleSnapshotChangeLock = 
false; + + private async handleSnapshotChange({ + run, + snapshot, + completedWaitpoints, + }: Pick) { + if (this.handleSnapshotChangeLock) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: already in progress", + }); + return; + } + + this.handleSnapshotChangeLock = true; + + try { + if (!this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: Missing snapshot ID", + properties: { + newSnapshotId: snapshot.friendlyId, + newSnapshotStatus: snapshot.executionStatus, + }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: missing snapshot ID", + properties: { + newSnapshotId: snapshot.friendlyId, + newSnapshotStatus: snapshot.executionStatus, + }, + }); + + return; + } + + if (this.snapshotFriendlyId === snapshot.friendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: snapshot not changed, skipping", + properties: { snapshot: snapshot.friendlyId }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: skipping, no change", + properties: { + snapshotId: this.snapshotFriendlyId, + snapshotStatus: snapshot.executionStatus, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: `snapshot change: ${snapshot.executionStatus}`, + properties: { + oldSnapshotId: this.snapshotFriendlyId, + newSnapshotId: snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + }, + }); + + try { + this.updateRunPhase(run, snapshot); + + this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); + this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: failed to update run phase", + properties: { + currentPhase: this.state.phase, + error: error instanceof Error ? 
error.message : String(error), + }, + }); + + this.waitForNextRun(); + return; + } + + switch (snapshot.executionStatus) { + case "PENDING_CANCEL": { + try { + await this.cancelAttempt(run.friendlyId); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: failed to cancel attempt", + properties: { + error: error instanceof Error ? error.message : String(error), + }, + }); + + this.waitForNextRun(); + return; + } + + return; + } + case "FINISHED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is finished, will wait for next run", + }); + + if (this.activeRunExecution) { + // Let's pretend we've just suspended the run. This will kill the process and should automatically wait for the next run. + // We still explicitly call waitForNextRun() afterwards in case of race conditions. Locks will prevent this from causing issues. + await this.taskRunProcess?.suspend(); + } + + this.waitForNextRun(); + + return; + } + case "QUEUED_EXECUTING": + case "EXECUTING_WITH_WAITPOINTS": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is executing with waitpoints", + properties: { snapshot: snapshot.friendlyId }, + }); + + try { + // This should never throw. It should also never fail the run. + await this.taskRunProcess?.cleanup(false); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to cleanup task run process", + properties: { error: error instanceof Error ? 
error.message : String(error) }, + }); + } + + if (snapshot.friendlyId !== this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Snapshot changed after cleanup, abort", + properties: { + oldSnapshotId: snapshot.friendlyId, + newSnapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + await sleep(this.env.TRIGGER_PRE_SUSPEND_WAIT_MS); + + if (snapshot.friendlyId !== this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Snapshot changed after suspend threshold, abort", + properties: { + oldSnapshotId: snapshot.friendlyId, + newSnapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + if (!this.runFriendlyId || !this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: + "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", + properties: { + runId: this.runFriendlyId, + snapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + const suspendResult = await this.httpClient.suspendRun( + this.runFriendlyId, + this.snapshotFriendlyId + ); + + if (!suspendResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to suspend run, staying alive 🎶", + properties: { + error: suspendResult.error, + }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "checkpoint: suspend request failed", + properties: { + snapshotId: snapshot.friendlyId, + error: suspendResult.error, + }, + }); + + return; + } + + if (!suspendResult.data.ok) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "checkpoint: failed to suspend run", + properties: { + snapshotId: snapshot.friendlyId, + error: suspendResult.data.error, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Suspending, any day now 🚬", + properties: { ok: suspendResult.data.ok }, + }); + return; + } + case "SUSPENDED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run was suspended, kill the process and 
wait for more runs", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + // This will kill the process and fail the execution with a SuspendedProcessError + await this.taskRunProcess?.suspend(); + + return; + } + case "PENDING_EXECUTING": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is pending execution", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + if (completedWaitpoints.length === 0) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "No waitpoints to complete, nothing to do", + }); + return; + } + + // There are waitpoints to complete so we've been restored after being suspended + + // Short delay to give websocket time to reconnect + await sleep(100); + + // Env may have changed after restore + await this.processEnvOverrides(); + + // We need to let the platform know we're ready to continue + const continuationResult = await this.httpClient.continueRunExecution( + run.friendlyId, + snapshot.friendlyId + ); + + if (!continuationResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "failed to continue execution", + properties: { + error: continuationResult.error, + }, + }); + + this.waitForNextRun(); + return; + } + + return; + } + case "EXECUTING": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is now executing", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + if (completedWaitpoints.length === 0) { + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Processing completed waitpoints", + properties: { completedWaitpoints: completedWaitpoints.length }, + }); + + if (!this.taskRunProcess) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "No task run process, ignoring completed waitpoints", + properties: { completedWaitpoints: completedWaitpoints.length }, + }); + return; + } + + for (const waitpoint of completedWaitpoints) { + 
this.taskRunProcess.waitpointCompleted(waitpoint); + } + + return; + } + case "RUN_CREATED": + case "QUEUED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Status change not handled", + properties: { status: snapshot.executionStatus }, + }); + return; + } + default: { + assertExhaustive(snapshot.executionStatus); + } + } + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: unexpected error", + properties: { + snapshotId: snapshot.friendlyId, + error: error instanceof Error ? error.message : String(error), + }, + }); + } finally { + this.handleSnapshotChangeLock = false; + } + } + + private async processEnvOverrides() { + if (!this.metadataClient) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "No metadata client, skipping env overrides", + }); + return; + } + + const overrides = await this.metadataClient.getEnvOverrides(); + + if (!overrides) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "No env overrides, skipping", + }); + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Processing env overrides", + properties: { ...overrides }, + }); + + // Override the env with the new values + this.env.override(overrides); + + // Update services and clients with the new values + if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { + this.runHeartbeat.updateInterval(this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); + } + if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { + this.snapshotPoller.updateInterval(this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); + } + if ( + overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || + overrides.TRIGGER_SUPERVISOR_API_DOMAIN || + overrides.TRIGGER_SUPERVISOR_API_PORT + ) { + this.httpClient.updateApiUrl(this.workerApiUrl); + } + if (overrides.TRIGGER_RUNNER_ID) { + this.httpClient.updateRunnerId(this.runnerId); + } + } + + private activeRunExecution: Promise | null = null; + + private async 
startAndExecuteRunAttempt({ + runFriendlyId, + snapshotFriendlyId, + dequeuedAt, + podScheduledAt, + isWarmStart, + skipLockCheckForImmediateRetry: skipLockCheck, + }: { + runFriendlyId: string; + snapshotFriendlyId: string; + dequeuedAt?: Date; + podScheduledAt?: Date; + isWarmStart?: boolean; + skipLockCheckForImmediateRetry?: boolean; + }) { + if (!skipLockCheck && this.activeRunExecution) { + this.sendDebugLog({ + runId: runFriendlyId, + message: "startAndExecuteRunAttempt: already in progress", + }); + return; + } + + const execution = async () => { + if (!this.socket) { + this.sendDebugLog({ + runId: runFriendlyId, + message: "Starting run without socket connection", + }); + } + + this.subscribeToRunNotifications({ + run: { friendlyId: runFriendlyId }, + snapshot: { friendlyId: snapshotFriendlyId }, + }); + + const attemptStartedAt = Date.now(); + + const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, { + isWarmStart, + }); + + if (!start.success) { + this.sendDebugLog({ + runId: runFriendlyId, + message: "Failed to start run", + properties: { error: start.error }, + }); + + this.sendDebugLog({ + runId: runFriendlyId, + message: "failed to start run attempt", + properties: { + error: start.error, + }, + }); + + this.waitForNextRun(); + return; + } + + const attemptDuration = Date.now() - attemptStartedAt; + + const { run, snapshot, execution, envVars } = start.data; + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Started run", + properties: { snapshot: snapshot.friendlyId }, + }); + + this.enterRunPhase(run, snapshot); + + const metrics = [ + { + name: "start", + event: "create_attempt", + timestamp: attemptStartedAt, + duration: attemptDuration, + }, + ] + .concat( + dequeuedAt + ? [ + { + name: "start", + event: "dequeue", + timestamp: dequeuedAt.getTime(), + duration: 0, + }, + ] + : [] + ) + .concat( + podScheduledAt + ? 
[ + { + name: "start", + event: "pod_scheduled", + timestamp: podScheduledAt.getTime(), + duration: 0, + }, + ] + : [] + ) satisfies TaskRunExecutionMetrics; + + const taskRunEnv = { + ...this.env.gatherProcessEnv(), + ...envVars, + }; + + try { + return await this.executeRun({ + run, + snapshot, + envVars: taskRunEnv, + execution, + metrics, + isWarmStart, + }); + } catch (error) { + if (error instanceof SuspendedProcessError) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run was suspended and task run process was killed, waiting for next run", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + this.waitForNextRun(); + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Error while executing attempt", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Submitting attempt completion", + properties: { + snapshotId: snapshot.friendlyId, + updatedSnapshotId: this.snapshotFriendlyId, + }, + }); + + const completion = { + id: execution.run.id, + ok: false, + retry: undefined, + error: TaskRunProcess.parseExecuteError(error), + } satisfies TaskRunFailedExecutionResult; + + const completionResult = await this.httpClient.completeRunAttempt( + run.friendlyId, + // FIXME: if the snapshot has changed since starting the run, this won't be accurate + // ..but we probably shouldn't fetch the latest snapshot either because we may be in an "unhealthy" state while the next runner has already taken over + this.snapshotFriendlyId ?? 
snapshot.friendlyId, + { completion } + ); + + if (!completionResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to submit completion after error", + properties: { error: completionResult.error }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "completion: failed to submit after error", + properties: { + error: completionResult.error, + }, + }); + + this.waitForNextRun(); + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Attempt completion submitted after error", + properties: { + attemptStatus: completionResult.data.result.attemptStatus, + runId: completionResult.data.result.run.friendlyId, + snapshotId: completionResult.data.result.snapshot.friendlyId, + }, + }); + + try { + await this.handleCompletionResult(completion, completionResult.data.result); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to handle completion result after error", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + + this.waitForNextRun(); + return; + } + } + }; + + this.activeRunExecution = execution(); + + try { + await this.activeRunExecution; + } catch (error) { + this.sendDebugLog({ + runId: runFriendlyId, + message: "startAndExecuteRunAttempt: unexpected error", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + } finally { + this.activeRunExecution = null; + } + } + + private waitForNextRunLock = false; + + /** This will kill the child process before spinning up a new one. It will never throw, + * but may exit the process on any errors or when no runs are available after the + * configured duration. 
*/ + private async waitForNextRun() { + if (this.waitForNextRunLock) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: already in progress", + }); + return; + } + + this.waitForNextRunLock = true; + const previousRunId = this.runFriendlyId; + + try { + // If there's a run execution in progress, we need to kill it and wait for it to finish + if (this.activeRunExecution) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: waiting for existing run execution to finish", + }); + await this.activeRunExecution; + } + + // Just for good measure + await this.taskRunProcess?.kill("SIGKILL"); + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: waiting for next run", + }); + + this.enterWarmStartPhase(); + + if (!this.warmStartClient) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: warm starts disabled, shutting down", + }); + this.exitProcess(this.successExitCode); + } + + if (this.taskRunProcess) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: eagerly recreating task run process with options", + }); + this.taskRunProcess = new TaskRunProcess({ + ...this.taskRunProcess.options, + isWarmStart: true, + }).initialize(); + } else { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: no existing task run process, so we can't eagerly recreate it", + }); + } + + // Check the service is up and get additional warm start config + const connect = await this.warmStartClient.connect(); + + if (!connect.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: failed to connect to warm start service", + properties: { + warmStartUrl: this.env.TRIGGER_WARM_START_URL, + error: connect.error, + }, + }); + this.exitProcess(this.successExitCode); + } + + const connectionTimeoutMs = + connect.data.connectionTimeoutMs ?? 
this.env.TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS; + const keepaliveMs = connect.data.keepaliveMs ?? this.env.TRIGGER_WARM_START_KEEPALIVE_MS; + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: connected to warm start service", + properties: { + connectionTimeoutMs, + keepaliveMs, + }, + }); + + if (previousRunId) { + this.sendDebugLog({ + runId: previousRunId, + message: "warm start: received config", + properties: { + connectionTimeoutMs, + keepaliveMs, + }, + }); + } + + if (!connectionTimeoutMs || !keepaliveMs) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: warm starts disabled after connect", + properties: { + connectionTimeoutMs, + keepaliveMs, + }, + }); + this.exitProcess(this.successExitCode); + } + + const nextRun = await this.warmStartClient.warmStart({ + workerInstanceName: this.workerInstanceName, + connectionTimeoutMs, + keepaliveMs, + }); + + if (!nextRun) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: warm start failed, shutting down", + }); + this.exitProcess(this.successExitCode); + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: got next run", + properties: { nextRun: nextRun.run.friendlyId }, + }); + + this.startAndExecuteRunAttempt({ + runFriendlyId: nextRun.run.friendlyId, + snapshotFriendlyId: nextRun.snapshot.friendlyId, + dequeuedAt: nextRun.dequeuedAt, + isWarmStart: true, + }).finally(() => {}); + return; + } catch (error) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: unexpected error", + properties: { error: error instanceof Error ? 
error.message : String(error) }, + }); + this.exitProcess(this.failureExitCode); + } finally { + this.waitForNextRunLock = false; + } + } + + private exitProcess(code?: number): never { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Exiting process", + properties: { code }, + }); + if (this.taskRunProcess?.isPreparedForNextRun) { + this.taskRunProcess.forceExit(); + } + process.exit(code); + } + + createSupervisorSocket(): SupervisorSocket { + const wsUrl = new URL("/workload", this.workerApiUrl); + + const socket = io(wsUrl.href, { + transports: ["websocket"], + extraHeaders: { + [WORKLOAD_HEADERS.DEPLOYMENT_ID]: this.env.TRIGGER_DEPLOYMENT_ID, + [WORKLOAD_HEADERS.RUNNER_ID]: this.env.TRIGGER_RUNNER_ID, + }, + }) satisfies SupervisorSocket; + + socket.on("run:notify", async ({ version, run }) => { + this.sendDebugLog({ + runId: run.friendlyId, + message: "run:notify received by runner", + properties: { version, runId: run.friendlyId }, + }); + + if (!this.runFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "run:notify: ignoring notification, no local run ID", + properties: { + currentRunId: this.runFriendlyId, + currentSnapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + if (run.friendlyId !== this.runFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "run:notify: ignoring notification for different run", + properties: { + currentRunId: this.runFriendlyId, + currentSnapshotId: this.snapshotFriendlyId, + notificationRunId: run.friendlyId, + }, + }); + return; + } + + // Reset the (fallback) snapshot poll interval so we don't do unnecessary work + this.snapshotPoller.resetCurrentInterval(); + + const latestSnapshot = await this.httpClient.getRunExecutionData(this.runFriendlyId); + + if (!latestSnapshot.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "run:notify: failed to get latest snapshot data", + properties: { + currentRunId: this.runFriendlyId, + currentSnapshotId: 
this.snapshotFriendlyId, + error: latestSnapshot.error, + }, + }); + return; + } + + await this.handleSnapshotChange(latestSnapshot.data.execution); + }); + + socket.on("connect", () => { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Connected to supervisor", + }); + + // This should handle the case where we reconnect after being restored + if (this.state.phase === "RUN") { + const { run, snapshot } = this.state; + this.subscribeToRunNotifications({ run, snapshot }); + } + }); + + socket.on("connect_error", (error) => { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Connection error", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + }); + + socket.on("disconnect", (reason, description) => { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Disconnected from supervisor", + properties: { reason, description: description?.toString() }, + }); + }); + + return socket; + } + + private async executeRun({ + run, + snapshot, + envVars, + execution, + metrics, + isWarmStart, + }: WorkloadRunAttemptStartResponseBody & { + metrics?: TaskRunExecutionMetrics; + isWarmStart?: boolean; + }) { + this.snapshotPoller.start(); + + if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { + this.taskRunProcess = new TaskRunProcess({ + workerManifest: this.workerManifest, + env: envVars, + serverWorker: { + id: "unmanaged", + contentHash: this.env.TRIGGER_CONTENT_HASH, + version: this.env.TRIGGER_DEPLOYMENT_VERSION, + engine: "V2", + }, + machine: execution.machine, + isWarmStart, + }).initialize(); + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "executing task run process", + properties: { + attemptId: execution.attempt.id, + runId: execution.run.id, + }, + }); + + const completion = await this.taskRunProcess.execute( + { + payload: { + execution, + traceContext: execution.run.traceContext ?? 
{}, + metrics, + }, + messageId: run.friendlyId, + env: envVars, + }, + isWarmStart + ); + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Completed run", + properties: { completion: completion.ok }, + }); + + try { + // The execution has finished, so we can cleanup the task run process. Killing it should be safe. + await this.taskRunProcess.cleanup(true); + } catch (error) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to cleanup task run process, submitting completion anyway", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + } + + if (!this.runFriendlyId || !this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "executeRun: Missing run ID or snapshot ID after execution", + properties: { + runId: this.runFriendlyId, + snapshotId: this.snapshotFriendlyId, + }, + }); + + this.waitForNextRun(); + return; + } + + const completionResult = await this.httpClient.completeRunAttempt( + this.runFriendlyId, + this.snapshotFriendlyId, + { + completion, + } + ); + + if (!completionResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "completion: failed to submit", + properties: { + error: completionResult.error, + }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "completion: failed to submit", + properties: { + error: completionResult.error, + }, + }); + + this.waitForNextRun(); + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Attempt completion submitted", + properties: { + attemptStatus: completionResult.data.result.attemptStatus, + runId: completionResult.data.result.run.friendlyId, + snapshotId: completionResult.data.result.snapshot.friendlyId, + }, + }); + + try { + await this.handleCompletionResult(completion, completionResult.data.result); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to handle completion result", + properties: { error: error 
instanceof Error ? error.message : String(error) }, + }); + + this.waitForNextRun(); + return; + } + } + + private async handleCompletionResult( + completion: TaskRunExecutionResult, + result: CompleteRunAttemptResult + ) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Handling completion result", + properties: { + completion: completion.ok, + attemptStatus: result.attemptStatus, + snapshotId: result.snapshot.friendlyId, + runId: result.run.friendlyId, + }, + }); + + const { attemptStatus, snapshot: completionSnapshot, run } = result; + + try { + this.updateRunPhase(run, completionSnapshot); + } catch (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to update run phase after completion", + properties: { error: error instanceof Error ? error.message : String(error) }, + }); + + this.waitForNextRun(); + return; + } + + if (attemptStatus === "RUN_FINISHED") { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run finished", + }); + + this.waitForNextRun(); + return; + } + + if (attemptStatus === "RUN_PENDING_CANCEL") { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run pending cancel", + }); + return; + } + + if (attemptStatus === "RETRY_QUEUED") { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Retry queued", + }); + + this.waitForNextRun(); + return; + } + + if (attemptStatus === "RETRY_IMMEDIATELY") { + if (completion.ok) { + throw new Error("Should retry but completion OK."); + } + + if (!completion.retry) { + throw new Error("Should retry but missing retry params."); + } + + await sleep(completion.retry.delay); + + if (!this.snapshotFriendlyId) { + throw new Error("Missing snapshot ID after retry"); + } + + this.startAndExecuteRunAttempt({ + runFriendlyId: run.friendlyId, + snapshotFriendlyId: this.snapshotFriendlyId, + skipLockCheckForImmediateRetry: true, + isWarmStart: true, + }).finally(() => {}); + return; + } + + assertExhaustive(attemptStatus); + } + + sendDebugLog(opts: 
SendDebugLogOptions) { + this.logger.sendDebugLog(opts); + } + + async cancelAttempt(runId: string) { + this.sendDebugLog({ + runId, + message: "cancelling attempt", + properties: { runId }, + }); + + await this.taskRunProcess?.cancel(); + } + + start() { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Starting up", + }); + + // If we have run and snapshot IDs, we can start an attempt immediately + if (this.env.TRIGGER_RUN_ID && this.env.TRIGGER_SNAPSHOT_ID) { + this.startAndExecuteRunAttempt({ + runFriendlyId: this.env.TRIGGER_RUN_ID, + snapshotFriendlyId: this.env.TRIGGER_SNAPSHOT_ID, + dequeuedAt: this.env.TRIGGER_DEQUEUED_AT_MS, + podScheduledAt: this.env.TRIGGER_POD_SCHEDULED_AT_MS, + }).finally(() => {}); + return; + } + + // ..otherwise we need to wait for a run + this.waitForNextRun(); + return; + } + + async stop() { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Shutting down", + }); + + if (this.taskRunProcess) { + await this.taskRunProcess.cleanup(true); + } + + this.runHeartbeat.stop(); + this.snapshotPoller.stop(); + + this.socket.close(); + } +} diff --git a/packages/cli-v3/src/entryPoints/managed/env.ts b/packages/cli-v3/src/entryPoints/managed/env.ts index 2e15276971..1355f68d82 100644 --- a/packages/cli-v3/src/entryPoints/managed/env.ts +++ b/packages/cli-v3/src/entryPoints/managed/env.ts @@ -1,6 +1,7 @@ import { randomUUID } from "node:crypto"; import { Metadata } from "./overrides.js"; import { z } from "zod"; +import { EnvObject } from "std-env"; const DateEnv = z .string() @@ -54,7 +55,7 @@ export class RunnerEnv { private env: Env; public readonly initial: Env; - constructor(env: Record) { + constructor(env: EnvObject) { this.env = Env.parse(env); this.initial = { ...this.env }; } From 1f76bc7f0d184a49fcf6cfeb5dfc1a6fdb397cc2 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:52:21 +0100 Subject: [PATCH 19/35] add warm start count to all controller logs 
--- .../src/entryPoints/managed/controller.ts | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index fa1709e142..507823f3fc 100644 --- a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -59,6 +59,8 @@ export class ManagedRunController { private readonly runHeartbeat: RunExecutionHeartbeat; private readonly snapshotPoller: RunExecutionSnapshotPoller; + private warmStartCount = 0; + constructor(opts: ManagedRunControllerOptions) { const env = new RunnerEnv(opts.env); this.env = env; @@ -138,6 +140,12 @@ export class ManagedRunController { }); } + get metrics() { + return { + warmStartCount: this.warmStartCount, + }; + } + // These settings depend on env vars that may be overridden, e.g. after runs and restores get heartbeatIntervalSeconds() { @@ -1076,6 +1084,8 @@ export class ManagedRunController { this.exitProcess(this.successExitCode); } + this.warmStartCount++; + this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: got next run", @@ -1429,7 +1439,13 @@ export class ManagedRunController { } sendDebugLog(opts: SendDebugLogOptions) { - this.logger.sendDebugLog(opts); + this.logger.sendDebugLog({ + ...opts, + properties: { + ...opts.properties, + warmStartCount: this.warmStartCount, + }, + }); } async cancelAttempt(runId: string) { From e77b14bf64680b0e0a4916ed4a98e434852dc04e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:54:06 +0100 Subject: [PATCH 20/35] add restore count --- packages/cli-v3/src/entryPoints/managed/controller.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index 507823f3fc..26bb9dbb43 100644 --- 
a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -60,6 +60,7 @@ export class ManagedRunController { private readonly snapshotPoller: RunExecutionSnapshotPoller; private warmStartCount = 0; + private restoreCount = 0; constructor(opts: ManagedRunControllerOptions) { const env = new RunnerEnv(opts.env); @@ -143,6 +144,7 @@ export class ManagedRunController { get metrics() { return { warmStartCount: this.warmStartCount, + restoreCount: this.restoreCount, }; } @@ -601,6 +603,7 @@ export class ManagedRunController { } // There are waitpoints to complete so we've been restored after being suspended + this.restoreCount++; // Short delay to give websocket time to reconnect await sleep(100); @@ -1444,6 +1447,7 @@ export class ManagedRunController { properties: { ...opts.properties, warmStartCount: this.warmStartCount, + restoreCount: this.restoreCount, }, }); } From 71cd80e4989884c1de24308dff36a5461564a957 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 14 Apr 2025 11:29:12 +0100 Subject: [PATCH 21/35] pull out run execution logic --- .../run-engine/src/engine/db/worker.ts | 3 +- packages/cli-v3/e2e/utils.ts | 22 +- .../src/entryPoints/dev-run-controller.ts | 2 +- .../src/entryPoints/managed/controller.ts | 1137 ++--------------- .../src/entryPoints/managed/execution.ts | 1019 +++++++++++++++ .../cli-v3/src/executions/taskRunProcess.ts | 6 +- packages/core/src/utils.ts | 8 +- packages/core/src/v3/machines/index.ts | 20 +- packages/core/src/v3/schemas/common.ts | 2 + 9 files changed, 1138 insertions(+), 1081 deletions(-) create mode 100644 packages/cli-v3/src/entryPoints/managed/execution.ts diff --git a/internal-packages/run-engine/src/engine/db/worker.ts b/internal-packages/run-engine/src/engine/db/worker.ts index 34abf2cd32..d431f7397d 100644 --- a/internal-packages/run-engine/src/engine/db/worker.ts +++ b/internal-packages/run-engine/src/engine/db/worker.ts 
@@ -193,7 +193,7 @@ export async function getWorkerDeploymentFromWorker( prisma: PrismaClientOrTransaction, workerId: string ): Promise { - const worker = await prisma.backgroundWorker.findUnique({ + const worker = await prisma.backgroundWorker.findFirst({ where: { id: workerId, }, @@ -264,6 +264,7 @@ export async function getWorkerFromCurrentlyPromotedDeployment( prisma: PrismaClientOrTransaction, environmentId: string ): Promise { + // TODO: fixme const promotion = await prisma.workerDeploymentPromotion.findUnique({ where: { environmentId_label: { diff --git a/packages/cli-v3/e2e/utils.ts b/packages/cli-v3/e2e/utils.ts index be158ef599..73530208c7 100644 --- a/packages/cli-v3/e2e/utils.ts +++ b/packages/cli-v3/e2e/utils.ts @@ -8,6 +8,7 @@ import { TaskRunProcess } from "../src/executions/taskRunProcess.js"; import { createTestHttpServer } from "@epic-web/test-server/http"; import { TestCase, TestCaseRun } from "./fixtures.js"; import { access } from "node:fs/promises"; +import { MachinePreset } from "@trigger.dev/core/v3"; export type PackageManager = "npm" | "pnpm" | "yarn"; @@ -295,6 +296,13 @@ export async function executeTestCaseRun({ }, }); + const machine = { + name: "small-1x", + cpu: 1, + memory: 256, + centsPerMs: 0.0000001, + } satisfies MachinePreset; + try { const taskRunProcess = new TaskRunProcess({ workerManifest: workerManifest!, @@ -314,12 +322,7 @@ export async function executeTestCaseRun({ version: "1.0.0", contentHash, }, - machine: { - name: "small-1x", - cpu: 1, - memory: 256, - centsPerMs: 0.0000001, - }, + machineResources: machine, }).initialize(); const result = await taskRunProcess.execute({ @@ -372,12 +375,7 @@ export async function executeTestCaseRun({ ref: "main", name: "test", }, - machine: { - name: "small-1x", - cpu: 1, - memory: 256, - centsPerMs: 0.0000001, - }, + machine, }, }, messageId: "run_1234", diff --git a/packages/cli-v3/src/entryPoints/dev-run-controller.ts b/packages/cli-v3/src/entryPoints/dev-run-controller.ts index 
ccfc68e259..d6660c69d4 100644 --- a/packages/cli-v3/src/entryPoints/dev-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/dev-run-controller.ts @@ -619,7 +619,7 @@ export class DevRunController { version: this.opts.worker.serverWorker?.version, engine: "V2", }, - machine: execution.machine, + machineResources: execution.machine, }).initialize(); logger.debug("executing task run process", { diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index 26bb9dbb43..d88df789e3 100644 --- a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -1,67 +1,38 @@ -import { TaskRunProcess } from "../../executions/taskRunProcess.js"; -import { - type CompleteRunAttemptResult, - type RunExecutionData, - SuspendedProcessError, - type TaskRunExecutionMetrics, - type TaskRunExecutionResult, - type TaskRunFailedExecutionResult, - WorkerManifest, -} from "@trigger.dev/core/v3"; +import { WorkerManifest } from "@trigger.dev/core/v3"; import { WarmStartClient, WORKLOAD_HEADERS, type WorkloadClientToServerEvents, WorkloadHttpClient, type WorkloadServerToClientEvents, - type WorkloadRunAttemptStartResponseBody, } from "@trigger.dev/core/v3/workers"; -import { assertExhaustive } from "../../utilities/assertExhaustive.js"; -import { setTimeout as sleep } from "timers/promises"; import { io, type Socket } from "socket.io-client"; import { RunnerEnv } from "./env.js"; -import { MetadataClient } from "./overrides.js"; import { RunLogger, SendDebugLogOptions } from "./logger.js"; -import { RunExecutionHeartbeat } from "./heartbeat.js"; -import { RunExecutionSnapshotPoller } from "./poller.js"; import { EnvObject } from "std-env"; +import { RunExecution } from "./execution.js"; +import { tryCatch } from "@trigger.dev/core/utils"; type ManagedRunControllerOptions = { workerManifest: WorkerManifest; env: EnvObject; }; -type Run = { - friendlyId: string; - 
attemptNumber?: number | null; -}; - -type Snapshot = { - friendlyId: string; -}; - type SupervisorSocket = Socket; export class ManagedRunController { private readonly env: RunnerEnv; - - private taskRunProcess?: TaskRunProcess; - private readonly workerManifest: WorkerManifest; - private readonly httpClient: WorkloadHttpClient; private readonly warmStartClient: WarmStartClient | undefined; - private readonly metadataClient?: MetadataClient; - private socket: SupervisorSocket; private readonly logger: RunLogger; - private readonly runHeartbeat: RunExecutionHeartbeat; - private readonly snapshotPoller: RunExecutionSnapshotPoller; - private warmStartCount = 0; private restoreCount = 0; + private currentExecution: RunExecution | null = null; + constructor(opts: ManagedRunControllerOptions) { const env = new RunnerEnv(opts.env); this.env = env; @@ -93,10 +64,6 @@ export class ManagedRunController { properties, }); - if (env.TRIGGER_METADATA_URL) { - this.metadataClient = new MetadataClient(env.TRIGGER_METADATA_URL); - } - if (env.TRIGGER_WARM_START_URL) { this.warmStartClient = new WarmStartClient({ apiUrl: new URL(env.TRIGGER_WARM_START_URL), @@ -108,27 +75,6 @@ export class ManagedRunController { }); } - this.snapshotPoller = new RunExecutionSnapshotPoller({ - // @ts-expect-error - runFriendlyId: env.TRIGGER_RUN_ID, - // @ts-expect-error - snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, - httpClient: this.httpClient, - logger: this.logger, - snapshotPollIntervalSeconds: this.snapshotPollIntervalSeconds, - handleSnapshotChange: this.handleSnapshotChange.bind(this), - }); - - this.runHeartbeat = new RunExecutionHeartbeat({ - // @ts-expect-error - runFriendlyId: env.TRIGGER_RUN_ID, - // @ts-expect-error - snapshotFriendlyId: env.TRIGGER_SNAPSHOT_ID, - httpClient: this.httpClient, - logger: this.logger, - heartbeatIntervalSeconds: this.heartbeatIntervalSeconds, - }); - // Websocket notifications are only an optimisation so we don't need to wait for a successful connection 
this.socket = this.createSupervisorSocket(); @@ -148,16 +94,6 @@ export class ManagedRunController { }; } - // These settings depend on env vars that may be overridden, e.g. after runs and restores - - get heartbeatIntervalSeconds() { - return this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS; - } - - get snapshotPollIntervalSeconds() { - return this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS; - } - get runnerId() { return this.env.TRIGGER_RUNNER_ID; } @@ -178,567 +114,39 @@ export class ManagedRunController { return this.env.TRIGGER_WORKER_INSTANCE_NAME; } - private state: - | { - phase: "RUN"; - run: Run; - snapshot: Snapshot; - } - | { - phase: "IDLE" | "WARM_START"; - } = { phase: "IDLE" }; - - private enterRunPhase(run: Run, snapshot: Snapshot) { - this.onExitRunPhase(run); - this.state = { phase: "RUN", run, snapshot }; - - this.runHeartbeat.start(); - this.snapshotPoller.start(); - } - - private enterWarmStartPhase() { - this.onExitRunPhase(); - this.state = { phase: "WARM_START" }; - } - - // This should only be used when we're already executing a run. Attempt number changes are not allowed. 
- private updateRunPhase(run: Run, snapshot: Snapshot) { - if (this.state.phase !== "RUN") { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Invalid phase for updating snapshot: ${this.state.phase}`, - properties: { - currentPhase: this.state.phase, - snapshotId: snapshot.friendlyId, - }, - }); - - throw new Error(`Invalid phase for updating snapshot: ${this.state.phase}`); - } - - if (this.state.run.friendlyId !== run.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Mismatched run IDs`, - properties: { - currentRunId: this.state.run.friendlyId, - newRunId: run.friendlyId, - currentSnapshotId: this.state.snapshot.friendlyId, - newSnapshotId: snapshot.friendlyId, - }, - }); - - throw new Error("Mismatched run IDs"); - } - - if (this.state.snapshot.friendlyId === snapshot.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "updateRunPhase: Snapshot not changed", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Snapshot not changed`, - properties: { - snapshotId: snapshot.friendlyId, - }, - }); - - return; - } - - if (this.state.run.attemptNumber !== run.attemptNumber) { - this.sendDebugLog({ - runId: run.friendlyId, - message: `updateRunPhase: Attempt number changed`, - properties: { - oldAttemptNumber: this.state.run.attemptNumber ?? undefined, - newAttemptNumber: run.attemptNumber ?? 
undefined, - }, - }); - throw new Error("Attempt number changed"); - } - - this.state = { - phase: "RUN", - run: { - friendlyId: run.friendlyId, - attemptNumber: run.attemptNumber, - }, - snapshot: { - friendlyId: snapshot.friendlyId, - }, - }; - } - - private onExitRunPhase(newRun: Run | undefined = undefined) { - // We're not in a run phase, nothing to do - if (this.state.phase !== "RUN") { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Not in run phase, skipping", - properties: { phase: this.state.phase }, - }); - return; - } - - // This is still the same run, so we're not exiting the phase - if (newRun?.friendlyId === this.state.run.friendlyId) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Same run, skipping", - properties: { newRun: newRun?.friendlyId }, - }); - return; - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "onExitRunPhase: Exiting run phase", - properties: { newRun: newRun?.friendlyId }, - }); - - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); - - const { run, snapshot } = this.state; - - this.unsubscribeFromRunNotifications({ run, snapshot }); - } - - private subscribeToRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { + private subscribeToRunNotifications(runFriendlyId: string, snapshotFriendlyId: string) { this.socket.emit("run:start", { version: "1", run: { - friendlyId: run.friendlyId, + friendlyId: runFriendlyId, }, snapshot: { - friendlyId: snapshot.friendlyId, + friendlyId: snapshotFriendlyId, }, }); } - private unsubscribeFromRunNotifications({ run, snapshot }: { run: Run; snapshot: Snapshot }) { + private unsubscribeFromRunNotifications(runFriendlyId: string, snapshotFriendlyId: string) { this.socket.emit("run:stop", { version: "1", run: { - friendlyId: run.friendlyId, + friendlyId: runFriendlyId, }, snapshot: { - friendlyId: snapshot.friendlyId, + friendlyId: snapshotFriendlyId, }, }); } private get runFriendlyId() { - if 
(this.state.phase !== "RUN") { - return undefined; - } - - return this.state.run.friendlyId; + return this.currentExecution?.runFriendlyId; } private get snapshotFriendlyId() { - if (this.state.phase !== "RUN") { - return; - } - - return this.state.snapshot.friendlyId; + return this.currentExecution?.snapshotFriendlyId; } - private handleSnapshotChangeLock = false; - - private async handleSnapshotChange({ - run, - snapshot, - completedWaitpoints, - }: Pick) { - if (this.handleSnapshotChangeLock) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: already in progress", - }); - return; - } - - this.handleSnapshotChangeLock = true; - - try { - if (!this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: Missing snapshot ID", - properties: { - newSnapshotId: snapshot.friendlyId, - newSnapshotStatus: snapshot.executionStatus, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: missing snapshot ID", - properties: { - newSnapshotId: snapshot.friendlyId, - newSnapshotStatus: snapshot.executionStatus, - }, - }); - - return; - } - - if (this.snapshotFriendlyId === snapshot.friendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: snapshot not changed, skipping", - properties: { snapshot: snapshot.friendlyId }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: skipping, no change", - properties: { - snapshotId: this.snapshotFriendlyId, - snapshotStatus: snapshot.executionStatus, - }, - }); - - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: `snapshot change: ${snapshot.executionStatus}`, - properties: { - oldSnapshotId: this.snapshotFriendlyId, - newSnapshotId: snapshot.friendlyId, - completedWaitpoints: completedWaitpoints.length, - }, - }); - - try { - this.updateRunPhase(run, snapshot); - - this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); - 
this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: failed to update run phase", - properties: { - currentPhase: this.state.phase, - error: error instanceof Error ? error.message : String(error), - }, - }); - - this.waitForNextRun(); - return; - } - - switch (snapshot.executionStatus) { - case "PENDING_CANCEL": { - try { - await this.cancelAttempt(run.friendlyId); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: failed to cancel attempt", - properties: { - error: error instanceof Error ? error.message : String(error), - }, - }); - - this.waitForNextRun(); - return; - } - - return; - } - case "FINISHED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is finished, will wait for next run", - }); - - if (this.activeRunExecution) { - // Let's pretend we've just suspended the run. This will kill the process and should automatically wait for the next run. - // We still explicitly call waitForNextRun() afterwards in case of race conditions. Locks will prevent this from causing issues. - await this.taskRunProcess?.suspend(); - } - - this.waitForNextRun(); - - return; - } - case "QUEUED_EXECUTING": - case "EXECUTING_WITH_WAITPOINTS": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is executing with waitpoints", - properties: { snapshot: snapshot.friendlyId }, - }); - - try { - // This should never throw. It should also never fail the run. - await this.taskRunProcess?.cleanup(false); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to cleanup task run process", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - } - - if (snapshot.friendlyId !== this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after cleanup, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - await sleep(this.env.TRIGGER_PRE_SUSPEND_WAIT_MS); - - if (snapshot.friendlyId !== this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after suspend threshold, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - if (!this.runFriendlyId || !this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: - "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", - properties: { - runId: this.runFriendlyId, - snapshotId: this.snapshotFriendlyId, - }, - }); - return; - } - - const suspendResult = await this.httpClient.suspendRun( - this.runFriendlyId, - this.snapshotFriendlyId - ); - - if (!suspendResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to suspend run, staying alive 🎶", - properties: { - error: suspendResult.error, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: suspend request failed", - properties: { - snapshotId: snapshot.friendlyId, - error: suspendResult.error, - }, - }); - - return; - } - - if (!suspendResult.data.ok) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: failed to suspend run", - properties: { - snapshotId: snapshot.friendlyId, - error: suspendResult.data.error, - }, - }); - - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Suspending, any day now 🚬", - properties: { ok: suspendResult.data.ok }, - }); - return; - } - case "SUSPENDED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended, kill the process and 
wait for more runs", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - // This will kill the process and fail the execution with a SuspendedProcessError - await this.taskRunProcess?.suspend(); - - return; - } - case "PENDING_EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is pending execution", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - if (completedWaitpoints.length === 0) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No waitpoints to complete, nothing to do", - }); - return; - } - - // There are waitpoints to complete so we've been restored after being suspended - this.restoreCount++; - - // Short delay to give websocket time to reconnect - await sleep(100); - - // Env may have changed after restore - await this.processEnvOverrides(); - - // We need to let the platform know we're ready to continue - const continuationResult = await this.httpClient.continueRunExecution( - run.friendlyId, - snapshot.friendlyId - ); - - if (!continuationResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "failed to continue execution", - properties: { - error: continuationResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - return; - } - case "EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is now executing", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - if (completedWaitpoints.length === 0) { - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Processing completed waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); - - if (!this.taskRunProcess) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No task run process, ignoring completed waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); - return; - } - - for (const waitpoint of completedWaitpoints) { - 
this.taskRunProcess.waitpointCompleted(waitpoint); - } - - return; - } - case "RUN_CREATED": - case "QUEUED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Status change not handled", - properties: { status: snapshot.executionStatus }, - }); - return; - } - default: { - assertExhaustive(snapshot.executionStatus); - } - } - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: unexpected error", - properties: { - snapshotId: snapshot.friendlyId, - error: error instanceof Error ? error.message : String(error), - }, - }); - } finally { - this.handleSnapshotChangeLock = false; - } - } - - private async processEnvOverrides() { - if (!this.metadataClient) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "No metadata client, skipping env overrides", - }); - return; - } - - const overrides = await this.metadataClient.getEnvOverrides(); - - if (!overrides) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "No env overrides, skipping", - }); - return; - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Processing env overrides", - properties: { ...overrides }, - }); - - // Override the env with the new values - this.env.override(overrides); - - // Update services and clients with the new values - if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { - this.runHeartbeat.updateInterval(this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); - } - if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { - this.snapshotPoller.updateInterval(this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); - } - if ( - overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || - overrides.TRIGGER_SUPERVISOR_API_DOMAIN || - overrides.TRIGGER_SUPERVISOR_API_PORT - ) { - this.httpClient.updateApiUrl(this.workerApiUrl); - } - if (overrides.TRIGGER_RUNNER_ID) { - this.httpClient.updateRunnerId(this.runnerId); - } - } - - private activeRunExecution: Promise | null = null; + private lockedRunExecution: Promise | 
null = null; private async startAndExecuteRunAttempt({ runFriendlyId, @@ -746,16 +154,14 @@ export class ManagedRunController { dequeuedAt, podScheduledAt, isWarmStart, - skipLockCheckForImmediateRetry: skipLockCheck, }: { runFriendlyId: string; snapshotFriendlyId: string; dequeuedAt?: Date; podScheduledAt?: Date; isWarmStart?: boolean; - skipLockCheckForImmediateRetry?: boolean; }) { - if (!skipLockCheck && this.activeRunExecution) { + if (this.lockedRunExecution) { this.sendDebugLog({ runId: runFriendlyId, message: "startAndExecuteRunAttempt: already in progress", @@ -771,194 +177,45 @@ export class ManagedRunController { }); } - this.subscribeToRunNotifications({ - run: { friendlyId: runFriendlyId }, - snapshot: { friendlyId: snapshotFriendlyId }, - }); - - const attemptStartedAt = Date.now(); + this.subscribeToRunNotifications(runFriendlyId, snapshotFriendlyId); - const start = await this.httpClient.startRunAttempt(runFriendlyId, snapshotFriendlyId, { + // Create a new RunExecution instance for this attempt + const newExecution = new RunExecution({ + runFriendlyId, + snapshotFriendlyId, + dequeuedAt, + podScheduledAt, isWarmStart, + workerManifest: this.workerManifest, + env: this.env, + httpClient: this.httpClient, + logger: this.logger, }); - if (!start.success) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "Failed to start run", - properties: { error: start.error }, - }); - - this.sendDebugLog({ - runId: runFriendlyId, - message: "failed to start run attempt", - properties: { - error: start.error, - }, - }); - - this.waitForNextRun(); - return; + if (this.currentExecution?.taskRunEnv) { + newExecution.prepareForExecution(this.currentExecution.taskRunEnv); } - const attemptDuration = Date.now() - attemptStartedAt; - - const { run, snapshot, execution, envVars } = start.data; - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Started run", - properties: { snapshot: snapshot.friendlyId }, - }); - - this.enterRunPhase(run, snapshot); - - 
const metrics = [ - { - name: "start", - event: "create_attempt", - timestamp: attemptStartedAt, - duration: attemptDuration, - }, - ] - .concat( - dequeuedAt - ? [ - { - name: "start", - event: "dequeue", - timestamp: dequeuedAt.getTime(), - duration: 0, - }, - ] - : [] - ) - .concat( - podScheduledAt - ? [ - { - name: "start", - event: "pod_scheduled", - timestamp: podScheduledAt.getTime(), - duration: 0, - }, - ] - : [] - ) satisfies TaskRunExecutionMetrics; - - const taskRunEnv = { - ...this.env.gatherProcessEnv(), - ...envVars, - }; - - try { - return await this.executeRun({ - run, - snapshot, - envVars: taskRunEnv, - execution, - metrics, - isWarmStart, - }); - } catch (error) { - if (error instanceof SuspendedProcessError) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended and task run process was killed, waiting for next run", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Error while executing attempt", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Submitting attempt completion", - properties: { - snapshotId: snapshot.friendlyId, - updatedSnapshotId: this.snapshotFriendlyId, - }, - }); + this.currentExecution = newExecution; - const completion = { - id: execution.run.id, - ok: false, - retry: undefined, - error: TaskRunProcess.parseExecuteError(error), - } satisfies TaskRunFailedExecutionResult; - - const completionResult = await this.httpClient.completeRunAttempt( - run.friendlyId, - // FIXME: if the snapshot has changed since starting the run, this won't be accurate - // ..but we probably shouldn't fetch the latest snapshot either because we may be in an "unhealthy" state while the next runner has already taken over - this.snapshotFriendlyId ?? 
snapshot.friendlyId, - { completion } - ); - - if (!completionResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to submit completion after error", - properties: { error: completionResult.error }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit after error", - properties: { - error: completionResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Attempt completion submitted after error", - properties: { - attemptStatus: completionResult.data.result.attemptStatus, - runId: completionResult.data.result.run.friendlyId, - snapshotId: completionResult.data.result.snapshot.friendlyId, - }, - }); - - try { - await this.handleCompletionResult(completion, completionResult.data.result); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to handle completion result after error", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - } + await this.currentExecution.execute(); }; - this.activeRunExecution = execution(); + this.lockedRunExecution = execution(); - try { - await this.activeRunExecution; - } catch (error) { + const [error] = await tryCatch(this.lockedRunExecution); + + if (error) { this.sendDebugLog({ runId: runFriendlyId, - message: "startAndExecuteRunAttempt: unexpected error", - properties: { error: error instanceof Error ? 
error.message : String(error) }, + message: "Error during execution", + properties: { error: error.message }, }); - } finally { - this.activeRunExecution = null; } + + this.lockedRunExecution = null; + this.unsubscribeFromRunNotifications(runFriendlyId, snapshotFriendlyId); + this.waitForNextRun(); } private waitForNextRunLock = false; @@ -979,25 +236,21 @@ export class ManagedRunController { const previousRunId = this.runFriendlyId; try { - // If there's a run execution in progress, we need to kill it and wait for it to finish - if (this.activeRunExecution) { + // If there's a run execution in progress, we need to wait for it to finish + if (this.lockedRunExecution) { this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: waiting for existing run execution to finish", }); - await this.activeRunExecution; + // TODO: maybe kill the process? + await this.lockedRunExecution; } - // Just for good measure - await this.taskRunProcess?.kill("SIGKILL"); - this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: waiting for next run", }); - this.enterWarmStartPhase(); - if (!this.warmStartClient) { this.sendDebugLog({ runId: this.runFriendlyId, @@ -1006,22 +259,6 @@ export class ManagedRunController { this.exitProcess(this.successExitCode); } - if (this.taskRunProcess) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: eagerly recreating task run process with options", - }); - this.taskRunProcess = new TaskRunProcess({ - ...this.taskRunProcess.options, - isWarmStart: true, - }).initialize(); - } else { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: no existing task run process, so we can't eagerly recreate it", - }); - } - // Check the service is up and get additional warm start config const connect = await this.warmStartClient.connect(); @@ -1101,7 +338,6 @@ export class ManagedRunController { dequeuedAt: nextRun.dequeuedAt, isWarmStart: true, }).finally(() => {}); - return; } 
catch (error) { this.sendDebugLog({ runId: this.runFriendlyId, @@ -1120,9 +356,9 @@ export class ManagedRunController { message: "Exiting process", properties: { code }, }); - if (this.taskRunProcess?.isPreparedForNextRun) { - this.taskRunProcess.forceExit(); - } + + this.currentExecution?.exit(); + process.exit(code); } @@ -1169,9 +405,6 @@ export class ManagedRunController { return; } - // Reset the (fallback) snapshot poll interval so we don't do unnecessary work - this.snapshotPoller.resetCurrentInterval(); - const latestSnapshot = await this.httpClient.getRunExecutionData(this.runFriendlyId); if (!latestSnapshot.success) { @@ -1187,7 +420,25 @@ export class ManagedRunController { return; } - await this.handleSnapshotChange(latestSnapshot.data.execution); + const runExecutionData = latestSnapshot.data.execution; + + if (!this.currentExecution) { + this.sendDebugLog({ + runId: runExecutionData.run.friendlyId, + message: "handleSnapshotChange: no current execution", + }); + return; + } + + const [error] = await tryCatch(this.currentExecution.handleSnapshotChange(runExecutionData)); + + if (error) { + this.sendDebugLog({ + runId: runExecutionData.run.friendlyId, + message: "handleSnapshotChange: unexpected error", + properties: { error: error.message }, + }); + } }); socket.on("connect", () => { @@ -1197,9 +448,8 @@ export class ManagedRunController { }); // This should handle the case where we reconnect after being restored - if (this.state.phase === "RUN") { - const { run, snapshot } = this.state; - this.subscribeToRunNotifications({ run, snapshot }); + if (this.runFriendlyId && this.snapshotFriendlyId) { + this.subscribeToRunNotifications(this.runFriendlyId, this.snapshotFriendlyId); } }); @@ -1222,236 +472,6 @@ export class ManagedRunController { return socket; } - private async executeRun({ - run, - snapshot, - envVars, - execution, - metrics, - isWarmStart, - }: WorkloadRunAttemptStartResponseBody & { - metrics?: TaskRunExecutionMetrics; - isWarmStart?: 
boolean; - }) { - this.snapshotPoller.start(); - - if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { - this.taskRunProcess = new TaskRunProcess({ - workerManifest: this.workerManifest, - env: envVars, - serverWorker: { - id: "unmanaged", - contentHash: this.env.TRIGGER_CONTENT_HASH, - version: this.env.TRIGGER_DEPLOYMENT_VERSION, - engine: "V2", - }, - machine: execution.machine, - isWarmStart, - }).initialize(); - } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "executing task run process", - properties: { - attemptId: execution.attempt.id, - runId: execution.run.id, - }, - }); - - const completion = await this.taskRunProcess.execute( - { - payload: { - execution, - traceContext: execution.run.traceContext ?? {}, - metrics, - }, - messageId: run.friendlyId, - env: envVars, - }, - isWarmStart - ); - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Completed run", - properties: { completion: completion.ok }, - }); - - try { - // The execution has finished, so we can cleanup the task run process. Killing it should be safe. - await this.taskRunProcess.cleanup(true); - } catch (error) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to cleanup task run process, submitting completion anyway", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - } - - if (!this.runFriendlyId || !this.snapshotFriendlyId) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "executeRun: Missing run ID or snapshot ID after execution", - properties: { - runId: this.runFriendlyId, - snapshotId: this.snapshotFriendlyId, - }, - }); - - this.waitForNextRun(); - return; - } - - const completionResult = await this.httpClient.completeRunAttempt( - this.runFriendlyId, - this.snapshotFriendlyId, - { - completion, - } - ); - - if (!completionResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit", - properties: { - error: completionResult.error, - }, - }); - - this.sendDebugLog({ - runId: run.friendlyId, - message: "completion: failed to submit", - properties: { - error: completionResult.error, - }, - }); - - this.waitForNextRun(); - return; - } - - this.sendDebugLog({ - runId: run.friendlyId, - message: "Attempt completion submitted", - properties: { - attemptStatus: completionResult.data.result.attemptStatus, - runId: completionResult.data.result.run.friendlyId, - snapshotId: completionResult.data.result.snapshot.friendlyId, - }, - }); - - try { - await this.handleCompletionResult(completion, completionResult.data.result); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to handle completion result", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - } - - private async handleCompletionResult( - completion: TaskRunExecutionResult, - result: CompleteRunAttemptResult - ) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Handling completion result", - properties: { - completion: completion.ok, - attemptStatus: result.attemptStatus, - snapshotId: result.snapshot.friendlyId, - runId: result.run.friendlyId, - }, - }); - - const { attemptStatus, snapshot: completionSnapshot, run } = result; - - try { - this.updateRunPhase(run, completionSnapshot); - } catch (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to update run phase after completion", - properties: { error: error instanceof Error ? error.message : String(error) }, - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RUN_FINISHED") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run finished", - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RUN_PENDING_CANCEL") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run pending cancel", - }); - return; - } - - if (attemptStatus === "RETRY_QUEUED") { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Retry queued", - }); - - this.waitForNextRun(); - return; - } - - if (attemptStatus === "RETRY_IMMEDIATELY") { - if (completion.ok) { - throw new Error("Should retry but completion OK."); - } - - if (!completion.retry) { - throw new Error("Should retry but missing retry params."); - } - - await sleep(completion.retry.delay); - - if (!this.snapshotFriendlyId) { - throw new Error("Missing snapshot ID after retry"); - } - - this.startAndExecuteRunAttempt({ - runFriendlyId: run.friendlyId, - snapshotFriendlyId: this.snapshotFriendlyId, - skipLockCheckForImmediateRetry: true, - isWarmStart: true, - }).finally(() => {}); - return; - } - - assertExhaustive(attemptStatus); - } - - sendDebugLog(opts: SendDebugLogOptions) { - 
this.logger.sendDebugLog({ - ...opts, - properties: { - ...opts.properties, - warmStartCount: this.warmStartCount, - restoreCount: this.restoreCount, - }, - }); - } - async cancelAttempt(runId: string) { this.sendDebugLog({ runId, @@ -1459,7 +479,7 @@ export class ManagedRunController { properties: { runId }, }); - await this.taskRunProcess?.cancel(); + await this.currentExecution?.cancel(); } start() { @@ -1490,13 +510,18 @@ export class ManagedRunController { message: "Shutting down", }); - if (this.taskRunProcess) { - await this.taskRunProcess.cleanup(true); - } - - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); - + await this.currentExecution?.cancel(); this.socket.close(); } + + sendDebugLog(opts: SendDebugLogOptions) { + this.logger.sendDebugLog({ + ...opts, + properties: { + ...opts.properties, + runnerWarmStartCount: this.warmStartCount, + runnerRestoreCount: this.restoreCount, + }, + }); + } } diff --git a/packages/cli-v3/src/entryPoints/managed/execution.ts b/packages/cli-v3/src/entryPoints/managed/execution.ts new file mode 100644 index 0000000000..e4fb712374 --- /dev/null +++ b/packages/cli-v3/src/entryPoints/managed/execution.ts @@ -0,0 +1,1019 @@ +import { + type CompleteRunAttemptResult, + type RunExecutionData, + SuspendedProcessError, + type TaskRunExecutionMetrics, + type TaskRunExecutionResult, + type TaskRunFailedExecutionResult, + WorkerManifest, +} from "@trigger.dev/core/v3"; +import { type WorkloadRunAttemptStartResponseBody } from "@trigger.dev/core/v3/workers"; +import { TaskRunProcess } from "../../executions/taskRunProcess.js"; +import { RunLogger, SendDebugLogOptions } from "./logger.js"; +import { RunnerEnv } from "./env.js"; +import { WorkloadHttpClient } from "@trigger.dev/core/v3/workers"; +import { setTimeout as sleep } from "timers/promises"; +import { RunExecutionHeartbeat } from "./heartbeat.js"; +import { RunExecutionSnapshotPoller } from "./poller.js"; +import { assertExhaustive, tryCatch } from 
"@trigger.dev/core/utils"; +import { MetadataClient } from "./overrides.js"; + +class ExecutionExitError extends Error { + constructor(message: string) { + super(message); + this.name = "ExecutionExitError"; + } +} + +type RunExecutionOptions = { + runFriendlyId: string; + snapshotFriendlyId: string; + dequeuedAt?: Date; + podScheduledAt?: Date; + isWarmStart?: boolean; + workerManifest: WorkerManifest; + env: RunnerEnv; + httpClient: WorkloadHttpClient; + logger: RunLogger; +}; + +export class RunExecution { + private executionAbortController = new AbortController(); + private isExecutionActive = false; + + public readonly runFriendlyId: string; + + private currentSnapshotId: string; + private currentTaskRunEnv: Record | null = null; + + private readonly dequeuedAt?: Date; + private readonly podScheduledAt?: Date; + private isWarmStart: boolean; + private readonly workerManifest: WorkerManifest; + private readonly env: RunnerEnv; + private readonly httpClient: WorkloadHttpClient; + private readonly logger: RunLogger; + private restoreCount = 0; + + private taskRunProcess?: TaskRunProcess; + private readonly runHeartbeat: RunExecutionHeartbeat; + private readonly snapshotPoller: RunExecutionSnapshotPoller; + + constructor(opts: RunExecutionOptions) { + this.runFriendlyId = opts.runFriendlyId; + this.currentSnapshotId = opts.snapshotFriendlyId; + this.dequeuedAt = opts.dequeuedAt; + this.podScheduledAt = opts.podScheduledAt; + this.isWarmStart = opts.isWarmStart ?? 
false; + this.workerManifest = opts.workerManifest; + this.env = opts.env; + this.httpClient = opts.httpClient; + this.logger = opts.logger; + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Run execution created", + properties: { + runId: this.runFriendlyId, + snapshotId: this.currentSnapshotId, + isWarmStart: this.isWarmStart, + dequeuedAt: this.dequeuedAt?.toISOString(), + podScheduledAt: this.podScheduledAt?.toISOString(), + }, + }); + + this.runHeartbeat = new RunExecutionHeartbeat({ + runFriendlyId: this.runFriendlyId, + snapshotFriendlyId: this.currentSnapshotId, + httpClient: this.httpClient, + logger: this.logger, + heartbeatIntervalSeconds: this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS, + }); + + this.snapshotPoller = new RunExecutionSnapshotPoller({ + runFriendlyId: this.runFriendlyId, + snapshotFriendlyId: this.currentSnapshotId, + httpClient: this.httpClient, + logger: this.logger, + snapshotPollIntervalSeconds: this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS, + handleSnapshotChange: this.handleSnapshotChange.bind(this), + }); + } + + // TODO: we need to be able to exit the execution here if we need to + /** + * Called by the RunController when it receives a websocket notification + * or when the snapshot poller detects a change + */ + public async handleSnapshotChange(runData: RunExecutionData): Promise { + const { run, snapshot, completedWaitpoints } = runData; + + // Ensure the run ID matches + if (run.friendlyId !== this.runFriendlyId) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "handleSnapshotChange: mismatched run IDs", + properties: { + currentRunId: this.runFriendlyId, + newRunId: run.friendlyId, + currentSnapshotId: this.currentSnapshotId, + newSnapshotId: snapshot.friendlyId, + }, + }); + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: `enqueued snapshot change: ${snapshot.executionStatus}`, + properties: { + oldSnapshotId: this.currentSnapshotId, + newSnapshotId: 
snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + }, + }); + + this.snapshotChangeQueue.push(runData); + await this.processSnapshotChangeQueue(); + } + + private snapshotChangeQueue: RunExecutionData[] = []; + private snapshotChangeQueueLock = false; + + private async processSnapshotChangeQueue() { + if (this.snapshotChangeQueueLock) { + return; + } + + this.snapshotChangeQueueLock = true; + while (this.snapshotChangeQueue.length > 0) { + const runData = this.snapshotChangeQueue.shift(); + + if (!runData) { + continue; + } + + const [error] = await tryCatch(this.processSnapshotChange(runData)); + + if (error) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to process snapshot change", + properties: { + error: error.message, + currentSnapshotId: this.currentSnapshotId, + }, + }); + } + } + this.snapshotChangeQueueLock = false; + } + + private async processSnapshotChange(runData: RunExecutionData): Promise { + const { run, snapshot, completedWaitpoints } = runData; + + // Check if the incoming snapshot is newer than the current one + if (snapshot.friendlyId < this.currentSnapshotId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: received older snapshot, skipping", + properties: { + currentSnapshotId: this.currentSnapshotId, + receivedSnapshotId: snapshot.friendlyId, + }, + }); + return; + } + + if (snapshot.friendlyId === this.currentSnapshotId) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "handleSnapshotChange: snapshot not changed", + properties: { snapshot: snapshot.friendlyId }, + }); + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: `snapshot change: ${snapshot.executionStatus}`, + properties: { + oldSnapshotId: this.currentSnapshotId, + newSnapshotId: snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + }, + }); + + // Reset the snapshot poll interval so we don't do unnecessary work + 
this.snapshotPoller.resetCurrentInterval(); + + // Update internal state + this.currentSnapshotId = snapshot.friendlyId; + + // Update services + this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); + this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); + + switch (snapshot.executionStatus) { + case "PENDING_CANCEL": { + const [error] = await tryCatch(this.cancel()); + + if (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "snapshot change: failed to cancel attempt", + properties: { error: error.message }, + }); + } + + this.signalExecutionExit(); + return; + } + case "FINISHED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is finished", + }); + + // Pretend we've just suspended the run. This will kill the process without failing the run. + await this.taskRunProcess?.suspend(); + this.signalExecutionExit(); + return; + } + case "QUEUED_EXECUTING": + case "EXECUTING_WITH_WAITPOINTS": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is executing with waitpoints", + properties: { snapshot: snapshot.friendlyId }, + }); + + const [error] = await tryCatch(this.taskRunProcess?.cleanup(false)); + + if (error) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to cleanup task run process", + properties: { error: error.message }, + }); + } + + if (snapshot.friendlyId !== this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Snapshot changed after cleanup, abort", + properties: { + oldSnapshotId: snapshot.friendlyId, + newSnapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + await sleep(this.env.TRIGGER_PRE_SUSPEND_WAIT_MS); + + if (snapshot.friendlyId !== this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Snapshot changed after suspend threshold, abort", + properties: { + oldSnapshotId: snapshot.friendlyId, + newSnapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + if (!this.runFriendlyId || 
!this.snapshotFriendlyId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", + properties: { + runId: this.runFriendlyId, + snapshotId: this.snapshotFriendlyId, + }, + }); + return; + } + + const suspendResult = await this.httpClient.suspendRun( + this.runFriendlyId, + this.snapshotFriendlyId + ); + + if (!suspendResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Failed to suspend run, staying alive 🎶", + properties: { + error: suspendResult.error, + }, + }); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "checkpoint: suspend request failed", + properties: { + snapshotId: snapshot.friendlyId, + error: suspendResult.error, + }, + }); + + return; + } + + if (!suspendResult.data.ok) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "checkpoint: failed to suspend run", + properties: { + snapshotId: snapshot.friendlyId, + error: suspendResult.data.error, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Suspending, any day now 🚬", + properties: { ok: suspendResult.data.ok }, + }); + return; + } + case "SUSPENDED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run was suspended, kill the process", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + await this.taskRunProcess?.suspend(); + this.signalExecutionExit(); + return; + } + case "PENDING_EXECUTING": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is pending execution", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + if (completedWaitpoints.length === 0) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "No waitpoints to complete, nothing to do", + }); + return; + } + + // Track restore count + this.restoreCount++; + + // Short delay to give websocket time to reconnect + await sleep(100); + + // Process any env overrides + await 
this.processEnvOverrides(); + + // We need to let the platform know we're ready to continue + const continuationResult = await this.httpClient.continueRunExecution( + run.friendlyId, + snapshot.friendlyId + ); + + if (!continuationResult.success) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "failed to continue execution", + properties: { + error: continuationResult.error, + }, + }); + + // TODO: exit any active executions + return; + } + + return; + } + case "EXECUTING": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run is now executing", + properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + }); + + if (completedWaitpoints.length === 0) { + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Processing completed waitpoints", + properties: { completedWaitpoints: completedWaitpoints.length }, + }); + + if (!this.taskRunProcess) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "No task run process, ignoring completed waitpoints", + properties: { completedWaitpoints: completedWaitpoints.length }, + }); + return; + } + + for (const waitpoint of completedWaitpoints) { + this.taskRunProcess.waitpointCompleted(waitpoint); + } + + return; + } + case "RUN_CREATED": + case "QUEUED": { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Status change not handled", + properties: { status: snapshot.executionStatus }, + }); + return; + } + default: { + assertExhaustive(snapshot.executionStatus); + } + } + } + + /** + * Eagerly creates the TaskRunProcess for this execution. + * This is useful for warm starts where we want to prepare the process before we have the run details. 
+ */ + public prepareForExecution(taskRunEnv: Record): void { + if (this.taskRunProcess) { + return; + } + + this.taskRunProcess = new TaskRunProcess({ + workerManifest: this.workerManifest, + // FIXME: this is not enough, we need the env vars of the first run - think secret API keys etc + env: taskRunEnv, + serverWorker: { + id: "managed", + contentHash: this.env.TRIGGER_CONTENT_HASH, + version: this.env.TRIGGER_DEPLOYMENT_VERSION, + engine: "V2", + }, + machineResources: { + cpu: Number(this.env.TRIGGER_MACHINE_CPU), + memory: Number(this.env.TRIGGER_MACHINE_MEMORY), + }, + isWarmStart: this.isWarmStart, + }).initialize(); + } + + /** + * Executes the run. This will return when the execution is complete and we should warm start. + * When this returns, the child process will have been cleaned up. + */ + public async execute(): Promise { + // Reset abort controller for new execution + this.executionAbortController = new AbortController(); + + // Start the heartbeat and poller + this.runHeartbeat.start(); + this.snapshotPoller.start(); + + try { + const attemptStartedAt = Date.now(); + + // Check for abort before each major async operation + if (this.executionAbortController.signal.aborted) { + throw new ExecutionExitError("Execution aborted before start"); + } + + const start = await this.httpClient.startRunAttempt( + this.runFriendlyId, + this.currentSnapshotId, + { + isWarmStart: this.isWarmStart, + } + ); + + if (this.executionAbortController.signal.aborted) { + throw new ExecutionExitError("Execution aborted after start"); + } + + if (!start.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to start run", + properties: { error: start.error }, + }); + + return; + } + + // A snapshot was just created, so update the snapshot ID + this.currentSnapshotId = start.data.snapshot.friendlyId; + + const attemptDuration = Date.now() - attemptStartedAt; + + const { run, snapshot, execution, envVars } = start.data; + + this.sendDebugLog({ + 
runId: run.friendlyId, + message: "Started run", + properties: { snapshot: snapshot.friendlyId }, + }); + + const metrics = [ + { + name: "start", + event: "create_attempt", + timestamp: attemptStartedAt, + duration: attemptDuration, + }, + ] + .concat( + this.dequeuedAt + ? [ + { + name: "start", + event: "dequeue", + timestamp: this.dequeuedAt.getTime(), + duration: 0, + }, + ] + : [] + ) + .concat( + this.podScheduledAt + ? [ + { + name: "start", + event: "pod_scheduled", + timestamp: this.podScheduledAt.getTime(), + duration: 0, + }, + ] + : [] + ) satisfies TaskRunExecutionMetrics; + + this.currentTaskRunEnv = { + ...this.env.gatherProcessEnv(), + ...envVars, + }; + + const [error] = await tryCatch( + this.executeRun({ + run, + snapshot, + envVars: this.currentTaskRunEnv, + execution, + metrics, + }) + ); + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run execution completed", + properties: { error: error?.message }, + }); + + if (!error) { + // Stop the heartbeat and poller + this.runHeartbeat.stop(); + this.snapshotPoller.stop(); + } + + if (error) { + if (error instanceof SuspendedProcessError) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run was suspended", + properties: { + run: run.friendlyId, + snapshot: snapshot.friendlyId, + error: error.message, + }, + }); + + return; + } + + if (error instanceof ExecutionExitError) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "Run was interrupted", + properties: { + run: run.friendlyId, + snapshot: snapshot.friendlyId, + error: error.message, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: run.friendlyId, + message: "Error while executing attempt", + properties: { + error: error.message, + runId: run.friendlyId, + snapshotId: snapshot.friendlyId, + }, + }); + + const completion = { + id: execution.run.id, + ok: false, + retry: undefined, + error: TaskRunProcess.parseExecuteError(error), + } satisfies TaskRunFailedExecutionResult; + + 
this.snapshotPoller.stop(); + await this.complete(completion); + this.runHeartbeat.stop(); + } + } finally { + // Ensure we clean up even if aborted + this.runHeartbeat.stop(); + this.snapshotPoller.stop(); + } + } + + /** + * Cancels the current execution. + */ + public async cancel(): Promise { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "cancelling attempt", + properties: { runId: this.runFriendlyId }, + }); + + await this.taskRunProcess?.cancel(); + } + + public exit() { + if (this.taskRunProcess?.isPreparedForNextRun) { + this.taskRunProcess.forceExit(); + } + } + + private async executeRun({ + run, + snapshot, + envVars, + execution, + metrics, + }: WorkloadRunAttemptStartResponseBody & { + metrics?: TaskRunExecutionMetrics; + }) { + this.isExecutionActive = true; + try { + if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { + this.taskRunProcess = new TaskRunProcess({ + workerManifest: this.workerManifest, + env: envVars, + serverWorker: { + id: "managed", + contentHash: this.env.TRIGGER_CONTENT_HASH, + version: this.env.TRIGGER_DEPLOYMENT_VERSION, + engine: "V2", + }, + machineResources: execution.machine, + isWarmStart: this.isWarmStart, + }).initialize(); + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "executing task run process", + properties: { + attemptId: execution.attempt.id, + runId: execution.run.id, + }, + }); + + // Set up an abort handler that will cleanup the task run process + this.executionAbortController.signal.addEventListener("abort", async () => { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Execution aborted during task run, cleaning up process", + properties: { + attemptId: execution.attempt.id, + runId: execution.run.id, + }, + }); + + await this.taskRunProcess?.cleanup(true); + throw new ExecutionExitError("Execution aborted during task run"); + }); + + const completion = await this.taskRunProcess.execute( + { + payload: { + execution, + traceContext: 
execution.run.traceContext ?? {}, + metrics, + }, + messageId: run.friendlyId, + env: envVars, + }, + this.isWarmStart + ); + + // If we get here, the task completed normally + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Completed run", + properties: { completion: completion.ok }, + }); + + // The execution has finished, so we can cleanup the task run process. Killing it should be safe. + const [error] = await tryCatch(this.taskRunProcess.cleanup(true)); + + if (error) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to cleanup task run process, submitting completion anyway", + properties: { error: error.message }, + }); + } + + const [completionError] = await tryCatch(this.complete(completion)); + + if (completionError) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to complete run", + properties: { error: completionError.message }, + }); + } + } finally { + this.isExecutionActive = false; + } + } + + private async complete(completion: TaskRunExecutionResult): Promise { + const completionResult = await this.httpClient.completeRunAttempt( + this.runFriendlyId, + this.currentSnapshotId, + { completion } + ); + + if (!completionResult.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "completion: failed to submit", + properties: { + error: completionResult.error, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Attempt completion submitted", + properties: { + attemptStatus: completionResult.data.result.attemptStatus, + runId: completionResult.data.result.run.friendlyId, + snapshotId: completionResult.data.result.snapshot.friendlyId, + }, + }); + + await this.handleCompletionResult(completion, completionResult.data.result); + } + + private async handleCompletionResult( + completion: TaskRunExecutionResult, + result: CompleteRunAttemptResult + ) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Handling completion result", + 
properties: { + completion: completion.ok, + attemptStatus: result.attemptStatus, + snapshotId: result.snapshot.friendlyId, + runId: result.run.friendlyId, + }, + }); + + // Update our snapshot ID to match the completion result + // This ensures any subsequent API calls use the correct snapshot + this.currentSnapshotId = result.snapshot.friendlyId; + + const { attemptStatus } = result; + + if (attemptStatus === "RUN_FINISHED") { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Run finished", + }); + + return; + } + + if (attemptStatus === "RUN_PENDING_CANCEL") { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Run pending cancel", + }); + return; + } + + if (attemptStatus === "RETRY_QUEUED") { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Retry queued", + }); + + return; + } + + if (attemptStatus === "RETRY_IMMEDIATELY") { + if (completion.ok) { + throw new Error("Should retry but completion OK."); + } + + if (!completion.retry) { + throw new Error("Should retry but missing retry params."); + } + + await sleep(completion.retry.delay); + + // FIXME: this is wrong + // Create a new execution for the retry + const retryExecution = new RunExecution({ + ...this, + isWarmStart: true, + }); + + this.isWarmStart = true; + + await this.execute(); + return; + } + + assertExhaustive(attemptStatus); + } + + /** + * Suspends the current execution. 
+ */ + public async suspend(): Promise { + const suspendResult = await this.httpClient.suspendRun( + this.runFriendlyId, + this.currentSnapshotId + ); + + if (!suspendResult.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Failed to suspend run, staying alive 🎶", + properties: { + error: suspendResult.error, + }, + }); + + return; + } + + if (!suspendResult.data.ok) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "checkpoint: failed to suspend run", + properties: { + snapshotId: this.currentSnapshotId, + error: suspendResult.data.error, + }, + }); + + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Suspending, any day now 🚬", + properties: { ok: suspendResult.data.ok }, + }); + + await this.taskRunProcess?.suspend(); + } + + // TODO: remove if not needed + /** + * Resumes a suspended execution. + */ + public async resume(): Promise { + // Process any env overrides + await this.processEnvOverrides(); + + const continuationResult = await this.httpClient.continueRunExecution( + this.runFriendlyId, + this.currentSnapshotId + ); + + if (!continuationResult.success) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "failed to continue execution", + properties: { + error: continuationResult.error, + }, + }); + + return; + } + } + + /** + * Processes env overrides from the metadata service. Generally called when we're resuming from a suspended state. 
+ */ + private async processEnvOverrides() { + if (!this.env.TRIGGER_METADATA_URL) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "No metadata URL, skipping env overrides", + }); + return; + } + + const metadataClient = new MetadataClient(this.env.TRIGGER_METADATA_URL); + const overrides = await metadataClient.getEnvOverrides(); + + if (!overrides) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "No env overrides, skipping", + }); + return; + } + + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Processing env overrides", + properties: { ...overrides }, + }); + + // Override the env with the new values + this.env.override(overrides); + + // Update services with new values + if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { + this.runHeartbeat.updateInterval(this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); + } + if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { + this.snapshotPoller.updateInterval(this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); + } + if ( + overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || + overrides.TRIGGER_SUPERVISOR_API_DOMAIN || + overrides.TRIGGER_SUPERVISOR_API_PORT + ) { + this.httpClient.updateApiUrl(this.env.TRIGGER_SUPERVISOR_API_URL); + } + if (overrides.TRIGGER_RUNNER_ID) { + this.httpClient.updateRunnerId(this.env.TRIGGER_RUNNER_ID); + } + } + + sendDebugLog(opts: SendDebugLogOptions) { + this.logger.sendDebugLog({ + ...opts, + properties: { + ...opts.properties, + executionRestoreCount: this.restoreCount, + }, + }); + } + + // Add getter for current snapshot ID + public get snapshotFriendlyId(): string { + return this.currentSnapshotId; + } + + // Add getter for current task run env + public get taskRunEnv(): Record | null { + return this.currentTaskRunEnv; + } + + // Add getter for metrics + public get metrics() { + return { + restoreCount: this.restoreCount, + }; + } + + private signalExecutionExit() { + if (this.isExecutionActive) { + 
this.executionAbortController.abort(); + } + } +} diff --git a/packages/cli-v3/src/executions/taskRunProcess.ts b/packages/cli-v3/src/executions/taskRunProcess.ts index abe7c93389..96f68f0f42 100644 --- a/packages/cli-v3/src/executions/taskRunProcess.ts +++ b/packages/cli-v3/src/executions/taskRunProcess.ts @@ -1,7 +1,7 @@ import { CompletedWaitpoint, ExecutorToWorkerMessageCatalog, - MachinePreset, + MachinePresetResources, ServerBackgroundWorker, TaskRunErrorCodes, TaskRunExecution, @@ -50,7 +50,7 @@ export type TaskRunProcessOptions = { workerManifest: WorkerManifest; serverWorker: ServerBackgroundWorker; env: Record; - machine: MachinePreset; + machineResources: MachinePresetResources; isWarmStart?: boolean; cwd?: string; }; @@ -125,7 +125,7 @@ export class TaskRunProcess { } initialize() { - const { env: $env, workerManifest, cwd, machine } = this.options; + const { env: $env, workerManifest, cwd, machineResources: machine } = this.options; const maxOldSpaceSize = nodeOptionsWithMaxOldSpaceSize(undefined, machine); diff --git a/packages/core/src/utils.ts b/packages/core/src/utils.ts index 1fb82cc714..4a214a4536 100644 --- a/packages/core/src/utils.ts +++ b/packages/core/src/utils.ts @@ -2,7 +2,13 @@ export function assertExhaustive(x: never): never { throw new Error("Unexpected object: " + x); } -export async function tryCatch(promise: Promise): Promise<[null, T] | [E, null]> { +export async function tryCatch( + promise: Promise | undefined +): Promise<[null, T] | [E, null]> { + if (!promise) { + return [null, undefined as T]; + } + try { const data = await promise; return [null, data]; diff --git a/packages/core/src/v3/machines/index.ts b/packages/core/src/v3/machines/index.ts index 771f4345a4..e5dcb097dc 100644 --- a/packages/core/src/v3/machines/index.ts +++ b/packages/core/src/v3/machines/index.ts @@ -1,14 +1,17 @@ -import { MachinePreset } from "../schemas/common.js"; +import { MachinePresetResources } from "../schemas/common.js"; /** * Returns a value to 
be used for `--max-old-space-size`. It is in MiB. * Setting this correctly means V8 spends more times running Garbage Collection (GC). * It won't eliminate crashes but it will help avoid them. - * @param {MachinePreset} machine - The machine preset configuration containing memory specifications + * @param {MachinePresetResources} machine - The machine preset configuration containing memory specifications * @param {number} [overhead=0.2] - The memory overhead factor (0.2 = 20% reserved for system operations) * @returns {number} The calculated max old space size in MiB */ -export function maxOldSpaceSizeForMachine(machine: MachinePreset, overhead: number = 0.2): number { +export function maxOldSpaceSizeForMachine( + machine: MachinePresetResources, + overhead: number = 0.2 +): number { return Math.round(machine.memory * 1_024 * (1 - overhead)); } @@ -16,24 +19,27 @@ export function maxOldSpaceSizeForMachine(machine: MachinePreset, overhead: numb * Returns a flag to be used for `--max-old-space-size`. It is in MiB. * Setting this correctly means V8 spends more times running Garbage Collection (GC). * It won't eliminate crashes but it will help avoid them. - * @param {MachinePreset} machine - The machine preset configuration containing memory specifications + * @param {MachinePresetResources} machine - The machine preset configuration containing memory specifications * @param {number} [overhead=0.2] - The memory overhead factor (0.2 = 20% reserved for system operations) * @returns {string} The calculated max old space size flag */ -export function maxOldSpaceSizeFlag(machine: MachinePreset, overhead: number = 0.2): string { +export function maxOldSpaceSizeFlag( + machine: MachinePresetResources, + overhead: number = 0.2 +): string { return `--max-old-space-size=${maxOldSpaceSizeForMachine(machine, overhead)}`; } /** * Takes the existing NODE_OPTIONS value, removes any existing max-old-space-size flag, and adds a new one. 
* @param {string | undefined} existingOptions - The existing NODE_OPTIONS value - * @param {MachinePreset} machine - The machine preset configuration containing memory specifications + * @param {MachinePresetResources} machine - The machine preset configuration containing memory specifications * @param {number} [overhead=0.2] - The memory overhead factor (0.2 = 20% reserved for system operations) * @returns {string} The updated NODE_OPTIONS value with the new max-old-space-size flag */ export function nodeOptionsWithMaxOldSpaceSize( existingOptions: string | undefined, - machine: MachinePreset, + machine: MachinePresetResources, overhead: number = 0.2 ): string { let options = existingOptions ?? ""; diff --git a/packages/core/src/v3/schemas/common.ts b/packages/core/src/v3/schemas/common.ts index 030dd4dcee..a4d37409a2 100644 --- a/packages/core/src/v3/schemas/common.ts +++ b/packages/core/src/v3/schemas/common.ts @@ -123,6 +123,8 @@ export const MachinePreset = z.object({ export type MachinePreset = z.infer; +export type MachinePresetResources = Pick; + export const TaskRunBuiltInError = z.object({ type: z.literal("BUILT_IN_ERROR"), name: z.string(), From e03f4179de6a731c17253b68a6e00bcb7ac1736b Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:40:16 +0100 Subject: [PATCH 22/35] temp disable pre --- .changeset/pre.json | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 .changeset/pre.json diff --git a/.changeset/pre.json b/.changeset/pre.json deleted file mode 100644 index 3863ea8ac2..0000000000 --- a/.changeset/pre.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "mode": "pre", - "tag": "v4-beta", - "initialVersions": { - "coordinator": "0.0.1", - "docker-provider": "0.0.1", - "kubernetes-provider": "0.0.1", - "supervisor": "0.0.1", - "webapp": "1.0.0", - "@trigger.dev/build": "3.3.17", - "trigger.dev": "3.3.17", - "@trigger.dev/core": "3.3.17", - "@trigger.dev/python": 
"3.3.17", - "@trigger.dev/react-hooks": "3.3.17", - "@trigger.dev/redis-worker": "3.3.17", - "@trigger.dev/rsc": "3.3.17", - "@trigger.dev/sdk": "3.3.17" - }, - "changesets": [ - "breezy-turtles-talk", - "four-needles-add", - "honest-files-decide", - "late-chairs-ring", - "moody-squids-count", - "nice-colts-boil", - "polite-lies-fix", - "red-wasps-cover", - "shiny-kiwis-beam", - "smart-coins-hammer", - "weak-jobs-hide" - ] -} From 235372dd9c6d58fe51f8da6685c33bfb3aa38e93 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:39:11 +0100 Subject: [PATCH 23/35] add a controller log when starting an execution --- packages/cli-v3/src/entryPoints/managed/controller.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index d88df789e3..6b782e3a3f 100644 --- a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -169,6 +169,11 @@ export class ManagedRunController { return; } + this.sendDebugLog({ + runId: runFriendlyId, + message: "startAndExecuteRunAttempt: called", + }); + const execution = async () => { if (!this.socket) { this.sendDebugLog({ From 650533b4bda02c899c1c1b6669522d355be8a548 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 14 Apr 2025 19:28:16 +0100 Subject: [PATCH 24/35] refactor execution and squash some bugs --- .../src/entryPoints/managed/controller.ts | 25 +- .../src/entryPoints/managed/execution.ts | 529 ++++++++++-------- .../src/entryPoints/managed/heartbeat.ts | 21 +- .../cli-v3/src/entryPoints/managed/poller.ts | 23 +- 4 files changed, 331 insertions(+), 267 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index 6b782e3a3f..116b72c0dd 100644 --- 
a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -143,7 +143,7 @@ export class ManagedRunController { } private get snapshotFriendlyId() { - return this.currentExecution?.snapshotFriendlyId; + return this.currentExecution?.currentSnapshotFriendlyId; } private lockedRunExecution: Promise | null = null; @@ -182,28 +182,33 @@ export class ManagedRunController { }); } - this.subscribeToRunNotifications(runFriendlyId, snapshotFriendlyId); - // Create a new RunExecution instance for this attempt const newExecution = new RunExecution({ - runFriendlyId, - snapshotFriendlyId, - dequeuedAt, - podScheduledAt, - isWarmStart, workerManifest: this.workerManifest, env: this.env, httpClient: this.httpClient, logger: this.logger, }); + // If we have a current execution with task run env, prepare the new execution if (this.currentExecution?.taskRunEnv) { - newExecution.prepareForExecution(this.currentExecution.taskRunEnv); + newExecution.prepareForExecution({ + taskRunEnv: this.currentExecution.taskRunEnv, + }); } this.currentExecution = newExecution; - await this.currentExecution.execute(); + // Subscribe to run notifications + this.subscribeToRunNotifications(runFriendlyId, snapshotFriendlyId); + + await this.currentExecution.execute({ + runFriendlyId, + snapshotFriendlyId, + dequeuedAt, + podScheduledAt, + isWarmStart, + }); }; this.lockedRunExecution = execution(); diff --git a/packages/cli-v3/src/entryPoints/managed/execution.ts b/packages/cli-v3/src/entryPoints/managed/execution.ts index e4fb712374..a4de858b42 100644 --- a/packages/cli-v3/src/entryPoints/managed/execution.ts +++ b/packages/cli-v3/src/entryPoints/managed/execution.ts @@ -2,6 +2,7 @@ import { type CompleteRunAttemptResult, type RunExecutionData, SuspendedProcessError, + TaskRunExecution, type TaskRunExecutionMetrics, type TaskRunExecutionResult, type TaskRunFailedExecutionResult, @@ -18,36 +19,43 @@ import { RunExecutionSnapshotPoller } from 
"./poller.js"; import { assertExhaustive, tryCatch } from "@trigger.dev/core/utils"; import { MetadataClient } from "./overrides.js"; -class ExecutionExitError extends Error { +class ExecutionAbortError extends Error { constructor(message: string) { super(message); - this.name = "ExecutionExitError"; + this.name = "ExecutionAbortError"; } } type RunExecutionOptions = { + workerManifest: WorkerManifest; + env: RunnerEnv; + httpClient: WorkloadHttpClient; + logger: RunLogger; +}; + +type RunExecutionPrepareOptions = { + taskRunEnv: Record; +}; + +type RunExecutionRunOptions = { runFriendlyId: string; snapshotFriendlyId: string; dequeuedAt?: Date; podScheduledAt?: Date; isWarmStart?: boolean; - workerManifest: WorkerManifest; - env: RunnerEnv; - httpClient: WorkloadHttpClient; - logger: RunLogger; }; export class RunExecution { private executionAbortController = new AbortController(); private isExecutionActive = false; + private isPrepared = false; - public readonly runFriendlyId: string; - - private currentSnapshotId: string; + private _runFriendlyId?: string; + private currentSnapshotId?: string; private currentTaskRunEnv: Record | null = null; - private readonly dequeuedAt?: Date; - private readonly podScheduledAt?: Date; + private dequeuedAt?: Date; + private podScheduledAt?: Date; private isWarmStart: boolean; private readonly workerManifest: WorkerManifest; private readonly env: RunnerEnv; @@ -56,51 +64,52 @@ export class RunExecution { private restoreCount = 0; private taskRunProcess?: TaskRunProcess; - private readonly runHeartbeat: RunExecutionHeartbeat; - private readonly snapshotPoller: RunExecutionSnapshotPoller; + private runHeartbeat?: RunExecutionHeartbeat; + private snapshotPoller?: RunExecutionSnapshotPoller; constructor(opts: RunExecutionOptions) { - this.runFriendlyId = opts.runFriendlyId; - this.currentSnapshotId = opts.snapshotFriendlyId; - this.dequeuedAt = opts.dequeuedAt; - this.podScheduledAt = opts.podScheduledAt; - this.isWarmStart = 
opts.isWarmStart ?? false; this.workerManifest = opts.workerManifest; this.env = opts.env; this.httpClient = opts.httpClient; this.logger = opts.logger; + this.isWarmStart = false; + } - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Run execution created", - properties: { - runId: this.runFriendlyId, - snapshotId: this.currentSnapshotId, + /** + * Prepares the execution with task run environment variables. + * This should be called before executing, typically after a successful run to prepare for the next one. + */ + public prepareForExecution(opts: RunExecutionPrepareOptions): void { + this.currentTaskRunEnv = opts.taskRunEnv; + + if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { + this.taskRunProcess = new TaskRunProcess({ + workerManifest: this.workerManifest, + env: opts.taskRunEnv, + serverWorker: { + id: "managed", + contentHash: this.env.TRIGGER_CONTENT_HASH, + version: this.env.TRIGGER_DEPLOYMENT_VERSION, + engine: "V2", + }, + machineResources: { + cpu: Number(this.env.TRIGGER_MACHINE_CPU), + memory: Number(this.env.TRIGGER_MACHINE_MEMORY), + }, isWarmStart: this.isWarmStart, - dequeuedAt: this.dequeuedAt?.toISOString(), - podScheduledAt: this.podScheduledAt?.toISOString(), - }, - }); + }).initialize(); + } - this.runHeartbeat = new RunExecutionHeartbeat({ - runFriendlyId: this.runFriendlyId, - snapshotFriendlyId: this.currentSnapshotId, - httpClient: this.httpClient, - logger: this.logger, - heartbeatIntervalSeconds: this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS, - }); + this.isPrepared = true; + } - this.snapshotPoller = new RunExecutionSnapshotPoller({ - runFriendlyId: this.runFriendlyId, - snapshotFriendlyId: this.currentSnapshotId, - httpClient: this.httpClient, - logger: this.logger, - snapshotPollIntervalSeconds: this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS, - handleSnapshotChange: this.handleSnapshotChange.bind(this), - }); + /** + * Returns true if the execution has been prepared with task run env. 
+ */ + public isPreparedForExecution(): boolean { + return this.isPrepared; } - // TODO: we need to be able to exit the execution here if we need to /** * Called by the RunController when it receives a websocket notification * or when the snapshot poller detects a change @@ -108,13 +117,28 @@ export class RunExecution { public async handleSnapshotChange(runData: RunExecutionData): Promise { const { run, snapshot, completedWaitpoints } = runData; + // Ensure we have run details + if (!this.runFriendlyId || !this.currentSnapshotId) { + this.sendDebugLog({ + runId: run.friendlyId, + message: "handleSnapshotChange: missing run or snapshot ID", + properties: { + currentRunId: this.runFriendlyId, + newRunId: run.friendlyId, + currentSnapshotId: this.currentSnapshotId, + newSnapshotId: snapshot.friendlyId, + }, + }); + return; + } + // Ensure the run ID matches - if (run.friendlyId !== this.runFriendlyId) { + if (run.friendlyId !== this._runFriendlyId) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "handleSnapshotChange: mismatched run IDs", properties: { - currentRunId: this.runFriendlyId, + currentRunId: this._runFriendlyId, newRunId: run.friendlyId, currentSnapshotId: this.currentSnapshotId, newSnapshotId: snapshot.friendlyId, @@ -124,7 +148,7 @@ export class RunExecution { } this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: `enqueued snapshot change: ${snapshot.executionStatus}`, properties: { oldSnapshotId: this.currentSnapshotId, @@ -157,7 +181,7 @@ export class RunExecution { if (error) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Failed to process snapshot change", properties: { error: error.message, @@ -173,7 +197,7 @@ export class RunExecution { const { run, snapshot, completedWaitpoints } = runData; // Check if the incoming snapshot is newer than the current one - if (snapshot.friendlyId < this.currentSnapshotId) { + if 
(!this.currentSnapshotId || snapshot.friendlyId < this.currentSnapshotId) { this.sendDebugLog({ runId: run.friendlyId, message: "handleSnapshotChange: received older snapshot, skipping", @@ -187,7 +211,7 @@ export class RunExecution { if (snapshot.friendlyId === this.currentSnapshotId) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "handleSnapshotChange: snapshot not changed", properties: { snapshot: snapshot.friendlyId }, }); @@ -195,7 +219,7 @@ export class RunExecution { } this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: `snapshot change: ${snapshot.executionStatus}`, properties: { oldSnapshotId: this.currentSnapshotId, @@ -205,14 +229,14 @@ export class RunExecution { }); // Reset the snapshot poll interval so we don't do unnecessary work - this.snapshotPoller.resetCurrentInterval(); + this.snapshotPoller?.resetCurrentInterval(); // Update internal state this.currentSnapshotId = snapshot.friendlyId; // Update services - this.runHeartbeat.updateSnapshotId(snapshot.friendlyId); - this.snapshotPoller.updateSnapshotId(snapshot.friendlyId); + this.runHeartbeat?.updateSnapshotId(snapshot.friendlyId); + this.snapshotPoller?.updateSnapshotId(snapshot.friendlyId); switch (snapshot.executionStatus) { case "PENDING_CANCEL": { @@ -226,7 +250,7 @@ export class RunExecution { }); } - this.signalExecutionExit(); + this.abortExecution(); return; } case "FINISHED": { @@ -236,8 +260,7 @@ export class RunExecution { }); // Pretend we've just suspended the run. This will kill the process without failing the run. 
- await this.taskRunProcess?.suspend(); - this.signalExecutionExit(); + await this.suspend(); return; } case "QUEUED_EXECUTING": @@ -253,52 +276,58 @@ export class RunExecution { if (error) { this.sendDebugLog({ runId: run.friendlyId, - message: "Failed to cleanup task run process", + message: "Failed to cleanup task run process, carrying on", properties: { error: error.message }, }); } - if (snapshot.friendlyId !== this.snapshotFriendlyId) { + if (snapshot.friendlyId !== this.currentSnapshotId) { this.sendDebugLog({ runId: run.friendlyId, message: "Snapshot changed after cleanup, abort", properties: { oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, + newSnapshotId: this.currentSnapshotId, }, }); + + this.abortExecution(); return; } await sleep(this.env.TRIGGER_PRE_SUSPEND_WAIT_MS); - if (snapshot.friendlyId !== this.snapshotFriendlyId) { + if (snapshot.friendlyId !== this.currentSnapshotId) { this.sendDebugLog({ runId: run.friendlyId, message: "Snapshot changed after suspend threshold, abort", properties: { oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.snapshotFriendlyId, + newSnapshotId: this.currentSnapshotId, }, }); + + this.abortExecution(); return; } - if (!this.runFriendlyId || !this.snapshotFriendlyId) { + if (!this._runFriendlyId || !this.currentSnapshotId) { this.sendDebugLog({ runId: run.friendlyId, message: "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", properties: { - runId: this.runFriendlyId, - snapshotId: this.snapshotFriendlyId, + runId: this._runFriendlyId, + snapshotId: this.currentSnapshotId, }, }); + + this.abortExecution(); return; } const suspendResult = await this.httpClient.suspendRun( - this.runFriendlyId, - this.snapshotFriendlyId + this._runFriendlyId, + this.currentSnapshotId ); if (!suspendResult.success) { @@ -314,11 +343,12 @@ export class RunExecution { runId: run.friendlyId, message: "checkpoint: suspend request failed", properties: { - snapshotId: 
snapshot.friendlyId, + snapshotId: this.currentSnapshotId, error: suspendResult.error, }, }); + // This is fine, we'll wait for the next status change return; } @@ -327,11 +357,12 @@ export class RunExecution { runId: run.friendlyId, message: "checkpoint: failed to suspend run", properties: { - snapshotId: snapshot.friendlyId, + snapshotId: this.currentSnapshotId, error: suspendResult.data.error, }, }); + // This is fine, we'll wait for the next status change return; } @@ -340,24 +371,25 @@ export class RunExecution { message: "Suspending, any day now 🚬", properties: { ok: suspendResult.data.ok }, }); + + // Wait for next status change return; } case "SUSPENDED": { this.sendDebugLog({ runId: run.friendlyId, message: "Run was suspended, kill the process", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, }); - await this.taskRunProcess?.suspend(); - this.signalExecutionExit(); + await this.suspend(); return; } case "PENDING_EXECUTING": { this.sendDebugLog({ runId: run.friendlyId, message: "Run is pending execution", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, }); if (completedWaitpoints.length === 0) { @@ -368,41 +400,14 @@ export class RunExecution { return; } - // Track restore count - this.restoreCount++; - - // Short delay to give websocket time to reconnect - await sleep(100); - - // Process any env overrides - await this.processEnvOverrides(); - - // We need to let the platform know we're ready to continue - const continuationResult = await this.httpClient.continueRunExecution( - run.friendlyId, - snapshot.friendlyId - ); - - if (!continuationResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "failed to continue execution", - properties: { - error: continuationResult.error, - }, - }); - - // TODO: exit any active executions - return; - } - + await 
this.restore(); return; } case "EXECUTING": { this.sendDebugLog({ runId: run.friendlyId, message: "Run is now executing", - properties: { run: run.friendlyId, snapshot: snapshot.friendlyId }, + properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, }); if (completedWaitpoints.length === 0) { @@ -421,6 +426,8 @@ export class RunExecution { message: "No task run process, ignoring completed waitpoints", properties: { completedWaitpoints: completedWaitpoints.length }, }); + + this.abortExecution(); return; } @@ -434,9 +441,11 @@ export class RunExecution { case "QUEUED": { this.sendDebugLog({ runId: run.friendlyId, - message: "Status change not handled", + message: "Invalid status change", properties: { status: snapshot.executionStatus }, }); + + this.abortExecution(); return; } default: { @@ -445,42 +454,38 @@ export class RunExecution { } } - /** - * Eagerly creates the TaskRunProcess for this execution. - * This is useful for warm starts where we want to prepare the process before we have the run details. - */ - public prepareForExecution(taskRunEnv: Record): void { - if (this.taskRunProcess) { - return; - } - - this.taskRunProcess = new TaskRunProcess({ - workerManifest: this.workerManifest, - // FIXME: this is not enough, we need the env vars of the first run - think secret API keys etc - env: taskRunEnv, - serverWorker: { - id: "managed", - contentHash: this.env.TRIGGER_CONTENT_HASH, - version: this.env.TRIGGER_DEPLOYMENT_VERSION, - engine: "V2", - }, - machineResources: { - cpu: Number(this.env.TRIGGER_MACHINE_CPU), - memory: Number(this.env.TRIGGER_MACHINE_MEMORY), - }, - isWarmStart: this.isWarmStart, - }).initialize(); - } - /** * Executes the run. This will return when the execution is complete and we should warm start. * When this returns, the child process will have been cleaned up. 
*/ - public async execute(): Promise { + public async execute(runOpts: RunExecutionRunOptions): Promise { + this._runFriendlyId = runOpts.runFriendlyId; + this.currentSnapshotId = runOpts.snapshotFriendlyId; + this.dequeuedAt = runOpts.dequeuedAt; + this.podScheduledAt = runOpts.podScheduledAt; + this.isWarmStart = runOpts.isWarmStart ?? false; + // Reset abort controller for new execution this.executionAbortController = new AbortController(); - // Start the heartbeat and poller + // Create and start the heartbeat and poller services + this.runHeartbeat = new RunExecutionHeartbeat({ + runFriendlyId: this._runFriendlyId, + snapshotFriendlyId: this.currentSnapshotId, + httpClient: this.httpClient, + logger: this.logger, + heartbeatIntervalSeconds: this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS, + }); + + this.snapshotPoller = new RunExecutionSnapshotPoller({ + runFriendlyId: this._runFriendlyId, + snapshotFriendlyId: this.currentSnapshotId, + httpClient: this.httpClient, + logger: this.logger, + snapshotPollIntervalSeconds: this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS, + handleSnapshotChange: this.handleSnapshotChange.bind(this), + }); + this.runHeartbeat.start(); this.snapshotPoller.start(); @@ -489,11 +494,11 @@ export class RunExecution { // Check for abort before each major async operation if (this.executionAbortController.signal.aborted) { - throw new ExecutionExitError("Execution aborted before start"); + throw new ExecutionAbortError("Execution aborted before start"); } const start = await this.httpClient.startRunAttempt( - this.runFriendlyId, + this._runFriendlyId, this.currentSnapshotId, { isWarmStart: this.isWarmStart, @@ -501,12 +506,12 @@ export class RunExecution { ); if (this.executionAbortController.signal.aborted) { - throw new ExecutionExitError("Execution aborted after start"); + throw new ExecutionAbortError("Execution aborted after start"); } if (!start.success) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, 
message: "Failed to start run", properties: { error: start.error }, }); @@ -583,8 +588,8 @@ export class RunExecution { if (!error) { // Stop the heartbeat and poller - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); + this.runHeartbeat?.stop(); + this.snapshotPoller?.stop(); } if (error) { @@ -602,7 +607,7 @@ export class RunExecution { return; } - if (error instanceof ExecutionExitError) { + if (error instanceof ExecutionAbortError) { this.sendDebugLog({ runId: run.friendlyId, message: "Run was interrupted", @@ -633,14 +638,14 @@ export class RunExecution { error: TaskRunProcess.parseExecuteError(error), } satisfies TaskRunFailedExecutionResult; - this.snapshotPoller.stop(); - await this.complete(completion); - this.runHeartbeat.stop(); + this.snapshotPoller?.stop(); + await this.complete({ execution, completion }); + this.runHeartbeat?.stop(); } } finally { // Ensure we clean up even if aborted - this.runHeartbeat.stop(); - this.snapshotPoller.stop(); + this.runHeartbeat?.stop(); + this.snapshotPoller?.stop(); } } @@ -649,9 +654,9 @@ export class RunExecution { */ public async cancel(): Promise { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "cancelling attempt", - properties: { runId: this.runFriendlyId }, + properties: { runId: this._runFriendlyId }, }); await this.taskRunProcess?.cancel(); @@ -673,7 +678,9 @@ export class RunExecution { metrics?: TaskRunExecutionMetrics; }) { this.isExecutionActive = true; + try { + // To skip this step and eagerly create the task run process, run prepareForExecution first if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { this.taskRunProcess = new TaskRunProcess({ workerManifest: this.workerManifest, @@ -690,7 +697,7 @@ export class RunExecution { } this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "executing task run process", properties: { attemptId: execution.attempt.id, @@ -701,7 +708,7 @@ export class 
RunExecution { // Set up an abort handler that will cleanup the task run process this.executionAbortController.signal.addEventListener("abort", async () => { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Execution aborted during task run, cleaning up process", properties: { attemptId: execution.attempt.id, @@ -710,7 +717,7 @@ export class RunExecution { }); await this.taskRunProcess?.cleanup(true); - throw new ExecutionExitError("Execution aborted during task run"); + throw new ExecutionAbortError("Execution aborted during task run"); }); const completion = await this.taskRunProcess.execute( @@ -728,7 +735,7 @@ export class RunExecution { // If we get here, the task completed normally this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Completed run", properties: { completion: completion.ok }, }); @@ -738,17 +745,17 @@ export class RunExecution { if (error) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Failed to cleanup task run process, submitting completion anyway", properties: { error: error.message }, }); } - const [completionError] = await tryCatch(this.complete(completion)); + const [completionError] = await tryCatch(this.complete({ execution, completion })); if (completionError) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Failed to complete run", properties: { error: completionError.message }, }); @@ -758,16 +765,26 @@ export class RunExecution { } } - private async complete(completion: TaskRunExecutionResult): Promise { + private async complete({ + execution, + completion, + }: { + execution: TaskRunExecution; + completion: TaskRunExecutionResult; + }): Promise { + if (!this._runFriendlyId || !this.currentSnapshotId) { + throw new Error("Cannot complete run: missing run or snapshot ID"); + } + const completionResult = await this.httpClient.completeRunAttempt( - this.runFriendlyId, + 
this._runFriendlyId, this.currentSnapshotId, { completion } ); if (!completionResult.success) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "completion: failed to submit", properties: { error: completionResult.error, @@ -778,7 +795,7 @@ export class RunExecution { } this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Attempt completion submitted", properties: { attemptStatus: completionResult.data.result.attemptStatus, @@ -787,15 +804,21 @@ export class RunExecution { }, }); - await this.handleCompletionResult(completion, completionResult.data.result); + await this.handleCompletionResult({ + completion, + result: completionResult.data.result, + }); } - private async handleCompletionResult( - completion: TaskRunExecutionResult, - result: CompleteRunAttemptResult - ) { + private async handleCompletionResult({ + completion, + result, + }: { + completion: TaskRunExecutionResult; + result: CompleteRunAttemptResult; + }) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Handling completion result", properties: { completion: completion.ok, @@ -813,7 +836,7 @@ export class RunExecution { if (attemptStatus === "RUN_FINISHED") { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Run finished", }); @@ -822,7 +845,7 @@ export class RunExecution { if (attemptStatus === "RUN_PENDING_CANCEL") { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Run pending cancel", }); return; @@ -830,7 +853,7 @@ export class RunExecution { if (attemptStatus === "RETRY_QUEUED") { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Retry queued", }); @@ -846,90 +869,121 @@ export class RunExecution { throw new Error("Should retry but missing retry params."); } - await sleep(completion.retry.delay); - - // FIXME: this is wrong - // Create a new execution for the retry 
- const retryExecution = new RunExecution({ - ...this, - isWarmStart: true, - }); - - this.isWarmStart = true; - - await this.execute(); + await this.retry({ result, delay: completion.retry.delay }); return; } assertExhaustive(attemptStatus); } + private async retry({ result, delay }: { result: CompleteRunAttemptResult; delay: number }) { + await sleep(delay); + + await this.execute({ + runFriendlyId: result.run.id, + snapshotFriendlyId: result.snapshot.friendlyId, + isWarmStart: true, + }); + } + /** * Suspends the current execution. */ - public async suspend(): Promise { - const suspendResult = await this.httpClient.suspendRun( - this.runFriendlyId, - this.currentSnapshotId - ); + private async suspend(): Promise { + try { + if (!this._runFriendlyId || !this.currentSnapshotId) { + this.sendDebugLog({ + runId: this._runFriendlyId, + message: "Cannot suspend: missing run or snapshot ID", + }); - if (!suspendResult.success) { - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to suspend run, staying alive 🎶", - properties: { - error: suspendResult.error, - }, - }); + return; + } - return; - } + const suspendResult = await this.httpClient.suspendRun( + this._runFriendlyId, + this.currentSnapshotId + ); + + if (!suspendResult.success) { + this.sendDebugLog({ + runId: this._runFriendlyId, + message: "Failed to suspend run, staying alive 🎶", + properties: { + error: suspendResult.error, + }, + }); + + return; + } + + if (!suspendResult.data.ok) { + this.sendDebugLog({ + runId: this._runFriendlyId, + message: "checkpoint: failed to suspend run", + properties: { + snapshotId: this.currentSnapshotId, + error: suspendResult.data.error, + }, + }); + + return; + } - if (!suspendResult.data.ok) { this.sendDebugLog({ - runId: this.runFriendlyId, - message: "checkpoint: failed to suspend run", - properties: { - snapshotId: this.currentSnapshotId, - error: suspendResult.data.error, - }, + runId: this._runFriendlyId, + message: "Suspending, any day now 🚬", + 
properties: { ok: suspendResult.data.ok }, }); - return; + await this.taskRunProcess?.suspend(); + } finally { + this.abortExecution(); } - - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "Suspending, any day now 🚬", - properties: { ok: suspendResult.data.ok }, - }); - - await this.taskRunProcess?.suspend(); } - // TODO: remove if not needed /** - * Resumes a suspended execution. + * Restores a suspended execution from PENDING_EXECUTING */ - public async resume(): Promise { - // Process any env overrides - await this.processEnvOverrides(); + private async restore(): Promise { + try { + if (!this._runFriendlyId || !this.currentSnapshotId) { + throw new Error("Cannot restore: missing run or snapshot ID"); + } - const continuationResult = await this.httpClient.continueRunExecution( - this.runFriendlyId, - this.currentSnapshotId - ); + // Track restore count + this.restoreCount++; - if (!continuationResult.success) { + // Short delay to give websocket time to reconnect + await sleep(100); + + // Process any env overrides + await this.processEnvOverrides(); + + const continuationResult = await this.httpClient.continueRunExecution( + this._runFriendlyId, + this.currentSnapshotId + ); + + if (!continuationResult.success) { + this.sendDebugLog({ + runId: this._runFriendlyId, + message: "failed to restore execution", + properties: { + error: continuationResult.error, + }, + }); + + return; + } + } catch (error) { this.sendDebugLog({ - runId: this.runFriendlyId, - message: "failed to continue execution", - properties: { - error: continuationResult.error, - }, + runId: this._runFriendlyId, + message: "failed to restore execution", + properties: { error: error instanceof Error ? 
error.message : String(error) }, }); - - return; + } finally { + this.abortExecution(); } } @@ -939,7 +993,7 @@ export class RunExecution { private async processEnvOverrides() { if (!this.env.TRIGGER_METADATA_URL) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "No metadata URL, skipping env overrides", }); return; @@ -950,14 +1004,14 @@ export class RunExecution { if (!overrides) { this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "No env overrides, skipping", }); return; } this.sendDebugLog({ - runId: this.runFriendlyId, + runId: this._runFriendlyId, message: "Processing env overrides", properties: { ...overrides }, }); @@ -967,10 +1021,10 @@ export class RunExecution { // Update services with new values if (overrides.TRIGGER_HEARTBEAT_INTERVAL_SECONDS) { - this.runHeartbeat.updateInterval(this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); + this.runHeartbeat?.updateInterval(this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS * 1000); } if (overrides.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS) { - this.snapshotPoller.updateInterval(this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); + this.snapshotPoller?.updateInterval(this.env.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS * 1000); } if ( overrides.TRIGGER_SUPERVISOR_API_PROTOCOL || @@ -994,24 +1048,25 @@ export class RunExecution { }); } - // Add getter for current snapshot ID - public get snapshotFriendlyId(): string { + public get runFriendlyId(): string | undefined { + return this._runFriendlyId; + } + + public get currentSnapshotFriendlyId(): string | undefined { return this.currentSnapshotId; } - // Add getter for current task run env public get taskRunEnv(): Record | null { return this.currentTaskRunEnv; } - // Add getter for metrics public get metrics() { return { restoreCount: this.restoreCount, }; } - private signalExecutionExit() { + private abortExecution() { if (this.isExecutionActive) { this.executionAbortController.abort(); } diff 
--git a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts index 3863a0828f..799408e13b 100644 --- a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts +++ b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts @@ -11,19 +11,20 @@ export type RunExecutionHeartbeatOptions = { }; export class RunExecutionHeartbeat { - private readonly logger: RunLogger; - private readonly heartbeat: HeartbeatService; - private readonly httpClient: WorkloadHttpClient; - private readonly runFriendlyId: string; private snapshotFriendlyId: string; - constructor(opts: RunExecutionHeartbeatOptions) { - this.logger = opts.logger; - this.httpClient = opts.httpClient; + private readonly httpClient: WorkloadHttpClient; + private readonly logger: RunLogger; + private readonly heartbeatIntervalMs: number; + private readonly heartbeat: HeartbeatService; + constructor(opts: RunExecutionHeartbeatOptions) { this.runFriendlyId = opts.runFriendlyId; this.snapshotFriendlyId = opts.snapshotFriendlyId; + this.httpClient = opts.httpClient; + this.logger = opts.logger; + this.heartbeatIntervalMs = opts.heartbeatIntervalSeconds * 1000; this.logger.sendDebugLog({ runId: this.runFriendlyId, @@ -57,7 +58,7 @@ export class RunExecutionHeartbeat { }); } }, - intervalMs: opts.heartbeatIntervalSeconds * 1000, + intervalMs: this.heartbeatIntervalMs, leadingEdge: false, onError: async (error) => { this.logger.sendDebugLog({ @@ -73,8 +74,8 @@ export class RunExecutionHeartbeat { this.heartbeat.resetCurrentInterval(); } - updateSnapshotId(snapshotId: string) { - this.snapshotFriendlyId = snapshotId; + updateSnapshotId(snapshotFriendlyId: string) { + this.snapshotFriendlyId = snapshotFriendlyId; } updateInterval(intervalMs: number) { diff --git a/packages/cli-v3/src/entryPoints/managed/poller.ts b/packages/cli-v3/src/entryPoints/managed/poller.ts index 65d225391a..bb85886233 100644 --- a/packages/cli-v3/src/entryPoints/managed/poller.ts +++ 
b/packages/cli-v3/src/entryPoints/managed/poller.ts @@ -12,19 +12,22 @@ export type RunExecutionSnapshotPollerOptions = { }; export class RunExecutionSnapshotPoller { + private runFriendlyId: string; + private snapshotFriendlyId: string; + + private readonly httpClient: WorkloadHttpClient; private readonly logger: RunLogger; + private readonly snapshotPollIntervalMs: number; + private readonly handleSnapshotChange: (runData: RunExecutionData) => Promise; private readonly poller: HeartbeatService; - private readonly httpClient: WorkloadHttpClient; - - private readonly runFriendlyId: string; - private snapshotFriendlyId: string; constructor(opts: RunExecutionSnapshotPollerOptions) { - this.logger = opts.logger; - this.httpClient = opts.httpClient; - this.runFriendlyId = opts.runFriendlyId; this.snapshotFriendlyId = opts.snapshotFriendlyId; + this.httpClient = opts.httpClient; + this.logger = opts.logger; + this.snapshotPollIntervalMs = opts.snapshotPollIntervalSeconds * 1000; + this.handleSnapshotChange = opts.handleSnapshotChange; this.logger.sendDebugLog({ runId: this.runFriendlyId, @@ -82,7 +85,7 @@ export class RunExecutionSnapshotPoller { return; } - await opts.handleSnapshotChange(response.data.execution); + await this.handleSnapshotChange(response.data.execution); }, intervalMs: opts.snapshotPollIntervalSeconds * 1000, leadingEdge: false, @@ -100,8 +103,8 @@ export class RunExecutionSnapshotPoller { this.poller.resetCurrentInterval(); } - updateSnapshotId(snapshotId: string) { - this.snapshotFriendlyId = snapshotId; + updateSnapshotId(snapshotFriendlyId: string) { + this.snapshotFriendlyId = snapshotFriendlyId; } updateInterval(intervalMs: number) { From fdf14e04fe7c3dcb14196e12b95d8cb7e6371866 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 12:28:36 +0100 Subject: [PATCH 25/35] cleanup completed docker containers by default --- apps/supervisor/src/env.ts | 1 + apps/supervisor/src/index.ts | 1 + 
apps/supervisor/src/workloadManager/docker.ts | 4 ++++ apps/supervisor/src/workloadManager/types.ts | 1 + 4 files changed, 7 insertions(+) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index d7caccbd80..72498075cd 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -27,6 +27,7 @@ const Env = z.object({ RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_DOCKER_AUTOREMOVE: BoolEnv.default(true), // Dequeue settings (provider mode) TRIGGER_DEQUEUE_ENABLED: BoolEnv.default("true"), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 350b81e3ff..811ee8746d 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -66,6 +66,7 @@ class ManagedSupervisor { heartbeatIntervalSeconds: env.RUNNER_HEARTBEAT_INTERVAL_SECONDS, snapshotPollIntervalSeconds: env.RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS, additionalEnvVars: env.RUNNER_ADDITIONAL_ENV_VARS, + dockerAutoremove: env.RUNNER_DOCKER_AUTOREMOVE, } satisfies WorkloadManagerOptions; if (this.isKubernetes) { diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts index 9e4ba29594..171e2c0971 100644 --- a/apps/supervisor/src/workloadManager/docker.ts +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -43,6 +43,10 @@ export class DockerWorkloadManager implements WorkloadManager { `--name=${runnerId}`, ]; + if (this.opts.dockerAutoremove) { + runArgs.push("--rm"); + } + if (this.opts.warmStartUrl) { runArgs.push(`--env=TRIGGER_WARM_START_URL=${this.opts.warmStartUrl}`); } diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index a5d7ed3c90..b3cd418f1e 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -10,6 +10,7 @@ 
export interface WorkloadManagerOptions { heartbeatIntervalSeconds?: number; snapshotPollIntervalSeconds?: number; additionalEnvVars?: Record; + dockerAutoremove?: boolean; } export interface WorkloadManager { From 1643743d1ba44e39e87f6debb1e4cddff9096808 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 12:29:16 +0100 Subject: [PATCH 26/35] execution fixes and logging improvements --- apps/supervisor/src/workloadServer/index.ts | 2 +- .../src/entryPoints/managed/controller.ts | 157 +-- .../src/entryPoints/managed/execution.ts | 1028 +++++++---------- 3 files changed, 526 insertions(+), 661 deletions(-) diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index ed90c450c3..2dcf329736 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -452,7 +452,7 @@ export class WorkloadServer extends EventEmitter { logger.debug("runConnected", { ...getSocketMetadata() }); // If there's already a run ID set, we should "disconnect" it from this socket - if (socket.data.runFriendlyId) { + if (socket.data.runFriendlyId && socket.data.runFriendlyId !== friendlyId) { logger.debug("runConnected: disconnecting existing run", { ...getSocketMetadata(), newRunId: friendlyId, diff --git a/packages/cli-v3/src/entryPoints/managed/controller.ts b/packages/cli-v3/src/entryPoints/managed/controller.ts index 116b72c0dd..35fec13932 100644 --- a/packages/cli-v3/src/entryPoints/managed/controller.ts +++ b/packages/cli-v3/src/entryPoints/managed/controller.ts @@ -148,60 +148,49 @@ export class ManagedRunController { private lockedRunExecution: Promise | null = null; - private async startAndExecuteRunAttempt({ + private async startRunExecution({ runFriendlyId, snapshotFriendlyId, dequeuedAt, podScheduledAt, isWarmStart, + previousRunId, }: { runFriendlyId: string; snapshotFriendlyId: string; dequeuedAt?: Date; podScheduledAt?: Date; 
isWarmStart?: boolean; + previousRunId?: string; }) { + this.sendDebugLog({ + runId: runFriendlyId, + message: "startAndExecuteRunAttempt()", + properties: { previousRunId }, + }); + if (this.lockedRunExecution) { this.sendDebugLog({ runId: runFriendlyId, - message: "startAndExecuteRunAttempt: already in progress", + message: "startAndExecuteRunAttempt: execution already locked", }); return; } - this.sendDebugLog({ - runId: runFriendlyId, - message: "startAndExecuteRunAttempt: called", - }); - const execution = async () => { - if (!this.socket) { - this.sendDebugLog({ - runId: runFriendlyId, - message: "Starting run without socket connection", - }); - } - - // Create a new RunExecution instance for this attempt - const newExecution = new RunExecution({ - workerManifest: this.workerManifest, - env: this.env, - httpClient: this.httpClient, - logger: this.logger, - }); - - // If we have a current execution with task run env, prepare the new execution - if (this.currentExecution?.taskRunEnv) { - newExecution.prepareForExecution({ - taskRunEnv: this.currentExecution.taskRunEnv, + if (!this.currentExecution || !this.currentExecution.isPreparedForNextRun) { + this.currentExecution = new RunExecution({ + workerManifest: this.workerManifest, + env: this.env, + httpClient: this.httpClient, + logger: this.logger, }); } - this.currentExecution = newExecution; - // Subscribe to run notifications this.subscribeToRunNotifications(runFriendlyId, snapshotFriendlyId); + // We're prepared for the next run so we can start executing await this.currentExecution.execute({ runFriendlyId, snapshotFriendlyId, @@ -223,6 +212,12 @@ export class ManagedRunController { }); } + const metrics = this.currentExecution?.metrics; + + if (metrics?.restoreCount) { + this.restoreCount += metrics.restoreCount; + } + this.lockedRunExecution = null; this.unsubscribeFromRunNotifications(runFriendlyId, snapshotFriendlyId); this.waitForNextRun(); @@ -230,43 +225,61 @@ export class ManagedRunController { 
private waitForNextRunLock = false; - /** This will kill the child process before spinning up a new one. It will never throw, - * but may exit the process on any errors or when no runs are available after the - * configured duration. */ + /** + * This will eagerly create a new run execution. It will never throw, but may exit + * the process on any errors or when no runs are available after the configured duration. + */ private async waitForNextRun() { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun()", + }); + if (this.waitForNextRunLock) { this.sendDebugLog({ runId: this.runFriendlyId, - message: "waitForNextRun: already in progress", + message: "waitForNextRun: already in progress, skipping", + }); + return; + } + + if (this.lockedRunExecution) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "waitForNextRun: execution locked, skipping", }); return; } this.waitForNextRunLock = true; - const previousRunId = this.runFriendlyId; try { - // If there's a run execution in progress, we need to wait for it to finish - if (this.lockedRunExecution) { + if (!this.warmStartClient) { this.sendDebugLog({ runId: this.runFriendlyId, - message: "waitForNextRun: waiting for existing run execution to finish", + message: "waitForNextRun: warm starts disabled, shutting down", }); - // TODO: maybe kill the process? 
- await this.lockedRunExecution; + this.exitProcess(this.successExitCode); } - this.sendDebugLog({ - runId: this.runFriendlyId, - message: "waitForNextRun: waiting for next run", - }); + const previousRunId = this.runFriendlyId; - if (!this.warmStartClient) { + if (this.currentExecution?.taskRunEnv) { this.sendDebugLog({ runId: this.runFriendlyId, - message: "waitForNextRun: warm starts disabled, shutting down", + message: "waitForNextRun: eagerly recreating task run process", + }); + + const previousTaskRunEnv = this.currentExecution.taskRunEnv; + + this.currentExecution = new RunExecution({ + workerManifest: this.workerManifest, + env: this.env, + httpClient: this.httpClient, + logger: this.logger, + }).prepareForExecution({ + taskRunEnv: previousTaskRunEnv, }); - this.exitProcess(this.successExitCode); } // Check the service is up and get additional warm start config @@ -288,34 +301,22 @@ export class ManagedRunController { connect.data.connectionTimeoutMs ?? this.env.TRIGGER_WARM_START_CONNECTION_TIMEOUT_MS; const keepaliveMs = connect.data.keepaliveMs ?? 
this.env.TRIGGER_WARM_START_KEEPALIVE_MS; + const warmStartConfig = { + connectionTimeoutMs, + keepaliveMs, + }; + this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: connected to warm start service", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, + properties: warmStartConfig, }); - if (previousRunId) { - this.sendDebugLog({ - runId: previousRunId, - message: "warm start: received config", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, - }); - } - if (!connectionTimeoutMs || !keepaliveMs) { this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: warm starts disabled after connect", - properties: { - connectionTimeoutMs, - keepaliveMs, - }, + properties: warmStartConfig, }); this.exitProcess(this.successExitCode); } @@ -330,6 +331,7 @@ export class ManagedRunController { this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: warm start failed, shutting down", + properties: warmStartConfig, }); this.exitProcess(this.successExitCode); } @@ -339,14 +341,18 @@ export class ManagedRunController { this.sendDebugLog({ runId: this.runFriendlyId, message: "waitForNextRun: got next run", - properties: { nextRun: nextRun.run.friendlyId }, + properties: { + ...warmStartConfig, + nextRunId: nextRun.run.friendlyId, + }, }); - this.startAndExecuteRunAttempt({ + this.startRunExecution({ runFriendlyId: nextRun.run.friendlyId, snapshotFriendlyId: nextRun.snapshot.friendlyId, dequeuedAt: nextRun.dequeuedAt, isWarmStart: true, + previousRunId, }).finally(() => {}); } catch (error) { this.sendDebugLog({ @@ -454,11 +460,19 @@ export class ManagedRunController { socket.on("connect", () => { this.sendDebugLog({ runId: this.runFriendlyId, - message: "Connected to supervisor", + message: "Socket connected to supervisor", }); // This should handle the case where we reconnect after being restored - if (this.runFriendlyId && this.snapshotFriendlyId) { + if ( + this.runFriendlyId && + this.snapshotFriendlyId && + 
this.runFriendlyId !== this.env.TRIGGER_RUN_ID + ) { + this.sendDebugLog({ + runId: this.runFriendlyId, + message: "Subscribing to notifications for in-progress run", + }); this.subscribeToRunNotifications(this.runFriendlyId, this.snapshotFriendlyId); } }); @@ -466,7 +480,7 @@ export class ManagedRunController { socket.on("connect_error", (error) => { this.sendDebugLog({ runId: this.runFriendlyId, - message: "Connection error", + message: "Socket connection error", properties: { error: error instanceof Error ? error.message : String(error) }, }); }); @@ -474,7 +488,7 @@ export class ManagedRunController { socket.on("disconnect", (reason, description) => { this.sendDebugLog({ runId: this.runFriendlyId, - message: "Disconnected from supervisor", + message: "Socket disconnected from supervisor", properties: { reason, description: description?.toString() }, }); }); @@ -500,7 +514,7 @@ export class ManagedRunController { // If we have run and snapshot IDs, we can start an attempt immediately if (this.env.TRIGGER_RUN_ID && this.env.TRIGGER_SNAPSHOT_ID) { - this.startAndExecuteRunAttempt({ + this.startRunExecution({ runFriendlyId: this.env.TRIGGER_RUN_ID, snapshotFriendlyId: this.env.TRIGGER_SNAPSHOT_ID, dequeuedAt: this.env.TRIGGER_DEQUEUED_AT_MS, @@ -527,6 +541,7 @@ export class ManagedRunController { sendDebugLog(opts: SendDebugLogOptions) { this.logger.sendDebugLog({ ...opts, + message: `[controller] ${opts.message}`, properties: { ...opts.properties, runnerWarmStartCount: this.warmStartCount, diff --git a/packages/cli-v3/src/entryPoints/managed/execution.ts b/packages/cli-v3/src/entryPoints/managed/execution.ts index a4de858b42..70ad6b6c39 100644 --- a/packages/cli-v3/src/entryPoints/managed/execution.ts +++ b/packages/cli-v3/src/entryPoints/managed/execution.ts @@ -2,9 +2,9 @@ import { type CompleteRunAttemptResult, type RunExecutionData, SuspendedProcessError, - TaskRunExecution, type TaskRunExecutionMetrics, type TaskRunExecutionResult, + TaskRunExecutionRetry, 
type TaskRunFailedExecutionResult, WorkerManifest, } from "@trigger.dev/core/v3"; @@ -46,22 +46,19 @@ type RunExecutionRunOptions = { }; export class RunExecution { - private executionAbortController = new AbortController(); - private isExecutionActive = false; - private isPrepared = false; + private executionAbortController: AbortController; private _runFriendlyId?: string; private currentSnapshotId?: string; - private currentTaskRunEnv: Record | null = null; + private currentTaskRunEnv?: Record; private dequeuedAt?: Date; private podScheduledAt?: Date; - private isWarmStart: boolean; private readonly workerManifest: WorkerManifest; private readonly env: RunnerEnv; private readonly httpClient: WorkloadHttpClient; private readonly logger: RunLogger; - private restoreCount = 0; + private restoreCount: number; private taskRunProcess?: TaskRunProcess; private runHeartbeat?: RunExecutionHeartbeat; @@ -72,42 +69,64 @@ export class RunExecution { this.env = opts.env; this.httpClient = opts.httpClient; this.logger = opts.logger; - this.isWarmStart = false; + + this.restoreCount = 0; + this.executionAbortController = new AbortController(); } /** * Prepares the execution with task run environment variables. * This should be called before executing, typically after a successful run to prepare for the next one. 
*/ - public prepareForExecution(opts: RunExecutionPrepareOptions): void { - this.currentTaskRunEnv = opts.taskRunEnv; - - if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { - this.taskRunProcess = new TaskRunProcess({ - workerManifest: this.workerManifest, - env: opts.taskRunEnv, - serverWorker: { - id: "managed", - contentHash: this.env.TRIGGER_CONTENT_HASH, - version: this.env.TRIGGER_DEPLOYMENT_VERSION, - engine: "V2", - }, - machineResources: { - cpu: Number(this.env.TRIGGER_MACHINE_CPU), - memory: Number(this.env.TRIGGER_MACHINE_MEMORY), - }, - isWarmStart: this.isWarmStart, - }).initialize(); + public prepareForExecution(opts: RunExecutionPrepareOptions): this { + if (this.taskRunProcess) { + throw new Error("prepareForExecution called after process was already created"); } - this.isPrepared = true; + if (this.isPreparedForNextRun) { + throw new Error("prepareForExecution called after execution was already prepared"); + } + + this.taskRunProcess = this.createTaskRunProcess({ + envVars: opts.taskRunEnv, + isWarmStart: true, + }); + + return this; + } + + private createTaskRunProcess({ + envVars, + isWarmStart, + }: { + envVars: Record; + isWarmStart?: boolean; + }) { + return new TaskRunProcess({ + workerManifest: this.workerManifest, + env: { + ...envVars, + ...this.env.gatherProcessEnv(), + }, + serverWorker: { + id: "managed", + contentHash: this.env.TRIGGER_CONTENT_HASH, + version: this.env.TRIGGER_DEPLOYMENT_VERSION, + engine: "V2", + }, + machineResources: { + cpu: Number(this.env.TRIGGER_MACHINE_CPU), + memory: Number(this.env.TRIGGER_MACHINE_MEMORY), + }, + isWarmStart, + }).initialize(); } /** * Returns true if the execution has been prepared with task run env. 
*/ - public isPreparedForExecution(): boolean { - return this.isPrepared; + get isPreparedForNextRun(): boolean { + return !!this.taskRunProcess?.isPreparedForNextRun; } /** @@ -117,45 +136,35 @@ export class RunExecution { public async handleSnapshotChange(runData: RunExecutionData): Promise { const { run, snapshot, completedWaitpoints } = runData; + const snapshotMetadata = { + incomingRunId: run.friendlyId, + incomingSnapshotId: snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + }; + // Ensure we have run details if (!this.runFriendlyId || !this.currentSnapshotId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: missing run or snapshot ID", - properties: { - currentRunId: this.runFriendlyId, - newRunId: run.friendlyId, - currentSnapshotId: this.currentSnapshotId, - newSnapshotId: snapshot.friendlyId, - }, - }); + this.sendDebugLog( + "handleSnapshotChange: missing run or snapshot ID", + snapshotMetadata, + run.friendlyId + ); return; } // Ensure the run ID matches - if (run.friendlyId !== this._runFriendlyId) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "handleSnapshotChange: mismatched run IDs", - properties: { - currentRunId: this._runFriendlyId, - newRunId: run.friendlyId, - currentSnapshotId: this.currentSnapshotId, - newSnapshotId: snapshot.friendlyId, - }, - }); + if (run.friendlyId !== this.runFriendlyId) { + // Send debug log to both runs + this.sendDebugLog("handleSnapshotChange: mismatched run IDs", snapshotMetadata); + this.sendDebugLog( + "handleSnapshotChange: mismatched run IDs", + snapshotMetadata, + run.friendlyId + ); return; } - this.sendDebugLog({ - runId: this._runFriendlyId, - message: `enqueued snapshot change: ${snapshot.executionStatus}`, - properties: { - oldSnapshotId: this.currentSnapshotId, - newSnapshotId: snapshot.friendlyId, - completedWaitpoints: completedWaitpoints.length, - }, - }); + this.sendDebugLog(`enqueued snapshot change: 
${snapshot.executionStatus}`, snapshotMetadata); this.snapshotChangeQueue.push(runData); await this.processSnapshotChangeQueue(); @@ -180,14 +189,7 @@ export class RunExecution { const [error] = await tryCatch(this.processSnapshotChange(runData)); if (error) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Failed to process snapshot change", - properties: { - error: error.message, - currentSnapshotId: this.currentSnapshotId, - }, - }); + this.sendDebugLog("Failed to process snapshot change", { error: error.message }); } } this.snapshotChangeQueueLock = false; @@ -196,37 +198,26 @@ export class RunExecution { private async processSnapshotChange(runData: RunExecutionData): Promise { const { run, snapshot, completedWaitpoints } = runData; + const snapshotMetadata = { + incomingSnapshotId: snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + }; + // Check if the incoming snapshot is newer than the current one if (!this.currentSnapshotId || snapshot.friendlyId < this.currentSnapshotId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: received older snapshot, skipping", - properties: { - currentSnapshotId: this.currentSnapshotId, - receivedSnapshotId: snapshot.friendlyId, - }, - }); + this.sendDebugLog( + "handleSnapshotChange: received older snapshot, skipping", + snapshotMetadata + ); return; } if (snapshot.friendlyId === this.currentSnapshotId) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "handleSnapshotChange: snapshot not changed", - properties: { snapshot: snapshot.friendlyId }, - }); + this.sendDebugLog("handleSnapshotChange: snapshot not changed", snapshotMetadata); return; } - this.sendDebugLog({ - runId: this._runFriendlyId, - message: `snapshot change: ${snapshot.executionStatus}`, - properties: { - oldSnapshotId: this.currentSnapshotId, - newSnapshotId: snapshot.friendlyId, - completedWaitpoints: completedWaitpoints.length, - }, - }); + this.sendDebugLog(`snapshot 
change: ${snapshot.executionStatus}`, snapshotMetadata); // Reset the snapshot poll interval so we don't do unnecessary work this.snapshotPoller?.resetCurrentInterval(); @@ -243,10 +234,9 @@ export class RunExecution { const [error] = await tryCatch(this.cancel()); if (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "snapshot change: failed to cancel attempt", - properties: { error: error.message }, + this.sendDebugLog("snapshot change: failed to cancel attempt", { + ...snapshotMetadata, + error: error.message, }); } @@ -254,42 +244,27 @@ export class RunExecution { return; } case "FINISHED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is finished", - }); + this.sendDebugLog("Run is finished", snapshotMetadata); // Pretend we've just suspended the run. This will kill the process without failing the run. - await this.suspend(); + await this.taskRunProcess?.suspend(); return; } case "QUEUED_EXECUTING": case "EXECUTING_WITH_WAITPOINTS": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is executing with waitpoints", - properties: { snapshot: snapshot.friendlyId }, - }); + this.sendDebugLog("Run is executing with waitpoints", snapshotMetadata); const [error] = await tryCatch(this.taskRunProcess?.cleanup(false)); if (error) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to cleanup task run process, carrying on", - properties: { error: error.message }, + this.sendDebugLog("Failed to cleanup task run process, carrying on", { + ...snapshotMetadata, + error: error.message, }); } if (snapshot.friendlyId !== this.currentSnapshotId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after cleanup, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.currentSnapshotId, - }, - }); + this.sendDebugLog("Snapshot changed after cleanup, abort", snapshotMetadata); this.abortExecution(); return; @@ -298,54 +273,36 @@ export class RunExecution { await 
sleep(this.env.TRIGGER_PRE_SUSPEND_WAIT_MS); if (snapshot.friendlyId !== this.currentSnapshotId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Snapshot changed after suspend threshold, abort", - properties: { - oldSnapshotId: snapshot.friendlyId, - newSnapshotId: this.currentSnapshotId, - }, - }); + this.sendDebugLog("Snapshot changed after suspend threshold, abort", snapshotMetadata); this.abortExecution(); return; } - if (!this._runFriendlyId || !this.currentSnapshotId) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", - properties: { - runId: this._runFriendlyId, - snapshotId: this.currentSnapshotId, - }, - }); + if (!this.runFriendlyId || !this.currentSnapshotId) { + this.sendDebugLog( + "handleSnapshotChange: Missing run ID or snapshot ID after suspension, abort", + snapshotMetadata + ); this.abortExecution(); return; } const suspendResult = await this.httpClient.suspendRun( - this._runFriendlyId, + this.runFriendlyId, this.currentSnapshotId ); if (!suspendResult.success) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Failed to suspend run, staying alive 🎶", - properties: { - error: suspendResult.error, - }, + this.sendDebugLog("Failed to suspend run, staying alive 🎶", { + ...snapshotMetadata, + error: suspendResult.error, }); - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: suspend request failed", - properties: { - snapshotId: this.currentSnapshotId, - error: suspendResult.error, - }, + this.sendDebugLog("checkpoint: suspend request failed", { + ...snapshotMetadata, + error: suspendResult.error, }); // This is fine, we'll wait for the next status change @@ -353,79 +310,61 @@ export class RunExecution { } if (!suspendResult.data.ok) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "checkpoint: failed to suspend run", - properties: { - snapshotId: this.currentSnapshotId, - error: suspendResult.data.error, - }, + 
this.sendDebugLog("checkpoint: failed to suspend run", { + snapshotId: this.currentSnapshotId, + error: suspendResult.data.error, }); // This is fine, we'll wait for the next status change return; } - this.sendDebugLog({ - runId: run.friendlyId, - message: "Suspending, any day now 🚬", - properties: { ok: suspendResult.data.ok }, - }); + this.sendDebugLog("Suspending, any day now 🚬", snapshotMetadata); // Wait for next status change return; } case "SUSPENDED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended, kill the process", - properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, - }); + this.sendDebugLog("Run was suspended, kill the process", snapshotMetadata); + + // This will kill the process and fail the execution with a SuspendedProcessError + await this.taskRunProcess?.suspend(); - await this.suspend(); return; } case "PENDING_EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is pending execution", - properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, - }); + this.sendDebugLog("Run is pending execution", snapshotMetadata); if (completedWaitpoints.length === 0) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No waitpoints to complete, nothing to do", + this.sendDebugLog("No waitpoints to complete, nothing to do", snapshotMetadata); + return; + } + + const [error] = await tryCatch(this.restore()); + + if (error) { + this.sendDebugLog("Failed to restore execution", { + ...snapshotMetadata, + error: error.message, }); + + this.abortExecution(); return; } - await this.restore(); return; } case "EXECUTING": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run is now executing", - properties: { run: run.friendlyId, snapshot: this.currentSnapshotId }, - }); + this.sendDebugLog("Run is now executing", snapshotMetadata); if (completedWaitpoints.length === 0) { return; } - this.sendDebugLog({ - runId: run.friendlyId, - message: "Processing completed 
waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); + this.sendDebugLog("Processing completed waitpoints", snapshotMetadata); if (!this.taskRunProcess) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "No task run process, ignoring completed waitpoints", - properties: { completedWaitpoints: completedWaitpoints.length }, - }); + this.sendDebugLog("No task run process, ignoring completed waitpoints", snapshotMetadata); this.abortExecution(); return; @@ -439,11 +378,7 @@ export class RunExecution { } case "RUN_CREATED": case "QUEUED": { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Invalid status change", - properties: { status: snapshot.executionStatus }, - }); + this.sendDebugLog("Invalid status change", snapshotMetadata); this.abortExecution(); return; @@ -454,31 +389,73 @@ export class RunExecution { } } + private async startAttempt({ + isWarmStart, + }: { + isWarmStart?: boolean; + }): Promise { + if (!this.runFriendlyId || !this.currentSnapshotId) { + throw new Error("Cannot start attempt: missing run or snapshot ID"); + } + + this.sendDebugLog("Starting attempt"); + + const attemptStartedAt = Date.now(); + + // Check for abort before each major async operation + if (this.executionAbortController.signal.aborted) { + throw new ExecutionAbortError("Execution aborted before start"); + } + + const start = await this.httpClient.startRunAttempt( + this.runFriendlyId, + this.currentSnapshotId, + { isWarmStart } + ); + + if (this.executionAbortController.signal.aborted) { + throw new ExecutionAbortError("Execution aborted after start"); + } + + if (!start.success) { + throw new Error(`Start API call failed: ${start.error}`); + } + + // A snapshot was just created, so update the snapshot ID + this.currentSnapshotId = start.data.snapshot.friendlyId; + + const metrics = this.measureExecutionMetrics({ + attemptCreatedAt: attemptStartedAt, + dequeuedAt: this.dequeuedAt?.getTime(), + podScheduledAt: 
this.podScheduledAt?.getTime(), + }); + + this.sendDebugLog("Started attempt"); + + return { ...start.data, metrics }; + } + /** * Executes the run. This will return when the execution is complete and we should warm start. * When this returns, the child process will have been cleaned up. */ public async execute(runOpts: RunExecutionRunOptions): Promise { - this._runFriendlyId = runOpts.runFriendlyId; + // Setup initial state + this.runFriendlyId = runOpts.runFriendlyId; this.currentSnapshotId = runOpts.snapshotFriendlyId; this.dequeuedAt = runOpts.dequeuedAt; this.podScheduledAt = runOpts.podScheduledAt; - this.isWarmStart = runOpts.isWarmStart ?? false; - - // Reset abort controller for new execution - this.executionAbortController = new AbortController(); - // Create and start the heartbeat and poller services + // Create and start services this.runHeartbeat = new RunExecutionHeartbeat({ - runFriendlyId: this._runFriendlyId, + runFriendlyId: this.runFriendlyId, snapshotFriendlyId: this.currentSnapshotId, httpClient: this.httpClient, logger: this.logger, heartbeatIntervalSeconds: this.env.TRIGGER_HEARTBEAT_INTERVAL_SECONDS, }); - this.snapshotPoller = new RunExecutionSnapshotPoller({ - runFriendlyId: this._runFriendlyId, + runFriendlyId: this.runFriendlyId, snapshotFriendlyId: this.currentSnapshotId, httpClient: this.httpClient, logger: this.logger, @@ -489,183 +466,100 @@ export class RunExecution { this.runHeartbeat.start(); this.snapshotPoller.start(); - try { - const attemptStartedAt = Date.now(); - - // Check for abort before each major async operation - if (this.executionAbortController.signal.aborted) { - throw new ExecutionAbortError("Execution aborted before start"); - } - - const start = await this.httpClient.startRunAttempt( - this._runFriendlyId, - this.currentSnapshotId, - { - isWarmStart: this.isWarmStart, - } - ); + const [startError, start] = await tryCatch( + this.startAttempt({ isWarmStart: runOpts.isWarmStart }) + ); - if 
(this.executionAbortController.signal.aborted) { - throw new ExecutionAbortError("Execution aborted after start"); - } + if (startError) { + this.sendDebugLog("Failed to start attempt", { error: startError.message }); - if (!start.success) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Failed to start run", - properties: { error: start.error }, - }); + this.stopServices(); + return; + } - return; - } + const [executeError] = await tryCatch(this.executeRunWrapper(start)); - // A snapshot was just created, so update the snapshot ID - this.currentSnapshotId = start.data.snapshot.friendlyId; + if (executeError) { + this.sendDebugLog("Failed to execute run", { error: executeError.message }); - const attemptDuration = Date.now() - attemptStartedAt; + this.stopServices(); + return; + } - const { run, snapshot, execution, envVars } = start.data; + this.stopServices(); + } - this.sendDebugLog({ - runId: run.friendlyId, - message: "Started run", - properties: { snapshot: snapshot.friendlyId }, - }); + private async executeRunWrapper({ + run, + snapshot, + envVars, + execution, + metrics, + isWarmStart, + }: WorkloadRunAttemptStartResponseBody & { + metrics: TaskRunExecutionMetrics; + isWarmStart?: boolean; + }) { + this.currentTaskRunEnv = envVars; + + const [executeError] = await tryCatch( + this.executeRun({ + run, + snapshot, + envVars, + execution, + metrics, + isWarmStart, + }) + ); - const metrics = [ - { - name: "start", - event: "create_attempt", - timestamp: attemptStartedAt, - duration: attemptDuration, - }, - ] - .concat( - this.dequeuedAt - ? [ - { - name: "start", - event: "dequeue", - timestamp: this.dequeuedAt.getTime(), - duration: 0, - }, - ] - : [] - ) - .concat( - this.podScheduledAt - ? 
[ - { - name: "start", - event: "pod_scheduled", - timestamp: this.podScheduledAt.getTime(), - duration: 0, - }, - ] - : [] - ) satisfies TaskRunExecutionMetrics; - - this.currentTaskRunEnv = { - ...this.env.gatherProcessEnv(), - ...envVars, - }; + this.sendDebugLog("Run execution completed", { error: executeError?.message }); - const [error] = await tryCatch( - this.executeRun({ - run, - snapshot, - envVars: this.currentTaskRunEnv, - execution, - metrics, - }) - ); + if (!executeError) { + this.stopServices(); + return; + } - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run execution completed", - properties: { error: error?.message }, + if (executeError instanceof SuspendedProcessError) { + this.sendDebugLog("Run was suspended", { + run: run.friendlyId, + snapshot: snapshot.friendlyId, + error: executeError.message, }); - if (!error) { - // Stop the heartbeat and poller - this.runHeartbeat?.stop(); - this.snapshotPoller?.stop(); - } - - if (error) { - if (error instanceof SuspendedProcessError) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was suspended", - properties: { - run: run.friendlyId, - snapshot: snapshot.friendlyId, - error: error.message, - }, - }); - - return; - } - - if (error instanceof ExecutionAbortError) { - this.sendDebugLog({ - runId: run.friendlyId, - message: "Run was interrupted", - properties: { - run: run.friendlyId, - snapshot: snapshot.friendlyId, - error: error.message, - }, - }); + return; + } - return; - } + if (executeError instanceof ExecutionAbortError) { + this.sendDebugLog("Run was interrupted", { + run: run.friendlyId, + snapshot: snapshot.friendlyId, + error: executeError.message, + }); - this.sendDebugLog({ - runId: run.friendlyId, - message: "Error while executing attempt", - properties: { - error: error.message, - runId: run.friendlyId, - snapshotId: snapshot.friendlyId, - }, - }); - - const completion = { - id: execution.run.id, - ok: false, - retry: undefined, - error: 
TaskRunProcess.parseExecuteError(error), - } satisfies TaskRunFailedExecutionResult; - - this.snapshotPoller?.stop(); - await this.complete({ execution, completion }); - this.runHeartbeat?.stop(); - } - } finally { - // Ensure we clean up even if aborted - this.runHeartbeat?.stop(); - this.snapshotPoller?.stop(); + return; } - } - /** - * Cancels the current execution. - */ - public async cancel(): Promise { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "cancelling attempt", - properties: { runId: this._runFriendlyId }, + this.sendDebugLog("Error while executing attempt", { + error: executeError.message, + runId: run.friendlyId, + snapshotId: snapshot.friendlyId, }); - await this.taskRunProcess?.cancel(); - } + const completion = { + id: execution.run.id, + ok: false, + retry: undefined, + error: TaskRunProcess.parseExecuteError(executeError), + } satisfies TaskRunFailedExecutionResult; - public exit() { - if (this.taskRunProcess?.isPreparedForNextRun) { - this.taskRunProcess.forceExit(); + const [completeError] = await tryCatch(this.complete({ completion })); + + if (completeError) { + this.sendDebugLog("Failed to complete run", { error: completeError.message }); } + + this.stopServices(); } private async executeRun({ @@ -674,136 +568,94 @@ export class RunExecution { envVars, execution, metrics, + isWarmStart, }: WorkloadRunAttemptStartResponseBody & { - metrics?: TaskRunExecutionMetrics; + metrics: TaskRunExecutionMetrics; + isWarmStart?: boolean; }) { - this.isExecutionActive = true; - - try { - // To skip this step and eagerly create the task run process, run prepareForExecution first - if (!this.taskRunProcess || !this.taskRunProcess.isPreparedForNextRun) { - this.taskRunProcess = new TaskRunProcess({ - workerManifest: this.workerManifest, - env: envVars, - serverWorker: { - id: "managed", - contentHash: this.env.TRIGGER_CONTENT_HASH, - version: this.env.TRIGGER_DEPLOYMENT_VERSION, - engine: "V2", - }, - machineResources: execution.machine, - 
isWarmStart: this.isWarmStart, - }).initialize(); - } + // To skip this step and eagerly create the task run process, run prepareForExecution first + if (!this.taskRunProcess || !this.isPreparedForNextRun) { + this.taskRunProcess = this.createTaskRunProcess({ envVars, isWarmStart }); + } - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "executing task run process", - properties: { - attemptId: execution.attempt.id, - runId: execution.run.id, - }, - }); + this.sendDebugLog("executing task run process", { + attemptId: execution.attempt.id, + runId: execution.run.id, + }); - // Set up an abort handler that will cleanup the task run process - this.executionAbortController.signal.addEventListener("abort", async () => { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Execution aborted during task run, cleaning up process", - properties: { - attemptId: execution.attempt.id, - runId: execution.run.id, - }, - }); - - await this.taskRunProcess?.cleanup(true); - throw new ExecutionAbortError("Execution aborted during task run"); + // Set up an abort handler that will cleanup the task run process + this.executionAbortController.signal.addEventListener("abort", async () => { + this.sendDebugLog("Execution aborted during task run, cleaning up process", { + attemptId: execution.attempt.id, + runId: execution.run.id, }); - const completion = await this.taskRunProcess.execute( - { - payload: { - execution, - traceContext: execution.run.traceContext ?? {}, - metrics, - }, - messageId: run.friendlyId, - env: envVars, + await this.taskRunProcess?.cleanup(true); + throw new ExecutionAbortError("Execution aborted during task run"); + }); + + const completion = await this.taskRunProcess.execute( + { + payload: { + execution, + traceContext: execution.run.traceContext ?? 
{}, + metrics, }, - this.isWarmStart - ); + messageId: run.friendlyId, + env: envVars, + }, + isWarmStart + ); - // If we get here, the task completed normally - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Completed run", - properties: { completion: completion.ok }, + // If we get here, the task completed normally + this.sendDebugLog("Completed run attempt", { attemptSuccess: completion.ok }); + + // The execution has finished, so we can cleanup the task run process. Killing it should be safe. + const [error] = await tryCatch(this.taskRunProcess.cleanup(true)); + + if (error) { + this.sendDebugLog("Failed to cleanup task run process, submitting completion anyway", { + error: error.message, }); + } - // The execution has finished, so we can cleanup the task run process. Killing it should be safe. - const [error] = await tryCatch(this.taskRunProcess.cleanup(true)); + const [completionError] = await tryCatch(this.complete({ completion })); - if (error) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Failed to cleanup task run process, submitting completion anyway", - properties: { error: error.message }, - }); - } + if (completionError) { + this.sendDebugLog("Failed to complete run", { error: completionError.message }); + } + } + + /** + * Cancels the current execution. 
+ */ + public async cancel(): Promise { + this.sendDebugLog("cancelling attempt", { runId: this.runFriendlyId }); - const [completionError] = await tryCatch(this.complete({ execution, completion })); + await this.taskRunProcess?.cancel(); + } - if (completionError) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Failed to complete run", - properties: { error: completionError.message }, - }); - } - } finally { - this.isExecutionActive = false; + public exit() { + if (this.isPreparedForNextRun) { + this.taskRunProcess?.forceExit(); } } - private async complete({ - execution, - completion, - }: { - execution: TaskRunExecution; - completion: TaskRunExecutionResult; - }): Promise { - if (!this._runFriendlyId || !this.currentSnapshotId) { + private async complete({ completion }: { completion: TaskRunExecutionResult }): Promise { + if (!this.runFriendlyId || !this.currentSnapshotId) { throw new Error("Cannot complete run: missing run or snapshot ID"); } const completionResult = await this.httpClient.completeRunAttempt( - this._runFriendlyId, + this.runFriendlyId, this.currentSnapshotId, { completion } ); if (!completionResult.success) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "completion: failed to submit", - properties: { - error: completionResult.error, - }, - }); - - return; + throw new Error(`failed to submit completion: ${completionResult.error}`); } - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Attempt completion submitted", - properties: { - attemptStatus: completionResult.data.result.attemptStatus, - runId: completionResult.data.result.run.friendlyId, - snapshotId: completionResult.data.result.snapshot.friendlyId, - }, - }); - await this.handleCompletionResult({ completion, result: completionResult.data.result, @@ -817,15 +669,11 @@ export class RunExecution { completion: TaskRunExecutionResult; result: CompleteRunAttemptResult; }) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Handling 
completion result", - properties: { - completion: completion.ok, - attemptStatus: result.attemptStatus, - snapshotId: result.snapshot.friendlyId, - runId: result.run.friendlyId, - }, + this.sendDebugLog("Handling completion result", { + attemptSuccess: completion.ok, + attemptStatus: result.attemptStatus, + snapshotId: result.snapshot.friendlyId, + runId: result.run.friendlyId, }); // Update our snapshot ID to match the completion result @@ -835,27 +683,18 @@ export class RunExecution { const { attemptStatus } = result; if (attemptStatus === "RUN_FINISHED") { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Run finished", - }); + this.sendDebugLog("Run finished"); return; } if (attemptStatus === "RUN_PENDING_CANCEL") { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Run pending cancel", - }); + this.sendDebugLog("Run pending cancel"); return; } if (attemptStatus === "RETRY_QUEUED") { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Retry queued", - }); + this.sendDebugLog("Retry queued"); return; } @@ -869,122 +708,114 @@ export class RunExecution { throw new Error("Should retry but missing retry params."); } - await this.retry({ result, delay: completion.retry.delay }); + await this.retryImmediately({ retryOpts: completion.retry }); return; } assertExhaustive(attemptStatus); } - private async retry({ result, delay }: { result: CompleteRunAttemptResult; delay: number }) { - await sleep(delay); + private measureExecutionMetrics({ + attemptCreatedAt, + dequeuedAt, + podScheduledAt, + }: { + attemptCreatedAt: number; + dequeuedAt?: number; + podScheduledAt?: number; + }): TaskRunExecutionMetrics { + const metrics: TaskRunExecutionMetrics = [ + { + name: "start", + event: "create_attempt", + timestamp: attemptCreatedAt, + duration: Date.now() - attemptCreatedAt, + }, + ]; + + if (dequeuedAt) { + metrics.push({ + name: "start", + event: "dequeue", + timestamp: dequeuedAt, + duration: 0, + }); + } - await this.execute({ - 
runFriendlyId: result.run.id, - snapshotFriendlyId: result.snapshot.friendlyId, - isWarmStart: true, - }); + if (podScheduledAt) { + metrics.push({ + name: "start", + event: "pod_scheduled", + timestamp: podScheduledAt, + duration: 0, + }); + } + + return metrics; } - /** - * Suspends the current execution. - */ - private async suspend(): Promise { - try { - if (!this._runFriendlyId || !this.currentSnapshotId) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Cannot suspend: missing run or snapshot ID", - }); + private async retryImmediately({ retryOpts }: { retryOpts: TaskRunExecutionRetry }) { + this.sendDebugLog("Retrying run immediately", { + timestamp: retryOpts.timestamp, + delay: retryOpts.delay, + }); - return; - } + const delay = retryOpts.timestamp - Date.now(); - const suspendResult = await this.httpClient.suspendRun( - this._runFriendlyId, - this.currentSnapshotId - ); + if (delay > 0) { + // Wait for retry delay to pass + await sleep(delay); + } - if (!suspendResult.success) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Failed to suspend run, staying alive 🎶", - properties: { - error: suspendResult.error, - }, - }); + // Start and execute next attempt + const [startError, start] = await tryCatch(this.startAttempt({ isWarmStart: true })); - return; - } + if (startError) { + this.sendDebugLog("Failed to start attempt for retry", { error: startError.message }); - if (!suspendResult.data.ok) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "checkpoint: failed to suspend run", - properties: { - snapshotId: this.currentSnapshotId, - error: suspendResult.data.error, - }, - }); + this.stopServices(); + return; + } - return; - } + const [executeError] = await tryCatch(this.executeRunWrapper({ ...start, isWarmStart: true })); - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Suspending, any day now 🚬", - properties: { ok: suspendResult.data.ok }, - }); + if (executeError) { + 
this.sendDebugLog("Failed to execute run for retry", { error: executeError.message }); - await this.taskRunProcess?.suspend(); - } finally { - this.abortExecution(); + this.stopServices(); + return; } + + this.stopServices(); } /** * Restores a suspended execution from PENDING_EXECUTING */ private async restore(): Promise { - try { - if (!this._runFriendlyId || !this.currentSnapshotId) { - throw new Error("Cannot restore: missing run or snapshot ID"); - } - - // Track restore count - this.restoreCount++; + this.sendDebugLog("Restoring execution"); - // Short delay to give websocket time to reconnect - await sleep(100); + if (!this.runFriendlyId || !this.currentSnapshotId) { + throw new Error("Cannot restore: missing run or snapshot ID"); + } - // Process any env overrides - await this.processEnvOverrides(); + // Short delay to give websocket time to reconnect + await sleep(100); - const continuationResult = await this.httpClient.continueRunExecution( - this._runFriendlyId, - this.currentSnapshotId - ); + // Process any env overrides + await this.processEnvOverrides(); - if (!continuationResult.success) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "failed to restore execution", - properties: { - error: continuationResult.error, - }, - }); + const continuationResult = await this.httpClient.continueRunExecution( + this.runFriendlyId, + this.currentSnapshotId + ); - return; - } - } catch (error) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "failed to restore execution", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - } finally { - this.abortExecution(); + if (!continuationResult.success) { + throw new Error(continuationResult.error); } + + // Track restore count + this.restoreCount++; } /** @@ -992,10 +823,7 @@ export class RunExecution { */ private async processEnvOverrides() { if (!this.env.TRIGGER_METADATA_URL) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "No metadata URL, skipping env overrides", - }); + this.sendDebugLog("No metadata URL, skipping env overrides"); return; } @@ -1003,18 +831,11 @@ export class RunExecution { const overrides = await metadataClient.getEnvOverrides(); if (!overrides) { - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "No env overrides, skipping", - }); + this.sendDebugLog("No env overrides, skipping"); return; } - this.sendDebugLog({ - runId: this._runFriendlyId, - message: "Processing env overrides", - properties: { ...overrides }, - }); + this.sendDebugLog("Processing env overrides", overrides); // Override the env with the new values this.env.override(overrides); @@ -1038,16 +859,32 @@ export class RunExecution { } } - sendDebugLog(opts: SendDebugLogOptions) { + sendDebugLog( + message: string, + properties?: SendDebugLogOptions["properties"], + runIdOverride?: string + ) { this.logger.sendDebugLog({ - ...opts, + runId: runIdOverride ?? 
this.runFriendlyId, + message: `[execution] ${message}`, properties: { - ...opts.properties, + ...properties, + runId: this.runFriendlyId, + snapshotId: this.currentSnapshotId, executionRestoreCount: this.restoreCount, }, }); } + // Ensure we can only set this once + private set runFriendlyId(id: string) { + if (this._runFriendlyId) { + throw new Error("Run ID already set"); + } + + this._runFriendlyId = id; + } + public get runFriendlyId(): string | undefined { return this._runFriendlyId; } @@ -1056,7 +893,7 @@ export class RunExecution { return this.currentSnapshotId; } - public get taskRunEnv(): Record | null { + public get taskRunEnv(): Record | undefined { return this.currentTaskRunEnv; } @@ -1066,9 +903,22 @@ export class RunExecution { }; } + get isAborted() { + return this.executionAbortController.signal.aborted; + } + private abortExecution() { - if (this.isExecutionActive) { - this.executionAbortController.abort(); + if (this.isAborted) { + this.sendDebugLog("Execution already aborted"); + return; } + + this.executionAbortController.abort(); + this.stopServices(); + } + + private stopServices() { + this.runHeartbeat?.stop(); + this.snapshotPoller?.stop(); } } From eea6ce34b30f175ee81c1a9cf54fb45f0f328738 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 12:48:47 +0100 Subject: [PATCH 27/35] don't throw afet abort cleanup --- packages/cli-v3/src/entryPoints/managed/execution.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/packages/cli-v3/src/entryPoints/managed/execution.ts b/packages/cli-v3/src/entryPoints/managed/execution.ts index 70ad6b6c39..bfefeee27f 100644 --- a/packages/cli-v3/src/entryPoints/managed/execution.ts +++ b/packages/cli-v3/src/entryPoints/managed/execution.ts @@ -18,6 +18,7 @@ import { RunExecutionHeartbeat } from "./heartbeat.js"; import { RunExecutionSnapshotPoller } from "./poller.js"; import { assertExhaustive, tryCatch } from 
"@trigger.dev/core/utils"; import { MetadataClient } from "./overrides.js"; +import { randomBytes } from "node:crypto"; class ExecutionAbortError extends Error { constructor(message: string) { @@ -46,6 +47,7 @@ type RunExecutionRunOptions = { }; export class RunExecution { + private id: string; private executionAbortController: AbortController; private _runFriendlyId?: string; @@ -65,6 +67,7 @@ export class RunExecution { private snapshotPoller?: RunExecutionSnapshotPoller; constructor(opts: RunExecutionOptions) { + this.id = randomBytes(4).toString("hex"); this.workerManifest = opts.workerManifest; this.env = opts.env; this.httpClient = opts.httpClient; @@ -578,20 +581,15 @@ export class RunExecution { this.taskRunProcess = this.createTaskRunProcess({ envVars, isWarmStart }); } - this.sendDebugLog("executing task run process", { - attemptId: execution.attempt.id, - runId: execution.run.id, - }); + this.sendDebugLog("executing task run process", { runId: execution.run.id }); // Set up an abort handler that will cleanup the task run process this.executionAbortController.signal.addEventListener("abort", async () => { this.sendDebugLog("Execution aborted during task run, cleaning up process", { - attemptId: execution.attempt.id, runId: execution.run.id, }); await this.taskRunProcess?.cleanup(true); - throw new ExecutionAbortError("Execution aborted during task run"); }); const completion = await this.taskRunProcess.execute( @@ -871,6 +869,7 @@ export class RunExecution { ...properties, runId: this.runFriendlyId, snapshotId: this.currentSnapshotId, + executionId: this.id, executionRestoreCount: this.restoreCount, }, }); From eea2106f02d6b39bb25b56f2a9e809c130de9120 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 13:57:45 +0100 Subject: [PATCH 28/35] poller should use private interval --- packages/cli-v3/src/entryPoints/managed/poller.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/packages/cli-v3/src/entryPoints/managed/poller.ts b/packages/cli-v3/src/entryPoints/managed/poller.ts index bb85886233..0195661a66 100644 --- a/packages/cli-v3/src/entryPoints/managed/poller.ts +++ b/packages/cli-v3/src/entryPoints/managed/poller.ts @@ -87,7 +87,7 @@ export class RunExecutionSnapshotPoller { await this.handleSnapshotChange(response.data.execution); }, - intervalMs: opts.snapshotPollIntervalSeconds * 1000, + intervalMs: this.snapshotPollIntervalMs, leadingEdge: false, onError: async (error) => { this.logger.sendDebugLog({ From b61b360b93dd7687541d6d8c940ac97ad5c682ff Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:06:30 +0100 Subject: [PATCH 29/35] rename heartbeat service file --- packages/core/src/v3/index.ts | 2 +- packages/core/src/v3/runEngineWorker/supervisor/session.ts | 2 +- packages/core/src/v3/utils/{heartbeat.ts => interval.ts} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename packages/core/src/v3/utils/{heartbeat.ts => interval.ts} (100%) diff --git a/packages/core/src/v3/index.ts b/packages/core/src/v3/index.ts index 1f1f4d3076..3bd1fc4547 100644 --- a/packages/core/src/v3/index.ts +++ b/packages/core/src/v3/index.ts @@ -67,7 +67,7 @@ export { } from "./utils/ioSerialization.js"; export * from "./utils/imageRef.js"; -export * from "./utils/heartbeat.js"; +export * from "./utils/interval.js"; export * from "./config.js"; export { diff --git a/packages/core/src/v3/runEngineWorker/supervisor/session.ts b/packages/core/src/v3/runEngineWorker/supervisor/session.ts index 8dd90a3b98..f309043e37 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/session.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/session.ts @@ -8,7 +8,7 @@ import { VERSION } from "../../../version.js"; import { io, Socket } from "socket.io-client"; import { WorkerClientToServerEvents, WorkerServerToClientEvents } from "../types.js"; import { getDefaultWorkerHeaders } from 
"./util.js"; -import { HeartbeatService } from "../../utils/heartbeat.js"; +import { HeartbeatService } from "../../utils/interval.js"; type SupervisorSessionOptions = SupervisorClientCommonOptions & { queueConsumerEnabled?: boolean; diff --git a/packages/core/src/v3/utils/heartbeat.ts b/packages/core/src/v3/utils/interval.ts similarity index 100% rename from packages/core/src/v3/utils/heartbeat.ts rename to packages/core/src/v3/utils/interval.ts From a434e9868ebd6fee1e2fd1caa9c0e28c65f5828a Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:15:22 +0100 Subject: [PATCH 30/35] rename HeartbeatService to IntervalService --- .../v3/runEngineWorker/supervisor/session.ts | 12 ++-- packages/core/src/v3/utils/interval.ts | 70 ++++++++++--------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/packages/core/src/v3/runEngineWorker/supervisor/session.ts b/packages/core/src/v3/runEngineWorker/supervisor/session.ts index f309043e37..747e1dae5e 100644 --- a/packages/core/src/v3/runEngineWorker/supervisor/session.ts +++ b/packages/core/src/v3/runEngineWorker/supervisor/session.ts @@ -8,7 +8,7 @@ import { VERSION } from "../../../version.js"; import { io, Socket } from "socket.io-client"; import { WorkerClientToServerEvents, WorkerServerToClientEvents } from "../types.js"; import { getDefaultWorkerHeaders } from "./util.js"; -import { HeartbeatService } from "../../utils/interval.js"; +import { IntervalService } from "../../utils/interval.js"; type SupervisorSessionOptions = SupervisorClientCommonOptions & { queueConsumerEnabled?: boolean; @@ -29,7 +29,7 @@ export class SupervisorSession extends EventEmitter { private readonly queueConsumerEnabled: boolean; private readonly queueConsumer: RunQueueConsumer; - private readonly heartbeatService: HeartbeatService; + private readonly heartbeat: IntervalService; private readonly heartbeatIntervalSeconds: number; constructor(private opts: SupervisorSessionOptions) 
{ @@ -50,8 +50,8 @@ export class SupervisorSession extends EventEmitter { // TODO: This should be dynamic and set by (or at least overridden by) the platform this.heartbeatIntervalSeconds = opts.heartbeatIntervalSeconds || 30; - this.heartbeatService = new HeartbeatService({ - heartbeat: async () => { + this.heartbeat = new IntervalService({ + onInterval: async () => { console.debug("[SupervisorSession] Sending heartbeat"); const body = this.getHeartbeatBody(); @@ -182,7 +182,7 @@ export class SupervisorSession extends EventEmitter { if (this.queueConsumerEnabled) { console.log("[SupervisorSession] Queue consumer enabled"); this.queueConsumer.start(); - this.heartbeatService.start(); + this.heartbeat.start(); } else { console.warn("[SupervisorSession] Queue consumer disabled"); } @@ -196,7 +196,7 @@ export class SupervisorSession extends EventEmitter { } async stop() { - this.heartbeatService.stop(); + this.heartbeat.stop(); this.runNotificationsSocket?.disconnect(); } diff --git a/packages/core/src/v3/utils/interval.ts b/packages/core/src/v3/utils/interval.ts index 0684bd73c5..59fd0a94cb 100644 --- a/packages/core/src/v3/utils/interval.ts +++ b/packages/core/src/v3/utils/interval.ts @@ -1,57 +1,59 @@ -type HeartbeatServiceOptions = { - heartbeat: () => Promise; +type IntervalServiceOptions = { + onInterval: () => Promise; + onError?: (error: unknown) => Promise; intervalMs?: number; leadingEdge?: boolean; - onError?: (error: unknown) => Promise; }; -export class HeartbeatService { - private _heartbeat: () => Promise; +export class IntervalService { + private _onInterval: () => Promise; + private _onError?: (error: unknown) => Promise; + private _intervalMs: number; - private _nextHeartbeat: NodeJS.Timeout | undefined; + private _nextInterval: NodeJS.Timeout | undefined; private _leadingEdge: boolean; - private _isHeartbeating: boolean; - private _onError?: (error: unknown) => Promise; + private _isEnabled: boolean; + + constructor(opts: IntervalServiceOptions) { + 
this._onInterval = opts.onInterval; + this._onError = opts.onError; - constructor(opts: HeartbeatServiceOptions) { - this._heartbeat = opts.heartbeat; this._intervalMs = opts.intervalMs ?? 45_000; - this._nextHeartbeat = undefined; + this._nextInterval = undefined; this._leadingEdge = opts.leadingEdge ?? false; - this._isHeartbeating = false; - this._onError = opts.onError; + this._isEnabled = false; } start() { - if (this._isHeartbeating) { + if (this._isEnabled) { return; } - this._isHeartbeating = true; + this._isEnabled = true; if (this._leadingEdge) { - this.#doHeartbeat(); + this.#doInterval(); } else { - this.#scheduleNextHeartbeat(); + this.#scheduleNextInterval(); } } stop() { - if (!this._isHeartbeating) { + if (!this._isEnabled) { return; } - this._isHeartbeating = false; - this.#clearNextHeartbeat(); + this._isEnabled = false; + this.#clearNextInterval(); } resetCurrentInterval() { - if (!this._isHeartbeating) { + if (!this._isEnabled) { return; } - this.#clearNextHeartbeat(); - this.#scheduleNextHeartbeat(); + this.#clearNextInterval(); + this.#scheduleNextInterval(); } updateInterval(intervalMs: number) { @@ -59,35 +61,35 @@ export class HeartbeatService { this.resetCurrentInterval(); } - #doHeartbeat = async () => { - this.#clearNextHeartbeat(); + #doInterval = async () => { + this.#clearNextInterval(); - if (!this._isHeartbeating) { + if (!this._isEnabled) { return; } try { - await this._heartbeat(); + await this._onInterval(); } catch (error) { if (this._onError) { try { await this._onError(error); } catch (error) { - console.error("Error handling heartbeat error", error); + console.error("Error during interval error handler", error); } } } - this.#scheduleNextHeartbeat(); + this.#scheduleNextInterval(); }; - #clearNextHeartbeat() { - if (this._nextHeartbeat) { - clearTimeout(this._nextHeartbeat); + #clearNextInterval() { + if (this._nextInterval) { + clearTimeout(this._nextInterval); } } - #scheduleNextHeartbeat() { - this._nextHeartbeat = 
setTimeout(this.#doHeartbeat, this._intervalMs); + #scheduleNextInterval() { + this._nextInterval = setTimeout(this.#doInterval, this._intervalMs); } } From 1023d13d1e623f1d2a6303705b5f2073413cbdb4 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:17:07 +0100 Subject: [PATCH 31/35] restore old heartbeat service but deprecate it --- packages/core/src/v3/index.ts | 1 + packages/core/src/v3/utils/heartbeat.ts | 96 +++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 packages/core/src/v3/utils/heartbeat.ts diff --git a/packages/core/src/v3/index.ts b/packages/core/src/v3/index.ts index 3bd1fc4547..8877393dca 100644 --- a/packages/core/src/v3/index.ts +++ b/packages/core/src/v3/index.ts @@ -68,6 +68,7 @@ export { export * from "./utils/imageRef.js"; export * from "./utils/interval.js"; +export * from "./utils/heartbeat.js"; export * from "./config.js"; export { diff --git a/packages/core/src/v3/utils/heartbeat.ts b/packages/core/src/v3/utils/heartbeat.ts new file mode 100644 index 0000000000..c9bb0d97ed --- /dev/null +++ b/packages/core/src/v3/utils/heartbeat.ts @@ -0,0 +1,96 @@ +type HeartbeatServiceOptions = { + heartbeat: () => Promise; + intervalMs?: number; + leadingEdge?: boolean; + onError?: (error: unknown) => Promise; +}; + +/** + * @deprecated Use IntervalService instead + */ +export class HeartbeatService { + private _heartbeat: () => Promise; + private _intervalMs: number; + private _nextHeartbeat: NodeJS.Timeout | undefined; + private _leadingEdge: boolean; + private _isHeartbeating: boolean; + private _onError?: (error: unknown) => Promise; + + constructor(opts: HeartbeatServiceOptions) { + this._heartbeat = opts.heartbeat; + this._intervalMs = opts.intervalMs ?? 45_000; + this._nextHeartbeat = undefined; + this._leadingEdge = opts.leadingEdge ?? 
false; + this._isHeartbeating = false; + this._onError = opts.onError; + } + + start() { + if (this._isHeartbeating) { + return; + } + + this._isHeartbeating = true; + + if (this._leadingEdge) { + this.#doHeartbeat(); + } else { + this.#scheduleNextHeartbeat(); + } + } + + stop() { + if (!this._isHeartbeating) { + return; + } + + this._isHeartbeating = false; + this.#clearNextHeartbeat(); + } + + resetCurrentInterval() { + if (!this._isHeartbeating) { + return; + } + + this.#clearNextHeartbeat(); + this.#scheduleNextHeartbeat(); + } + + updateInterval(intervalMs: number) { + this._intervalMs = intervalMs; + this.resetCurrentInterval(); + } + + #doHeartbeat = async () => { + this.#clearNextHeartbeat(); + + if (!this._isHeartbeating) { + return; + } + + try { + await this._heartbeat(); + } catch (error) { + if (this._onError) { + try { + await this._onError(error); + } catch (error) { + console.error("Error handling heartbeat error", error); + } + } + } + + this.#scheduleNextHeartbeat(); + }; + + #clearNextHeartbeat() { + if (this._nextHeartbeat) { + clearTimeout(this._nextHeartbeat); + } + } + + #scheduleNextHeartbeat() { + this._nextHeartbeat = setTimeout(this.#doHeartbeat, this._intervalMs); + } +} From 419dd21cea902e08285d4d6fbdf1adb289ea3f99 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:22:25 +0100 Subject: [PATCH 32/35] use the new interval service everywhere --- apps/supervisor/src/services/podCleaner.ts | 12 +- .../authenticatedSocketConnection.server.ts | 8 +- .../src/entryPoints/dev-run-controller.ts | 14 +-- .../src/entryPoints/managed/heartbeat.ts | 113 +----------------- .../cli-v3/src/entryPoints/managed/poller.ts | 8 +- 5 files changed, 25 insertions(+), 130 deletions(-) diff --git a/apps/supervisor/src/services/podCleaner.ts b/apps/supervisor/src/services/podCleaner.ts index e39a98cfbe..56eaaeb88a 100644 --- a/apps/supervisor/src/services/podCleaner.ts +++ 
b/apps/supervisor/src/services/podCleaner.ts @@ -1,7 +1,7 @@ import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; import { K8sApi } from "../clients/kubernetes.js"; import { createK8sApi } from "../clients/kubernetes.js"; -import { HeartbeatService } from "@trigger.dev/core/v3"; +import { IntervalService } from "@trigger.dev/core/v3"; import { Counter, Gauge, Registry } from "prom-client"; import { register } from "../metrics.js"; @@ -19,7 +19,7 @@ export class PodCleaner { private readonly namespace: string; private readonly batchSize: number; - private readonly deletionHeartbeat: HeartbeatService; + private readonly deletionInterval: IntervalService; // Metrics private readonly register: Registry; @@ -32,10 +32,10 @@ export class PodCleaner { this.namespace = opts.namespace; this.batchSize = opts.batchSize ?? 500; - this.deletionHeartbeat = new HeartbeatService({ + this.deletionInterval = new IntervalService({ intervalMs: opts.intervalMs ?? 10000, leadingEdge: true, - heartbeat: this.deleteCompletedPods.bind(this), + onInterval: this.deleteCompletedPods.bind(this), }); // Initialize metrics @@ -57,11 +57,11 @@ export class PodCleaner { } async start() { - this.deletionHeartbeat.start(); + this.deletionInterval.start(); } async stop() { - this.deletionHeartbeat.stop(); + this.deletionInterval.stop(); } private async deleteCompletedPods() { diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts index a6de96b9c9..cd255c800b 100644 --- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts +++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts @@ -1,6 +1,6 @@ import { clientWebsocketMessages, - HeartbeatService, + IntervalService, serverWebsocketMessages, } from "@trigger.dev/core/v3"; import { ZodMessageHandler, ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler"; @@ -19,7 +19,7 @@ export class AuthenticatedSocketConnection { 
private _sender: ZodMessageSender; private _consumer: DevQueueConsumer; private _messageHandler: ZodMessageHandler; - private _pingService: HeartbeatService; + private _pingService: IntervalService; constructor( public ws: WebSocket, @@ -75,8 +75,8 @@ export class AuthenticatedSocketConnection { // }); }); - this._pingService = new HeartbeatService({ - heartbeat: async () => { + this._pingService = new IntervalService({ + onInterval: async () => { if (ws.readyState !== WebSocket.OPEN) { logger.debug("[AuthenticatedSocketConnection] Websocket not open, skipping ping"); return; diff --git a/packages/cli-v3/src/entryPoints/dev-run-controller.ts b/packages/cli-v3/src/entryPoints/dev-run-controller.ts index d6660c69d4..f851bc07aa 100644 --- a/packages/cli-v3/src/entryPoints/dev-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/dev-run-controller.ts @@ -1,7 +1,7 @@ import { CompleteRunAttemptResult, DequeuedMessage, - HeartbeatService, + IntervalService, LogLevel, RunExecutionData, TaskRunExecution, @@ -44,9 +44,9 @@ export class DevRunController { private taskRunProcess?: TaskRunProcess; private readonly worker: BackgroundWorker; private readonly httpClient: CliApiClient; - private readonly runHeartbeat: HeartbeatService; + private readonly runHeartbeat: IntervalService; private readonly heartbeatIntervalSeconds: number; - private readonly snapshotPoller: HeartbeatService; + private readonly snapshotPoller: IntervalService; private readonly snapshotPollIntervalSeconds: number; private state: @@ -78,8 +78,8 @@ export class DevRunController { this.httpClient = opts.httpClient; - this.snapshotPoller = new HeartbeatService({ - heartbeat: async () => { + this.snapshotPoller = new IntervalService({ + onInterval: async () => { if (!this.runFriendlyId) { logger.debug("[DevRunController] Skipping snapshot poll, no run ID"); return; @@ -121,8 +121,8 @@ export class DevRunController { }, }); - this.runHeartbeat = new HeartbeatService({ - heartbeat: async () => { + 
this.runHeartbeat = new IntervalService({ + onInterval: async () => { if (!this.runFriendlyId || !this.snapshotFriendlyId) { logger.debug("[DevRunController] Skipping heartbeat, no run ID or snapshot ID"); return; diff --git a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts index 799408e13b..3b3c820c91 100644 --- a/packages/cli-v3/src/entryPoints/managed/heartbeat.ts +++ b/packages/cli-v3/src/entryPoints/managed/heartbeat.ts @@ -1,4 +1,4 @@ -import { HeartbeatService, RunExecutionData } from "@trigger.dev/core/v3"; +import { IntervalService } from "@trigger.dev/core/v3"; import { WorkloadHttpClient } from "@trigger.dev/core/v3/runEngineWorker"; import { RunLogger } from "./logger.js"; @@ -17,7 +17,7 @@ export class RunExecutionHeartbeat { private readonly httpClient: WorkloadHttpClient; private readonly logger: RunLogger; private readonly heartbeatIntervalMs: number; - private readonly heartbeat: HeartbeatService; + private readonly heartbeat: IntervalService; constructor(opts: RunExecutionHeartbeatOptions) { this.runFriendlyId = opts.runFriendlyId; @@ -36,8 +36,8 @@ export class RunExecutionHeartbeat { }, }); - this.heartbeat = new HeartbeatService({ - heartbeat: async () => { + this.heartbeat = new IntervalService({ + onInterval: async () => { this.logger.sendDebugLog({ runId: this.runFriendlyId, message: "heartbeat: started", @@ -90,108 +90,3 @@ export class RunExecutionHeartbeat { this.heartbeat.stop(); } } - -type RunExecutionSnapshotPollerOptions = { - runFriendlyId: string; - snapshotFriendlyId: string; - httpClient: WorkloadHttpClient; - logger: RunLogger; - snapshotPollIntervalSeconds: number; - handleSnapshotChange: (execution: RunExecutionData) => Promise; -}; - -class RunExecutionSnapshotPoller { - private readonly logger: RunLogger; - private readonly poller: HeartbeatService; - private readonly httpClient: WorkloadHttpClient; - - private readonly runFriendlyId: string; - private readonly 
snapshotFriendlyId: string; - - private readonly handleSnapshotChange: (execution: RunExecutionData) => Promise; - - constructor(opts: RunExecutionSnapshotPollerOptions) { - this.logger = opts.logger; - this.httpClient = opts.httpClient; - - this.runFriendlyId = opts.runFriendlyId; - this.snapshotFriendlyId = opts.snapshotFriendlyId; - - this.handleSnapshotChange = opts.handleSnapshotChange; - - this.poller = new HeartbeatService({ - heartbeat: async () => { - if (!this.runFriendlyId) { - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: "Skipping snapshot poll, no run ID", - }); - return; - } - - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: "Polling for latest snapshot", - }); - - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: `snapshot poll: started`, - properties: { - snapshotId: this.snapshotFriendlyId, - }, - }); - - const response = await this.httpClient.getRunExecutionData(this.runFriendlyId); - - if (!response.success) { - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: "Snapshot poll failed", - properties: { - error: response.error, - }, - }); - - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: `snapshot poll: failed`, - properties: { - snapshotId: this.snapshotFriendlyId, - error: response.error, - }, - }); - - return; - } - - await this.handleSnapshotChange(response.data.execution); - }, - intervalMs: opts.snapshotPollIntervalSeconds * 1000, - leadingEdge: false, - onError: async (error) => { - this.logger.sendDebugLog({ - runId: this.runFriendlyId, - message: "Failed to poll for snapshot", - properties: { error: error instanceof Error ? 
error.message : String(error) }, - }); - }, - }); - } - - resetCurrentInterval() { - this.poller.resetCurrentInterval(); - } - - updateInterval(intervalMs: number) { - this.poller.updateInterval(intervalMs); - } - - start() { - this.poller.start(); - } - - stop() { - this.poller.stop(); - } -} diff --git a/packages/cli-v3/src/entryPoints/managed/poller.ts b/packages/cli-v3/src/entryPoints/managed/poller.ts index 0195661a66..2decd401ee 100644 --- a/packages/cli-v3/src/entryPoints/managed/poller.ts +++ b/packages/cli-v3/src/entryPoints/managed/poller.ts @@ -1,6 +1,6 @@ import { WorkloadHttpClient } from "@trigger.dev/core/v3/runEngineWorker"; import { RunLogger } from "./logger.js"; -import { HeartbeatService, RunExecutionData } from "@trigger.dev/core/v3"; +import { IntervalService, RunExecutionData } from "@trigger.dev/core/v3"; export type RunExecutionSnapshotPollerOptions = { runFriendlyId: string; @@ -19,7 +19,7 @@ export class RunExecutionSnapshotPoller { private readonly logger: RunLogger; private readonly snapshotPollIntervalMs: number; private readonly handleSnapshotChange: (runData: RunExecutionData) => Promise; - private readonly poller: HeartbeatService; + private readonly poller: IntervalService; constructor(opts: RunExecutionSnapshotPollerOptions) { this.runFriendlyId = opts.runFriendlyId; @@ -39,8 +39,8 @@ export class RunExecutionSnapshotPoller { }, }); - this.poller = new HeartbeatService({ - heartbeat: async () => { + this.poller = new IntervalService({ + onInterval: async () => { if (!this.runFriendlyId) { this.logger.sendDebugLog({ runId: this.runFriendlyId, From dce194f8a2a044684f3619ca5d2fc7b5b9d1ef9e Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:22:42 +0100 Subject: [PATCH 33/35] Revert "temp disable pre" This reverts commit e03f4179de6a731c17253b68a6e00bcb7ac1736b. 
--- .changeset/pre.json | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .changeset/pre.json diff --git a/.changeset/pre.json b/.changeset/pre.json new file mode 100644 index 0000000000..3863ea8ac2 --- /dev/null +++ b/.changeset/pre.json @@ -0,0 +1,32 @@ +{ + "mode": "pre", + "tag": "v4-beta", + "initialVersions": { + "coordinator": "0.0.1", + "docker-provider": "0.0.1", + "kubernetes-provider": "0.0.1", + "supervisor": "0.0.1", + "webapp": "1.0.0", + "@trigger.dev/build": "3.3.17", + "trigger.dev": "3.3.17", + "@trigger.dev/core": "3.3.17", + "@trigger.dev/python": "3.3.17", + "@trigger.dev/react-hooks": "3.3.17", + "@trigger.dev/redis-worker": "3.3.17", + "@trigger.dev/rsc": "3.3.17", + "@trigger.dev/sdk": "3.3.17" + }, + "changesets": [ + "breezy-turtles-talk", + "four-needles-add", + "honest-files-decide", + "late-chairs-ring", + "moody-squids-count", + "nice-colts-boil", + "polite-lies-fix", + "red-wasps-cover", + "shiny-kiwis-beam", + "smart-coins-hammer", + "weak-jobs-hide" + ] +} From 2d0bef949c9f7add48823461bd840457966c4d85 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:31:26 +0100 Subject: [PATCH 34/35] add changeset --- .changeset/tricky-houses-invite.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changeset/tricky-houses-invite.md diff --git a/.changeset/tricky-houses-invite.md b/.changeset/tricky-houses-invite.md new file mode 100644 index 0000000000..e21e7b5818 --- /dev/null +++ b/.changeset/tricky-houses-invite.md @@ -0,0 +1,6 @@ +--- +"trigger.dev": patch +"@trigger.dev/core": patch +--- + +Managed run controller performance and reliability improvements From df845e1eff6e5f52bae607d1acab71b218950bad Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 15 Apr 2025 15:51:00 +0100 Subject: [PATCH 35/35] replace all run engine find uniques with find first --- 
.../run-engine/src/engine/db/worker.ts | 9 +++------ .../run-engine/src/engine/systems/batchSystem.ts | 2 +- .../src/engine/systems/runAttemptSystem.ts | 10 ++++------ .../run-engine/src/engine/systems/ttlSystem.ts | 2 +- .../src/engine/systems/waitpointSystem.ts | 16 ++++++---------- 5 files changed, 15 insertions(+), 24 deletions(-) diff --git a/internal-packages/run-engine/src/engine/db/worker.ts b/internal-packages/run-engine/src/engine/db/worker.ts index 81a2880699..e61e9e8d43 100644 --- a/internal-packages/run-engine/src/engine/db/worker.ts +++ b/internal-packages/run-engine/src/engine/db/worker.ts @@ -264,13 +264,10 @@ export async function getManagedWorkerFromCurrentlyPromotedDeployment( prisma: PrismaClientOrTransaction, environmentId: string ): Promise { - // TODO: fixme - const promotion = await prisma.workerDeploymentPromotion.findUnique({ + const promotion = await prisma.workerDeploymentPromotion.findFirst({ where: { - environmentId_label: { - environmentId, - label: CURRENT_DEPLOYMENT_LABEL, - }, + environmentId, + label: CURRENT_DEPLOYMENT_LABEL, }, include: { deployment: { diff --git a/internal-packages/run-engine/src/engine/systems/batchSystem.ts b/internal-packages/run-engine/src/engine/systems/batchSystem.ts index 5f1948a831..8f0a14f4e3 100644 --- a/internal-packages/run-engine/src/engine/systems/batchSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/batchSystem.ts @@ -34,7 +34,7 @@ export class BatchSystem { */ async #tryCompleteBatch({ batchId }: { batchId: string }) { return startSpan(this.$.tracer, "#tryCompleteBatch", async (span) => { - const batch = await this.$.prisma.batchTaskRun.findUnique({ + const batch = await this.$.prisma.batchTaskRun.findFirst({ select: { status: true, runtimeEnvironmentId: true, diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 5ecf5ffd99..76b97c7d60 100644 --- 
a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -139,12 +139,10 @@ export class RunAttemptSystem { throw new ServiceValidationError("Task run is not locked", 400); } - const queue = await prisma.taskQueue.findUnique({ + const queue = await prisma.taskQueue.findFirst({ where: { - runtimeEnvironmentId_name: { - runtimeEnvironmentId: environment.id, - name: taskRun.queue, - }, + runtimeEnvironmentId: environment.id, + name: taskRun.queue, }, }); @@ -1199,7 +1197,7 @@ export class RunAttemptSystem { async #getAuthenticatedEnvironmentFromRun(runId: string, tx?: PrismaClientOrTransaction) { const prisma = tx ?? this.$.prisma; - const taskRun = await prisma.taskRun.findUnique({ + const taskRun = await prisma.taskRun.findFirst({ where: { id: runId, }, diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts index 12910f4634..f020fe2b3c 100644 --- a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts @@ -33,7 +33,7 @@ export class TtlSystem { } //only expire "PENDING" runs - const run = await prisma.taskRun.findUnique({ where: { id: runId } }); + const run = await prisma.taskRun.findFirst({ where: { id: runId } }); if (!run) { this.$.logger.debug("Could not find enqueued run to expire", { diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts index 669fcf0e26..b2eb9e5396 100644 --- a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -159,12 +159,10 @@ export class WaitpointSystem { const prisma = tx ?? this.$.prisma; const existingWaitpoint = idempotencyKey - ? await prisma.waitpoint.findUnique({ + ? 
await prisma.waitpoint.findFirst({ where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey, - }, + environmentId, + idempotencyKey, }, }) : undefined; @@ -241,12 +239,10 @@ export class WaitpointSystem { tags?: string[]; }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> { const existingWaitpoint = idempotencyKey - ? await this.$.prisma.waitpoint.findUnique({ + ? await this.$.prisma.waitpoint.findFirst({ where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey, - }, + environmentId, + idempotencyKey, }, }) : undefined;