From 3c1d8c7eed9db98ee5914101485645f0a6657163 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:09:32 +0100 Subject: [PATCH 1/4] never abort the same controller twice --- apps/coordinator/src/checkpointer.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts index d191f488f5..36369dd2d1 100644 --- a/apps/coordinator/src/checkpointer.ts +++ b/apps/coordinator/src/checkpointer.ts @@ -248,7 +248,12 @@ export class Checkpointer { return false; } - controller.abort("cancelCheckpointing()"); + if (controller.signal.aborted) { + this.#logger.debug("Controller already aborted", { runId }); + return false; + } + + controller.abort("cancelCheckpoint()"); this.#abortControllers.delete(runId); return true; From 3f00f304ef77a1fe9dd5410faa76faa0423b17ec Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Mon, 16 Sep 2024 23:35:19 +0100 Subject: [PATCH 2/4] prevent uncaught exception when aborting pipe --- apps/coordinator/src/checkpointer.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts index 36369dd2d1..7c0ceecdb3 100644 --- a/apps/coordinator/src/checkpointer.ts +++ b/apps/coordinator/src/checkpointer.ts @@ -481,7 +481,7 @@ export class Checkpointer { const containerId = this.#logger.debug( // @ts-expect-error - await $$`crictl ps` + await $`crictl ps` .pipeStdout($$({ stdin: "pipe" })`grep ${containterName}`) .pipeStdout($$({ stdin: "pipe" })`cut -f1 ${"-d "}`) ); From 7061e32828258963dc73ea6ebd92ae1148f44a74 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:53:28 +0100 Subject: [PATCH 3/4] abort signal assertions and more logging --- apps/coordinator/src/checkpointer.ts | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts index 7c0ceecdb3..21cc9a554b 100644 --- a/apps/coordinator/src/checkpointer.ts +++ b/apps/coordinator/src/checkpointer.ts @@ -48,6 +48,8 @@ type CheckpointerOptions = { chaosMonkey?: ChaosMonkey; }; +class CheckpointAbortError extends Error {} + async function getFileSize(filePath: string): Promise { try { const stats = await fs.stat(filePath); @@ -400,6 +402,14 @@ export class Checkpointer { const controller = new AbortController(); this.#abortControllers.set(runId, controller); + const assertNotAborted = (abortMessage?: string) => { + if (controller.signal.aborted) { + throw new CheckpointAbortError(abortMessage); + } + + this.#logger.debug("Not aborted", { abortMessage }); + }; + const $$ = $({ signal: controller.signal }); const shortCode = nanoid(8); @@ -423,6 +433,7 @@ export class Checkpointer { }; try { + assertNotAborted("chaosMonkey.call"); await this.chaosMonkey.call({ $: $$ }); this.#logger.log("Checkpointing:", { options }); @@ -479,6 +490,7 @@ export class Checkpointer { return { success: false, reason: "SKIP_RETRYING" }; } + assertNotAborted("cmd: crictl ps"); const containerId = this.#logger.debug( // @ts-expect-error await $`crictl ps` @@ -501,6 +513,7 @@ export class Checkpointer { } // Create checkpoint + assertNotAborted("cmd: crictl checkpoint"); this.#logger.debug(await $$`crictl checkpoint --export=${exportLocation} ${containerId}`); const postCheckpoint = performance.now(); @@ -509,20 +522,25 @@ export class Checkpointer { this.#logger.log("checkpoint archive created", { size, options }); // Create image from checkpoint + assertNotAborted("cmd: buildah from scratch"); const container = this.#logger.debug(await $$`buildah from scratch`); const postFrom = performance.now(); + assertNotAborted("cmd: buildah add"); this.#logger.debug(await $$`buildah add ${container} ${exportLocation} /`); const postAdd = performance.now(); + assertNotAborted("cmd: buildah config"); this.#logger.debug( await $$`buildah config --annotation=io.kubernetes.cri-o.annotations.checkpoint.name=counter ${container}` ); const postConfig = performance.now(); + assertNotAborted("cmd: buildah commit"); this.#logger.debug(await $$`buildah commit ${container} ${imageRef}`); const postCommit = performance.now(); + assertNotAborted("cmd: buildah rm"); this.#logger.debug(await $$`buildah rm ${container}`); const postRm = performance.now(); @@ -534,6 +552,7 @@ export class Checkpointer { } // Push checkpoint image + assertNotAborted("cmd: buildah push"); this.#logger.debug( await $$`buildah push --tls-verify=${String(this.registryTlsVerify)} ${imageRef}` ); @@ -559,9 +578,15 @@ export class Checkpointer { }, }; } catch (error) { + if (error instanceof CheckpointAbortError) { + this.#logger.error("Checkpoint canceled: CheckpointAbortError", { options, error }); + + return { success: false, reason: "CANCELED" }; + } + if (isExecaChildProcess(error)) { if (error.isCanceled) { - this.#logger.error("Checkpoint canceled", { options, error }); + this.#logger.error("Checkpoint canceled: ExecaChildProcess", { options, error }); return { success: false, reason: "CANCELED" }; } From fc40b7114af15aa6b0ef740b423a14706abd0748 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Tue, 17 Sep 2024 13:57:14 +0100 Subject: [PATCH 4/4] never abort running pipe --- apps/coordinator/src/checkpointer.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/coordinator/src/checkpointer.ts b/apps/coordinator/src/checkpointer.ts index 21cc9a554b..ad14d8145c 100644 --- a/apps/coordinator/src/checkpointer.ts +++ b/apps/coordinator/src/checkpointer.ts @@ -494,8 +494,8 @@ export class Checkpointer { const containerId = this.#logger.debug( // @ts-expect-error await $`crictl ps` - .pipeStdout($$({ stdin: "pipe" })`grep ${containterName}`) - .pipeStdout($$({ stdin: "pipe" })`cut -f1 ${"-d "}`) + .pipeStdout($({ stdin: "pipe" })`grep ${containterName}`) + .pipeStdout($({ stdin: "pipe" })`cut -f1 ${"-d "}`) ); if (!containerId.stdout) {