mark running jobs as failed on exc k8s restart (#3642)
This PR makes the following changes:

- The `ExecutionController` `_verifyExecution()` function now marks the execution status as failed if the current status is a running status (`'recovering'`, `'running'`, `'failing'`, `'paused'`, `'stopping'`). We can't start from a running state, so the job will be stopped, but previously the status was not updated. The assumption is that one of these statuses means the execution controller pod has crashed and k8s is restarting a new pod. We would never get into this situation in native clustering, as there are checks before the execution controller process is forked. See the sketch after this list for an illustration of the check.
- The `ExecutionService` `_finishExecution()` function also marks the execution status as failed if it is in a running status. It looks like this function is designed to run after an execution controller error and shutdown, so it's safe to assume a running status means there was an error and the execution controller shut down before the status was updated.

Ref: #2673
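Below is a minimal sketch of the idea described above: on startup, if the stored execution status is still a "running" status, the previous pod must have died, so mark the execution as failed rather than leaving a stale status behind. The names used here (`ExecutionStatus`, `ExecutionStore`, `verifyExecutionCanStart`, `_failureReason`) are hypothetical and do not mirror the actual Teraslice internals; only the running-status list and the fail-on-restart behavior come from this PR.

```typescript
// Hypothetical types for illustration only; not the real Teraslice API.
type ExecutionStatus =
    | 'pending' | 'scheduling' | 'initializing'
    | 'recovering' | 'running' | 'failing' | 'paused' | 'stopping'
    | 'completed' | 'stopped' | 'failed' | 'terminated';

// Statuses that indicate the execution controller was mid-run when the pod died.
const RUNNING_STATUSES: ReadonlyArray<ExecutionStatus> = [
    'recovering', 'running', 'failing', 'paused', 'stopping',
];

interface ExecutionStore {
    getStatus(exId: string): Promise<ExecutionStatus>;
    setStatus(
        exId: string,
        status: ExecutionStatus,
        metadata?: Record<string, unknown>
    ): Promise<void>;
}

/**
 * Called when a new execution controller pod starts up. If the stored status
 * is still a running status, assume the previous pod crashed and k8s restarted
 * it, so mark the execution as failed instead of leaving the old status in place.
 * Returns true only if the execution is in a state it can actually start from.
 */
async function verifyExecutionCanStart(store: ExecutionStore, exId: string): Promise<boolean> {
    const status = await store.getStatus(exId);

    if (RUNNING_STATUSES.includes(status)) {
        await store.setStatus(exId, 'failed', {
            _failureReason: `Execution ${exId} was restarted while in status "${status}"`,
        });
        return false;
    }

    // Only pre-run statuses are safe to start from; terminal statuses are rejected elsewhere.
    return status === 'pending' || status === 'scheduling' || status === 'initializing';
}
```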