From 61b2205fbdd5827e0567ee83b5f9553170883c85 Mon Sep 17 00:00:00 2001 From: Paolo Di Tommaso Date: Mon, 15 Jul 2024 17:34:01 +0200 Subject: [PATCH] Improve Google Batch 5000x error class handling (#5141) Signed-off-by: Paolo Di Tommaso --- .../batch/GoogleBatchTaskHandler.groovy | 19 ++++++++++++------- .../batch/GoogleBatchTaskHandlerTest.groovy | 4 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy index f292b335e7..f8a10e9663 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy @@ -18,6 +18,7 @@ package nextflow.cloud.google.batch import java.nio.file.Path +import java.util.regex.Pattern import com.google.cloud.batch.v1.AllocationPolicy import com.google.cloud.batch.v1.ComputeResource @@ -37,6 +38,7 @@ import groovy.util.logging.Slf4j import nextflow.cloud.google.batch.client.BatchClient import nextflow.cloud.types.CloudMachineInfo import nextflow.cloud.types.PriceModel +import nextflow.exception.ProcessException import nextflow.exception.ProcessUnrecoverableException import nextflow.executor.BashWrapperBuilder import nextflow.executor.res.DiskResource @@ -58,6 +60,8 @@ import nextflow.trace.TraceRecord @CompileStatic class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { + private static Pattern EXIT_CODE_REGEX = ~/exit code 500(\d\d)/ + private GoogleBatchExecutor executor private Path exitFile @@ -480,10 +484,10 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { if( state in COMPLETED ) { log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - terminated job=$jobId; task=$taskId; state=$state" // finalize the task - task.exitStatus = getJobExitCode() - if( task.exitStatus == null ) - task.exitStatus = readExitFile() + task.exitStatus = readExitFile() if( state == 'FAILED' ) { + if( task.exitStatus == Integer.MAX_VALUE ) + task.error = getJobError() task.stdout = executor.logging.stdout(uid, taskId) ?: outputFile task.stderr = executor.logging.stderr(uid, taskId) ?: errorFile } @@ -498,15 +502,16 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { return false } - protected Integer getJobExitCode() { + protected Throwable getJobError() { try { final status = client.getTaskStatus(jobId, taskId) final eventsCount = status.getStatusEventsCount() final lastEvent = eventsCount > 0 ? status.getStatusEvents(eventsCount - 1) : null - log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - last event: ${lastEvent}" + log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - last event: ${lastEvent}; exit code: ${lastEvent?.taskExecution?.exitCode}" - if( lastEvent?.getDescription()?.contains('due to Spot VM preemption with exit code 50001') ) { - return 50001 + final error = lastEvent?.description + if( error && EXIT_CODE_REGEX.matcher(error).find() ) { + return new ProcessException(error) } } catch (Throwable t) { diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy index 005a91d4ca..b54a5ff7a8 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy @@ -480,8 +480,8 @@ class GoogleBatchTaskHandlerTest extends Specification { makeTaskStatus('Task succeeded') ] then: - handler.getJobExitCode() == 50001 - handler.getJobExitCode() == null + handler.getJobError().message == "Task failed due to Spot VM preemption with exit code 50001." + handler.getJobError() == null } def 'should find best instance type' () {