From f2b2c306b5c5ea4cc080aa5e0095fa3b755d6ee1 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Mon, 20 May 2024 10:57:48 -0400 Subject: [PATCH] WX-1625 Quota retry (#7439) --- CHANGELOG.md | 6 ++ .../standardTestCases/quota_fail_retry.test | 21 ++++++ .../quota_fail_retry/quota_fail_retry.wdl | 36 ++++++++++ cromwell.example.backends/PAPIv2.conf | 5 +- docs/backends/Google.md | 16 +++++ .../papi_v2beta_provider_config.inc.conf | 2 + ...inesApiAsyncBackendJobExecutionActor.scala | 68 +++++++++++++++++-- ...linesApiBackendLifecycleActorFactory.scala | 5 +- .../common/PipelinesApiConfiguration.scala | 3 +- .../PipelinesApiConfigurationAttributes.scala | 8 +++ .../common/PreviousRetryReasons.scala | 14 ++-- .../pipelines/common/api/RunStatus.scala | 25 +++++++ ...ApiAsyncBackendJobExecutionActorSpec.scala | 7 ++ ...sApiBackendLifecycleActorFactorySpec.scala | 1 + ...esApiBackendCacheHitCopyingActorSpec.scala | 1 + .../v2beta/api/request/ErrorReporter.scala | 11 ++- .../api/request/GetRequestHandlerSpec.scala | 13 +++- 17 files changed, 226 insertions(+), 16 deletions(-) create mode 100644 centaur/src/main/resources/standardTestCases/quota_fail_retry.test create mode 100644 centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl diff --git a/CHANGELOG.md b/CHANGELOG.md index f17c5fd23a6..a16dc98132e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](https://cromwell.readthedocs.io/en/stable/backends/GCPBatch/) for more information. +### Improved handling of Life Sciences API quota errors + +Users reported cases where Life Sciences jobs failed due to insufficient quota, instead of queueing and waiting until +quota is available (which is the expected behavior). Cromwell will now retry under these conditions, which present with errors +such as "PAPI error code 9", "no available zones", and/or "quota too low". + ## 87 Release Notes ### GCP Batch diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry.test b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test new file mode 100644 index 00000000000..36e9565bb84 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test @@ -0,0 +1,21 @@ +name: quota_fail_retry +testFormat: workflowfailure +backends: [Papiv2] + +files { + workflow: quota_fail_retry/quota_fail_retry.wdl +} + +# Adapted from `preemptible_and_memory_retry.test`. +# I set `broad-dsde-cromwell-dev` to have super low CPU quota in `us-west3` (Salt Lake City) for this test +# This functionality is pretty married to PAPI, it doesn't run on `GCPBatch` backend. + +metadata { + workflowName: sleepy_sleep + status: Failed + "failures.0.message": "Workflow failed" + "failures.0.causedBy.0.message": "Task sleepy_sleep.sleep:NA:3 failed. The job was stopped before the command finished. PAPI error code 9. Could not start instance custom-12-11264 due to insufficient quota. Cromwell retries exhausted, task failed. Backend info: Execution failed: allocating: selecting resources: selecting region and zone: no available zones: us-west3: 12 CPUS (10/10 available) quota too low" + "sleepy_sleep.sleep.-1.1.executionStatus": "RetryableFailure" + "sleepy_sleep.sleep.-1.2.executionStatus": "RetryableFailure" + "sleepy_sleep.sleep.-1.3.executionStatus": "Failed" +} diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl new file mode 100644 index 00000000000..9eaf7185a4d --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl @@ -0,0 +1,36 @@ +version 1.0 + +workflow sleepy_sleep { + + input { + Int sleep_seconds = 180 + } + + call sleep { + input: sleep_seconds = sleep_seconds + } + +} + +task sleep { + + input { + Int sleep_seconds + } + + meta { + volatile: true + } + + # I set `broad-dsde-cromwell-dev` to have super low CPU quota in `us-west3` (Salt Lake City) for this test + runtime { + cpu: 12 + docker: "ubuntu:latest" + zones: "us-west3-a us-west3-b us-west3-c" + } + + command <<< + sleep ~{sleep_seconds}; + ls -la + >>> +} diff --git a/cromwell.example.backends/PAPIv2.conf b/cromwell.example.backends/PAPIv2.conf index 3fc34e79b24..f6be1fec897 100644 --- a/cromwell.example.backends/PAPIv2.conf +++ b/cromwell.example.backends/PAPIv2.conf @@ -4,7 +4,7 @@ # of cromwell.example.backends/cromwell.examples.conf in the root of the repository. # You should uncomment lines that you want to define, and read carefully to customize # the file. If you have any questions, please open an issue at -# https://broadworkbench.atlassian.net/projects/BA/issues +# https://github.com/broadinstitute/cromwell/issues # Documentation # https://cromwell.readthedocs.io/en/stable/backends/Google/ @@ -61,6 +61,9 @@ backend { # Defaults to 7 days; max 30 days # pipeline-timeout = 7 days + # Cromwell will retry jobs that fail with a quota signature, such as "PAPI error code 9", "no available zones", and/or "quota too low". + quota-attempts = 20 + genomics { # A reference to an auth defined in the `google` stanza at the top. This auth is used to create # Pipelines and manipulate auth JSONs. diff --git a/docs/backends/Google.md b/docs/backends/Google.md index cf694125de6..605df9975bc 100644 --- a/docs/backends/Google.md +++ b/docs/backends/Google.md @@ -238,6 +238,22 @@ backend.providers.PAPIv2.config { } ``` +#### Quota retry + +Typically, Life Sciences API is designed to accept all jobs sent to it and respond to quota exhaustion +by queueing jobs internally. However, there are cases when jobs fail instead of queueing, with messages such +as "PAPI error code 9", "no available zones", and/or "quota too low". + +The following configuration permits Cromwell to detect and retry these failures. Proactively monitoring +and raising quota is still recommended. + +```hocon +backend.providers.PAPIv2.config { + # Counts attempts (total jobs) not just retries after to the first + quota-attempts: 20 +} +``` + **Enabling FUSE capabilities** *This is a community contribution and not officially supported by the Cromwell team.* diff --git a/src/ci/resources/papi_v2beta_provider_config.inc.conf b/src/ci/resources/papi_v2beta_provider_config.inc.conf index b1d0ad8837f..e69eeaebc01 100644 --- a/src/ci/resources/papi_v2beta_provider_config.inc.conf +++ b/src/ci/resources/papi_v2beta_provider_config.inc.conf @@ -3,6 +3,8 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 +quota-attempts: 3 + genomics { auth = "service_account" location = "us-central1" diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala index f09ebd88bd6..ada935dd675 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala @@ -68,6 +68,7 @@ import wom.values._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps +import scala.util.control.NoStackTrace import scala.util.{Failure, Success, Try} object PipelinesApiAsyncBackendJobExecutionActor { @@ -103,6 +104,7 @@ object PipelinesApiAsyncBackendJobExecutionActor { } new Exception(s"Task $jobTag failed. $returnCodeMessage PAPI error code ${errorCode.getCode.value}. $message") + with NoStackTrace } } @@ -165,7 +167,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta override lazy val dockerImageUsed: Option[String] = Option(jobDockerImage) override lazy val preemptible: Boolean = previousRetryReasons match { - case Valid(PreviousRetryReasons(p, _)) => p < maxPreemption + case Valid(PreviousRetryReasons(p, _, _)) => p < maxPreemption case _ => false } @@ -891,6 +893,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta runStatus match { case preemptedStatus: RunStatus.Preempted if preemptible => handlePreemption(preemptedStatus, returnCode) case _: RunStatus.Cancelled => AbortedExecutionHandle + case quotaFailedStatus: RunStatus.QuotaFailed => handleQuotaFailedStatus(quotaFailedStatus) case failedStatus: RunStatus.UnsuccessfulRunStatus => handleFailedRunStatus(failedStatus, returnCode) case unknown => throw new RuntimeException( @@ -901,13 +904,23 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta } } - private def nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(p: Int, ur: Int): Seq[KvPair] = + /** + * + * @param p Preemption count + * @param ur Unexpected Retry count + * @param q Quota count + * @return KV sequence ready to be saved for the next attempt + */ + private def nextAttemptRetryKvPairs(p: Int, ur: Int, q: Int): Seq[KvPair] = Seq( KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.unexpectedRetryCountKey), ur.toString ), KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.preemptionCountKey), p.toString + ), + KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey), + q.toString ) ) @@ -917,11 +930,11 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta ): ExecutionHandle = { val msg = s"Retrying. $errorMessage" previousRetryReasons match { - case Valid(PreviousRetryReasons(p, ur)) => + case Valid(PreviousRetryReasons(p, ur, q)) => val thisUnexpectedRetry = ur + 1 if (thisUnexpectedRetry <= maxUnexpectedRetries) { val preemptionAndUnexpectedRetryCountsKvPairs = - nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(p, thisUnexpectedRetry) + nextAttemptRetryKvPairs(p, thisUnexpectedRetry, q) // Increment unexpected retry count and preemption count stays the same FailedRetryableExecutionHandle( StandardException(errorCode, msg, jobTag, jobReturnCode, standardPaths.error), @@ -944,19 +957,62 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta } } + private def handleQuotaFailedStatus(runStatus: RunStatus.QuotaFailed): ExecutionHandle = { + + val machineType = runStatus.machineType.map(mt => s"$mt ").getOrElse("") + val baseMsg = s"Could not start instance ${machineType}due to insufficient quota." + + previousRetryReasons match { + case Valid(PreviousRetryReasons(p, ur, q)) => + val thisQuotaFailure = q + 1 + val nextKvPairs = nextAttemptRetryKvPairs(p, ur, thisQuotaFailure) + + if (thisQuotaFailure < pipelinesConfiguration.papiAttributes.quotaAttempts) { + val retryFlavor = + s"$baseMsg Cromwell will automatically retry the task. Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, retryFlavor, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedRetryableExecutionHandle( + exception, + None, + Option(nextKvPairs) + ) + } else { + val nopeFlavor = + s"$baseMsg Cromwell retries exhausted, task failed. Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, nopeFlavor, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedNonRetryableExecutionHandle( + StandardException(runStatus.errorCode, nopeFlavor, jobTag, None, standardPaths.error), + None, + None + ) + } + case Invalid(_) => + val otherMsg = s"$baseMsg Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, otherMsg, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedNonRetryableExecutionHandle( + exception, + None, + None + ) + } + } + private def handlePreemption(runStatus: RunStatus.Preempted, jobReturnCode: Option[Int]): ExecutionHandle = { import common.numeric.IntegerUtil._ val errorCode: Status = runStatus.errorCode val prettyPrintedError: String = runStatus.prettyPrintedError previousRetryReasons match { - case Valid(PreviousRetryReasons(p, ur)) => + case Valid(PreviousRetryReasons(p, ur, q)) => val thisPreemption = p + 1 val taskName = s"${workflowDescriptor.id}:${call.localName}" val baseMsg = s"Task $taskName was preempted for the ${thisPreemption.toOrdinal} time." val preemptionAndUnexpectedRetryCountsKvPairs = - nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(thisPreemption, ur) + nextAttemptRetryKvPairs(thisPreemption, ur, q) if (thisPreemption < maxPreemption) { // Increment preemption count and unexpectedRetryCount stays the same val msg = diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala index 21bc2e7d2b6..ef4bc6df528 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala @@ -6,6 +6,7 @@ import com.typesafe.scalalogging.StrictLogging import cromwell.backend._ import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ preemptionCountKey, + quotaRetryCountKey, robustBuildAttributes, unexpectedRetryCountKey } @@ -32,7 +33,8 @@ abstract class PipelinesApiBackendLifecycleActorFactory( protected def requiredBackendSingletonActor(serviceRegistryActor: ActorRef): Props protected val jesConfiguration: PipelinesApiConfiguration - override val requestedKeyValueStoreKeys: Seq[String] = Seq(preemptionCountKey, unexpectedRetryCountKey) + override val requestedKeyValueStoreKeys: Seq[String] = + Seq(preemptionCountKey, unexpectedRetryCountKey, quotaRetryCountKey) protected val googleConfig: GoogleConfiguration = GoogleConfiguration(configurationDescriptor.globalConfig) @@ -125,6 +127,7 @@ abstract class PipelinesApiBackendLifecycleActorFactory( object PipelinesApiBackendLifecycleActorFactory extends StrictLogging { val preemptionCountKey = "PreemptionCount" val unexpectedRetryCountKey = "UnexpectedRetryCount" + val quotaRetryCountKey = "QuotaRetryCount" private[common] def robustBuildAttributes(buildAttributes: () => PipelinesApiConfigurationAttributes, maxAttempts: Int = 3, diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala index f279728d102..c1d273b4de8 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala @@ -3,7 +3,7 @@ package cromwell.backend.google.pipelines.common import com.typesafe.config.Config import cromwell.backend.BackendConfigurationDescriptor import cromwell.backend.google.pipelines.common.api.PipelinesApiFactoryInterface -import cromwell.backend.google.pipelines.common.authentication.{PipelinesApiAuths, PipelinesApiDockerCredentials} +import cromwell.backend.google.pipelines.common.authentication.PipelinesApiDockerCredentials import cromwell.cloudsupport.gcp.GoogleConfiguration import cromwell.core.BackendDockerConfiguration import net.ceedubs.ficus.Ficus._ @@ -17,7 +17,6 @@ class PipelinesApiConfiguration(val configurationDescriptor: BackendConfiguratio val papiAttributes: PipelinesApiConfigurationAttributes ) extends DefaultJsonProtocol { - val jesAuths: PipelinesApiAuths = papiAttributes.auths val root: String = configurationDescriptor.backendConfig.getString("root") val pipelineTimeout: FiniteDuration = papiAttributes.pipelineTimeout val runtimeConfig: Option[Config] = configurationDescriptor.backendRuntimeAttributesConfig diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala index 9f279c4350c..44c1078431a 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala @@ -51,6 +51,7 @@ case class PipelinesApiConfigurationAttributes( cacheHitDuplicationStrategy: PipelinesCacheHitDuplicationStrategy, requestWorkers: Int Refined Positive, pipelineTimeout: FiniteDuration, + quotaAttempts: Int, logFlushPeriod: Option[FiniteDuration], gcsTransferConfiguration: GcsTransferConfiguration, virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, @@ -86,6 +87,9 @@ object PipelinesApiConfigurationAttributes val checkpointingIntervalKey = "checkpointing-interval" + /** + * Used to screen & warn about unexpected keys + */ private val papiKeys = CommonBackendConfigurationAttributes.commonValidConfigurationAttributeKeys ++ Set( "project", "root", @@ -108,6 +112,7 @@ object PipelinesApiConfigurationAttributes "concurrent-job-limit", "request-workers", "pipeline-timeout", + "quota-attempts", "batch-requests.timeouts.read", "batch-requests.timeouts.connect", "default-runtime-attributes.bootDiskSizeGb", @@ -223,6 +228,8 @@ object PipelinesApiConfigurationAttributes val pipelineTimeout: FiniteDuration = backendConfig.getOrElse("pipeline-timeout", 7.days) + val quotaAttempts: Int = backendConfig.as[Option[Int]]("quota-attempts").getOrElse(20) + val logFlushPeriod: Option[FiniteDuration] = backendConfig.as[Option[FiniteDuration]]("log-flush-period") match { case Some(duration) if duration.isFinite => Option(duration) // "Inf" disables upload @@ -317,6 +324,7 @@ object PipelinesApiConfigurationAttributes cacheHitDuplicationStrategy = cacheHitDuplicationStrategy, requestWorkers = requestWorkers, pipelineTimeout = pipelineTimeout, + quotaAttempts = quotaAttempts, logFlushPeriod = logFlushPeriod, gcsTransferConfiguration = gcsTransferConfiguration, virtualPrivateCloudConfiguration = virtualPrivateCloudConfiguration, diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala index 50a372bc027..8797588da34 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala @@ -5,13 +5,14 @@ import cats.syntax.validated._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ preemptionCountKey, + quotaRetryCountKey, unexpectedRetryCountKey } import cromwell.services.keyvalue.KeyValueServiceActor._ import scala.util.{Failure, Success, Try} -case class PreviousRetryReasons(preempted: Int, unexpectedRetry: Int) +case class PreviousRetryReasons(preempted: Int, unexpectedRetry: Int, quota: Int) object PreviousRetryReasons { @@ -19,16 +20,21 @@ object PreviousRetryReasons { val validatedPreemptionCount = validatedKvResponse(prefetchedKvEntries.get(preemptionCountKey), preemptionCountKey) val validatedUnexpectedRetryCount = validatedKvResponse(prefetchedKvEntries.get(unexpectedRetryCountKey), unexpectedRetryCountKey) + val validatedQuotaRetryCount = validatedKvResponse(prefetchedKvEntries.get(quotaRetryCountKey), quotaRetryCountKey) - (validatedPreemptionCount, validatedUnexpectedRetryCount) mapN PreviousRetryReasons.apply + (validatedPreemptionCount, validatedUnexpectedRetryCount, validatedQuotaRetryCount) mapN PreviousRetryReasons.apply } - def apply(knownPreemptedCount: Int, knownUnexpectedRetryCount: Int, attempt: Int): PreviousRetryReasons = { + def apply(knownPreemptedCount: Int, + knownUnexpectedRetryCount: Int, + quotaCount: Int, + attempt: Int + ): PreviousRetryReasons = { // If we have anything unaccounted for, we can top up the unexpected retry count. // NB: 'attempt' is 1-indexed, so, magic number: // NB2: for sanity's sake, I won't let this unaccounted for drop below 0, just in case... val unaccountedFor = Math.max(attempt - 1 - knownPreemptedCount - knownUnexpectedRetryCount, 0) - PreviousRetryReasons(knownPreemptedCount, knownUnexpectedRetryCount + unaccountedFor) + PreviousRetryReasons(knownPreemptedCount, knownUnexpectedRetryCount + unaccountedFor, quotaCount) } private def validatedKvResponse(r: Option[KvResponse], fromKey: String): ErrorOr[Int] = r match { diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala index 933a6139344..0f2c178a28f 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala @@ -40,6 +40,16 @@ object RunStatus { } object UnsuccessfulRunStatus { + + // TODO: Dead code alert. Functional code only ever calls this with status `UNKNOWN`. + // + // Seems to have been replaced with: + // - cromwell.backend.google.pipelines.v2beta.api.request.ErrorReporter#toUnsuccessfulRunStatus + // - cromwell.backend.google.batch.models.RunStatus#fromJobStatus + // There are useful tests in `PipelinesApiAsyncBackendJobExecutionActorSpec` + // that test other things and happen to rely on this method, so eventually + // delete it with the rest of Life Sciences. GCP Batch does not use the + // `PipelinesApiAsyncBackendJobExecutionActor` at all. def apply(errorCode: Status, errorMessage: Option[String], eventList: Seq[ExecutionEvent], @@ -114,4 +124,19 @@ object RunStatus { ) extends UnsuccessfulRunStatus { override def toString = "Preempted" } + + /** + * This should NOT happen, but occasionally we see Life Sciences fail jobs with + * as FAILED_PRECONDITION and a message that contains "no available zones" or similar. (WX-1625) + */ + final case class QuotaFailed(errorCode: Status, + jesCode: Option[Int], + errorMessages: List[String], + eventList: Seq[ExecutionEvent], + machineType: Option[String], + zone: Option[String], + instanceName: Option[String] + ) extends UnsuccessfulRunStatus { + override def toString = "QuotaFailed" + } } diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala index 41e826a4dc2..194433b1162 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala @@ -258,6 +258,13 @@ class PipelinesApiAsyncBackendJobExecutionActorSpec PipelinesApiBackendLifecycleActorFactory.unexpectedRetryCountKey ), previousUnexpectedRetries.toString + ), + PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey -> KvPair( + ScopedKey(workflowDescriptor.id, + KvJobKey(key), + PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey + ), + 0.toString // We're testing this in other ways, fake it here. ) ) val prefetchedKvEntriesUpd = if (failedRetriesCountOpt.isEmpty) { diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala index f2ba65c65db..269ed4f362e 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala @@ -30,6 +30,7 @@ class PipelinesApiBackendLifecycleActorFactorySpec cacheHitDuplicationStrategy = null, requestWorkers = refineV[Positive](1).toOption.get, pipelineTimeout = 1 second, + quotaAttempts = 3, logFlushPeriod = Option(1 second), gcsTransferConfiguration = null, virtualPrivateCloudConfiguration = null, diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala index 368b631c6ce..cc0900acc85 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala @@ -429,6 +429,7 @@ class PipelinesApiBackendCacheHitCopyingActorSpec cacheHitDuplicationStrategy = CopyCachedOutputs, requestWorkers = refineMV[Positive](1), pipelineTimeout = null, + quotaAttempts = 3, logFlushPeriod = None, gcsTransferConfiguration = null, virtualPrivateCloudConfiguration = VirtualPrivateCloudConfiguration(None, None), diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala index 2329af494c0..77e53176df3 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala @@ -6,7 +6,14 @@ import com.google.api.services.lifesciences.v2beta.model._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.action.ActionLabels._ import cromwell.backend.google.pipelines.common.PipelinesApiAsyncBackendJobExecutionActor -import cromwell.backend.google.pipelines.common.api.RunStatus.{Cancelled, Failed, Preempted, UnsuccessfulRunStatus} +import cromwell.backend.google.pipelines.common.api.RunStatus.{ + Cancelled, + Failed, + Preempted, + QuotaFailed, + UnsuccessfulRunStatus +} +import cromwell.backend.google.pipelines.common.errors.isQuotaMessage import cromwell.backend.google.pipelines.v2beta.api.request.RequestHandler.logger import cromwell.core.{ExecutionEvent, WorkflowId} import io.grpc.{Status => GStatus} @@ -69,6 +76,8 @@ class ErrorReporter(machineType: Option[String], _.contains(PipelinesApiAsyncBackendJobExecutionActor.FailedV2Style) ) => Preempted.apply _ + case GStatus.FAILED_PRECONDITION if isQuotaMessage(error.getMessage) => + QuotaFailed.apply _ case GStatus.CANCELLED => Cancelled.apply _ case _ => Failed.apply _ } diff --git a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala index 7f0ae47135f..95fe61bf0c1 100644 --- a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala +++ b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala @@ -106,6 +106,17 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma |""".stripMargin, Failed(Status.UNAVAILABLE, None, Nil, Nil, None, None, None) ), + ("parse & classify quota fails", + """|{ + | "done": true, + | "error": { + | "code": 9, + | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: us-west3: 12 CPUS (10/10 available) quota too low" + | }, + | "name": "projects/1005074806481/locations/us-central1/operations/16958337426039071297" + |}""".stripMargin, + QuotaFailed(Status.FAILED_PRECONDITION, None, Nil, Nil, None, None, None) + ), ("check that we classify error code 10 as a preemption on a preemptible VM", """{ | "done": true, @@ -302,7 +313,7 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma None ) ), - // As of 2022-01 the zone `us-west3` in `broad-dsde-cromwell-dev` has its CPU quota purposely de-rated to 1 for testing + // As of 2022-01 the zone `us-west3` in `broad-dsde-cromwell-dev` has its CPU quota purposely de-rated to 10 for testing ("check that a job is AwaitingCloudQuota if its most recent event is quota exhaustion", """{ | "metadata": {