From 83e702bda4a061506c9733ff90095d9678e8c259 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 17:42:34 -0400 Subject: [PATCH 01/15] Stuff I know I want --- CHANGELOG.md | 6 ++ cromwell.example.backends/PAPIv2.conf | 5 +- docs/backends/Google.md | 15 +++++ .../gcp_batch_provider_config.inc.conf | 2 + .../papi_v2beta_provider_config.inc.conf | 2 + ...inesApiAsyncBackendJobExecutionActor.scala | 67 +++++++++++++++++-- ...linesApiBackendLifecycleActorFactory.scala | 5 +- .../common/PipelinesApiConfiguration.scala | 3 +- .../common/PreviousRetryReasons.scala | 12 ++-- .../pipelines/common/api/RunStatus.scala | 15 +++++ .../v2beta/api/request/ErrorReporter.scala | 5 +- 11 files changed, 119 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc204172689..f758f131e11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](https://cromwell.readthedocs.io/en/stable/backends/GCPBatch/) for more information. +### Improved handling of Life Sciences API quota errors + +Users reported cases where Life Sciences jobs failed due to insufficient quota, instead of queueing them and waiting until +quota freed up (which is the expected behavior). Cromwell will now retry under these conditions, which present with errors +such as "PAPI error code 9", "no available zones", and/or "quota too low". + ## 87 Release Notes ### `upgrade` command removed from Womtool diff --git a/cromwell.example.backends/PAPIv2.conf b/cromwell.example.backends/PAPIv2.conf index 3fc34e79b24..b83efd6bc82 100644 --- a/cromwell.example.backends/PAPIv2.conf +++ b/cromwell.example.backends/PAPIv2.conf @@ -4,7 +4,7 @@ # of cromwell.example.backends/cromwell.examples.conf in the root of the repository. # You should uncomment lines that you want to define, and read carefully to customize # the file. If you have any questions, please open an issue at -# https://broadworkbench.atlassian.net/projects/BA/issues +# https://github.com/broadinstitute/cromwell/issues # Documentation # https://cromwell.readthedocs.io/en/stable/backends/Google/ @@ -61,6 +61,9 @@ backend { # Defaults to 7 days; max 30 days # pipeline-timeout = 7 days + # Cromwell will retry jobs that fail with a quota signature, such as "PAPI error code 9", "no available zones", and/or "quota too low". + quota-retry-attempts = 20 + genomics { # A reference to an auth defined in the `google` stanza at the top. This auth is used to create # Pipelines and manipulate auth JSONs. diff --git a/docs/backends/Google.md b/docs/backends/Google.md index cf694125de6..90d26b585ac 100644 --- a/docs/backends/Google.md +++ b/docs/backends/Google.md @@ -238,6 +238,21 @@ backend.providers.PAPIv2.config { } ``` +#### Quota retry + +Typically, Life Sciences API is designed to accept all jobs sent to it and respond to quota exhaustion +by queueing jobs internally. However, there are cases when jobs fail instead of queueing, with messages such +as "PAPI error code 9", "no available zones", and/or "quota too low". + +The following configuration permits Cromwell to detect and retry these failures. Proactively monitoring +and raising quota is still recommended. + +```hocon +backend.providers.PAPIv2.config { + quota-retry-attempts: 20 +} +``` + **Enabling FUSE capabilities** *This is a community contribution and not officially supported by the Cromwell team.* diff --git a/src/ci/resources/gcp_batch_provider_config.inc.conf b/src/ci/resources/gcp_batch_provider_config.inc.conf index 9277e8e7678..63c698c987a 100644 --- a/src/ci/resources/gcp_batch_provider_config.inc.conf +++ b/src/ci/resources/gcp_batch_provider_config.inc.conf @@ -3,6 +3,8 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 +quota-retry-attempts: 3 + batch { auth = "service_account" location = "us-central1" diff --git a/src/ci/resources/papi_v2beta_provider_config.inc.conf b/src/ci/resources/papi_v2beta_provider_config.inc.conf index b1d0ad8837f..873cdf29c79 100644 --- a/src/ci/resources/papi_v2beta_provider_config.inc.conf +++ b/src/ci/resources/papi_v2beta_provider_config.inc.conf @@ -3,6 +3,8 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 +quota-retry-attempts: 3 + genomics { auth = "service_account" location = "us-central1" diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala index f09ebd88bd6..db7f1b951d6 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala @@ -68,6 +68,7 @@ import wom.values._ import scala.concurrent.Future import scala.concurrent.duration._ import scala.language.postfixOps +import scala.util.control.NoStackTrace import scala.util.{Failure, Success, Try} object PipelinesApiAsyncBackendJobExecutionActor { @@ -102,7 +103,7 @@ object PipelinesApiAsyncBackendJobExecutionActor { case None => "The job was stopped before the command finished." } - new Exception(s"Task $jobTag failed. $returnCodeMessage PAPI error code ${errorCode.getCode.value}. $message") + new Exception(s"Task $jobTag failed. $returnCodeMessage PAPI error code ${errorCode.getCode.value}. $message") with NoStackTrace } } @@ -165,7 +166,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta override lazy val dockerImageUsed: Option[String] = Option(jobDockerImage) override lazy val preemptible: Boolean = previousRetryReasons match { - case Valid(PreviousRetryReasons(p, _)) => p < maxPreemption + case Valid(PreviousRetryReasons(p, _, _)) => p < maxPreemption case _ => false } @@ -891,6 +892,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta runStatus match { case preemptedStatus: RunStatus.Preempted if preemptible => handlePreemption(preemptedStatus, returnCode) case _: RunStatus.Cancelled => AbortedExecutionHandle + case quotaFailedStatus: RunStatus.QuotaFailed => handleQuotaFailedStatus(quotaFailedStatus) case failedStatus: RunStatus.UnsuccessfulRunStatus => handleFailedRunStatus(failedStatus, returnCode) case unknown => throw new RuntimeException( @@ -901,13 +903,23 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta } } - private def nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(p: Int, ur: Int): Seq[KvPair] = + /** + * + * @param p Preemption count + * @param ur Unexpected Retry count + * @param q Quota count + * @return KV sequence ready to be saved for the next attempt + */ + private def nextAttemptRetryKvPairs(p: Int, ur: Int, q: Int): Seq[KvPair] = Seq( KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.unexpectedRetryCountKey), ur.toString ), KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.preemptionCountKey), p.toString + ), + KvPair(ScopedKey(workflowId, futureKvJobKey, PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey), + q.toString ) ) @@ -917,11 +929,11 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta ): ExecutionHandle = { val msg = s"Retrying. $errorMessage" previousRetryReasons match { - case Valid(PreviousRetryReasons(p, ur)) => + case Valid(PreviousRetryReasons(p, ur, q)) => val thisUnexpectedRetry = ur + 1 if (thisUnexpectedRetry <= maxUnexpectedRetries) { val preemptionAndUnexpectedRetryCountsKvPairs = - nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(p, thisUnexpectedRetry) + nextAttemptRetryKvPairs(p, thisUnexpectedRetry, q) // Increment unexpected retry count and preemption count stays the same FailedRetryableExecutionHandle( StandardException(errorCode, msg, jobTag, jobReturnCode, standardPaths.error), @@ -944,19 +956,60 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta } } + private def handleQuotaFailedStatus(runStatus: RunStatus.QuotaFailed): ExecutionHandle = { + + val machineType = runStatus.machineType.map(mt => s"$mt ").getOrElse("") + val baseMsg = s"Could not start instance ${machineType}due to insufficient quota." + + previousRetryReasons match { + case Valid(PreviousRetryReasons(p, ur, q)) => + val thisQuotaFailure = q + 1 + val nextKvPairs = nextAttemptRetryKvPairs(p, ur, thisQuotaFailure) + + if (thisQuotaFailure < pipelinesConfiguration.papiAttributes.quotaRetries) { + val retryFlavor = s"$baseMsg Cromwell will automatically retry the task. Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, retryFlavor, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedRetryableExecutionHandle( + exception, + None, + Option(nextKvPairs) + ) + } else { + val nopeFlavor = s"$baseMsg Cromwell retries exhausted, task failed. Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, nopeFlavor, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedNonRetryableExecutionHandle( + StandardException(runStatus.errorCode, nopeFlavor, jobTag, None, standardPaths.error), + None, + None + ) + } + case Invalid(_) => + val otherMsg = s"$baseMsg Backend info: ${runStatus.prettyPrintedError}" + val exception = StandardException(runStatus.errorCode, otherMsg, jobTag, None, standardPaths.error) + jobLogger.info(exception.getMessage) + FailedNonRetryableExecutionHandle( + exception, + None, + None + ) + } + } + private def handlePreemption(runStatus: RunStatus.Preempted, jobReturnCode: Option[Int]): ExecutionHandle = { import common.numeric.IntegerUtil._ val errorCode: Status = runStatus.errorCode val prettyPrintedError: String = runStatus.prettyPrintedError previousRetryReasons match { - case Valid(PreviousRetryReasons(p, ur)) => + case Valid(PreviousRetryReasons(p, ur, q)) => val thisPreemption = p + 1 val taskName = s"${workflowDescriptor.id}:${call.localName}" val baseMsg = s"Task $taskName was preempted for the ${thisPreemption.toOrdinal} time." val preemptionAndUnexpectedRetryCountsKvPairs = - nextAttemptPreemptedAndUnexpectedRetryCountsToKvPairs(thisPreemption, ur) + nextAttemptRetryKvPairs(thisPreemption, ur, q) if (thisPreemption < maxPreemption) { // Increment preemption count and unexpectedRetryCount stays the same val msg = diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala index 21bc2e7d2b6..6ed6bb52d4a 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala @@ -32,7 +32,7 @@ abstract class PipelinesApiBackendLifecycleActorFactory( protected def requiredBackendSingletonActor(serviceRegistryActor: ActorRef): Props protected val jesConfiguration: PipelinesApiConfiguration - override val requestedKeyValueStoreKeys: Seq[String] = Seq(preemptionCountKey, unexpectedRetryCountKey) + override val requestedKeyValueStoreKeys: Seq[String] = Seq(preemptionCountKey, unexpectedRetryCountKey, quotaRetryCountKey) protected val googleConfig: GoogleConfiguration = GoogleConfiguration(configurationDescriptor.globalConfig) @@ -123,8 +123,9 @@ abstract class PipelinesApiBackendLifecycleActorFactory( } object PipelinesApiBackendLifecycleActorFactory extends StrictLogging { - val preemptionCountKey = "PreemptionCount" + val preemptionCountKey = "PreemptionCount" val unexpectedRetryCountKey = "UnexpectedRetryCount" + val quotaRetryCountKey = "QuotaRetryCount" private[common] def robustBuildAttributes(buildAttributes: () => PipelinesApiConfigurationAttributes, maxAttempts: Int = 3, diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala index f279728d102..c1d273b4de8 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfiguration.scala @@ -3,7 +3,7 @@ package cromwell.backend.google.pipelines.common import com.typesafe.config.Config import cromwell.backend.BackendConfigurationDescriptor import cromwell.backend.google.pipelines.common.api.PipelinesApiFactoryInterface -import cromwell.backend.google.pipelines.common.authentication.{PipelinesApiAuths, PipelinesApiDockerCredentials} +import cromwell.backend.google.pipelines.common.authentication.PipelinesApiDockerCredentials import cromwell.cloudsupport.gcp.GoogleConfiguration import cromwell.core.BackendDockerConfiguration import net.ceedubs.ficus.Ficus._ @@ -17,7 +17,6 @@ class PipelinesApiConfiguration(val configurationDescriptor: BackendConfiguratio val papiAttributes: PipelinesApiConfigurationAttributes ) extends DefaultJsonProtocol { - val jesAuths: PipelinesApiAuths = papiAttributes.auths val root: String = configurationDescriptor.backendConfig.getString("root") val pipelineTimeout: FiniteDuration = papiAttributes.pipelineTimeout val runtimeConfig: Option[Config] = configurationDescriptor.backendRuntimeAttributesConfig diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala index 50a372bc027..71a5ca2e8d0 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala @@ -5,13 +5,14 @@ import cats.syntax.validated._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ preemptionCountKey, - unexpectedRetryCountKey + unexpectedRetryCountKey, + quotaRetryCountKey } import cromwell.services.keyvalue.KeyValueServiceActor._ import scala.util.{Failure, Success, Try} -case class PreviousRetryReasons(preempted: Int, unexpectedRetry: Int) +case class PreviousRetryReasons(preempted: Int, unexpectedRetry: Int, quota: Int) object PreviousRetryReasons { @@ -19,16 +20,17 @@ object PreviousRetryReasons { val validatedPreemptionCount = validatedKvResponse(prefetchedKvEntries.get(preemptionCountKey), preemptionCountKey) val validatedUnexpectedRetryCount = validatedKvResponse(prefetchedKvEntries.get(unexpectedRetryCountKey), unexpectedRetryCountKey) + val validatedQuotaRetryCount = validatedKvResponse(prefetchedKvEntries.get(quotaRetryCountKey), quotaRetryCountKey) - (validatedPreemptionCount, validatedUnexpectedRetryCount) mapN PreviousRetryReasons.apply + (validatedPreemptionCount, validatedUnexpectedRetryCount, validatedQuotaRetryCount) mapN PreviousRetryReasons.apply } - def apply(knownPreemptedCount: Int, knownUnexpectedRetryCount: Int, attempt: Int): PreviousRetryReasons = { + def apply(knownPreemptedCount: Int, knownUnexpectedRetryCount: Int, quotaCount: Int, attempt: Int): PreviousRetryReasons = { // If we have anything unaccounted for, we can top up the unexpected retry count. // NB: 'attempt' is 1-indexed, so, magic number: // NB2: for sanity's sake, I won't let this unaccounted for drop below 0, just in case... val unaccountedFor = Math.max(attempt - 1 - knownPreemptedCount - knownUnexpectedRetryCount, 0) - PreviousRetryReasons(knownPreemptedCount, knownUnexpectedRetryCount + unaccountedFor) + PreviousRetryReasons(knownPreemptedCount, knownUnexpectedRetryCount + unaccountedFor, quotaCount) } private def validatedKvResponse(r: Option[KvResponse], fromKey: String): ErrorOr[Int] = r match { diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala index 933a6139344..b8e14b4c336 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala @@ -114,4 +114,19 @@ object RunStatus { ) extends UnsuccessfulRunStatus { override def toString = "Preempted" } + + /** + * This should NOT happen, but occasionally we see Life Sciences fail jobs with + * as FAILED_PRECONDITION and a message that contains "no available zones" or similar. (WX-1625) + */ + final case class QuotaFailed(errorCode: Status, + jesCode: Option[Int], + errorMessages: List[String], + eventList: Seq[ExecutionEvent], + machineType: Option[String], + zone: Option[String], + instanceName: Option[String] + ) extends UnsuccessfulRunStatus { + override def toString = "QuotaFailed" + } } diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala index 2329af494c0..e68dbd4287a 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala @@ -6,7 +6,8 @@ import com.google.api.services.lifesciences.v2beta.model._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.action.ActionLabels._ import cromwell.backend.google.pipelines.common.PipelinesApiAsyncBackendJobExecutionActor -import cromwell.backend.google.pipelines.common.api.RunStatus.{Cancelled, Failed, Preempted, UnsuccessfulRunStatus} +import cromwell.backend.google.pipelines.common.api.RunStatus.{Cancelled, Failed, Preempted, QuotaFailed, UnsuccessfulRunStatus} +import cromwell.backend.google.pipelines.common.errors.isQuotaMessage import cromwell.backend.google.pipelines.v2beta.api.request.RequestHandler.logger import cromwell.core.{ExecutionEvent, WorkflowId} import io.grpc.{Status => GStatus} @@ -69,6 +70,8 @@ class ErrorReporter(machineType: Option[String], _.contains(PipelinesApiAsyncBackendJobExecutionActor.FailedV2Style) ) => Preempted.apply _ + case GStatus.FAILED_PRECONDITION if isQuotaMessage(error.getMessage) => + QuotaFailed.apply _ case GStatus.CANCELLED => Cancelled.apply _ case _ => Failed.apply _ } From 12f7a0729d8f02f56a1ec8d918825a0d8e4ee182 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 18:05:14 -0400 Subject: [PATCH 02/15] I want that too --- .../PipelinesApiBackendLifecycleActorFactory.scala | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala index 6ed6bb52d4a..dd4a02f7428 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala @@ -4,16 +4,9 @@ import akka.actor.{ActorRef, Props} import com.google.api.client.util.ExponentialBackOff import com.typesafe.scalalogging.StrictLogging import cromwell.backend._ -import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ - preemptionCountKey, - robustBuildAttributes, - unexpectedRetryCountKey -} +import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{preemptionCountKey, quotaRetryCountKey, robustBuildAttributes, unexpectedRetryCountKey} import cromwell.backend.google.pipelines.common.authentication.PipelinesApiDockerCredentials -import cromwell.backend.google.pipelines.common.callcaching.{ - PipelinesApiBackendCacheHitCopyingActor, - PipelinesApiBackendFileHashingActor -} +import cromwell.backend.google.pipelines.common.callcaching.{PipelinesApiBackendCacheHitCopyingActor, PipelinesApiBackendFileHashingActor} import cromwell.backend.standard._ import cromwell.backend.standard.callcaching.{StandardCacheHitCopyingActor, StandardFileHashingActor} import cromwell.cloudsupport.gcp.GoogleConfiguration From 4560c1c81d5c0b3029e3b50adafa3d91af1fb1c1 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 18:23:32 -0400 Subject: [PATCH 03/15] Naming --- cromwell.example.backends/PAPIv2.conf | 2 +- docs/backends/Google.md | 3 ++- src/ci/resources/gcp_batch_provider_config.inc.conf | 2 +- src/ci/resources/papi_v2beta_provider_config.inc.conf | 2 +- .../common/PipelinesApiAsyncBackendJobExecutionActor.scala | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cromwell.example.backends/PAPIv2.conf b/cromwell.example.backends/PAPIv2.conf index b83efd6bc82..f6be1fec897 100644 --- a/cromwell.example.backends/PAPIv2.conf +++ b/cromwell.example.backends/PAPIv2.conf @@ -62,7 +62,7 @@ backend { # pipeline-timeout = 7 days # Cromwell will retry jobs that fail with a quota signature, such as "PAPI error code 9", "no available zones", and/or "quota too low". - quota-retry-attempts = 20 + quota-attempts = 20 genomics { # A reference to an auth defined in the `google` stanza at the top. This auth is used to create diff --git a/docs/backends/Google.md b/docs/backends/Google.md index 90d26b585ac..605df9975bc 100644 --- a/docs/backends/Google.md +++ b/docs/backends/Google.md @@ -249,7 +249,8 @@ and raising quota is still recommended. ```hocon backend.providers.PAPIv2.config { - quota-retry-attempts: 20 + # Counts attempts (total jobs) not just retries after to the first + quota-attempts: 20 } ``` diff --git a/src/ci/resources/gcp_batch_provider_config.inc.conf b/src/ci/resources/gcp_batch_provider_config.inc.conf index 63c698c987a..7445ea2b89e 100644 --- a/src/ci/resources/gcp_batch_provider_config.inc.conf +++ b/src/ci/resources/gcp_batch_provider_config.inc.conf @@ -3,7 +3,7 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 -quota-retry-attempts: 3 +quota-attempts: 3 batch { auth = "service_account" diff --git a/src/ci/resources/papi_v2beta_provider_config.inc.conf b/src/ci/resources/papi_v2beta_provider_config.inc.conf index 873cdf29c79..e69eeaebc01 100644 --- a/src/ci/resources/papi_v2beta_provider_config.inc.conf +++ b/src/ci/resources/papi_v2beta_provider_config.inc.conf @@ -3,7 +3,7 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 -quota-retry-attempts: 3 +quota-attempts: 3 genomics { auth = "service_account" diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala index db7f1b951d6..06f3cb479c9 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala @@ -966,7 +966,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta val thisQuotaFailure = q + 1 val nextKvPairs = nextAttemptRetryKvPairs(p, ur, thisQuotaFailure) - if (thisQuotaFailure < pipelinesConfiguration.papiAttributes.quotaRetries) { + if (thisQuotaFailure < pipelinesConfiguration.papiAttributes.quotaAttempts) { val retryFlavor = s"$baseMsg Cromwell will automatically retry the task. Backend info: ${runStatus.prettyPrintedError}" val exception = StandardException(runStatus.errorCode, retryFlavor, jobTag, None, standardPaths.error) jobLogger.info(exception.getMessage) From 74210d88867aa6bbcd4a347b3727d85cfd6619af Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 19:24:04 -0400 Subject: [PATCH 04/15] Finish rename --- .../common/PipelinesApiConfigurationAttributes.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala index 9f279c4350c..44c1078431a 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiConfigurationAttributes.scala @@ -51,6 +51,7 @@ case class PipelinesApiConfigurationAttributes( cacheHitDuplicationStrategy: PipelinesCacheHitDuplicationStrategy, requestWorkers: Int Refined Positive, pipelineTimeout: FiniteDuration, + quotaAttempts: Int, logFlushPeriod: Option[FiniteDuration], gcsTransferConfiguration: GcsTransferConfiguration, virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, @@ -86,6 +87,9 @@ object PipelinesApiConfigurationAttributes val checkpointingIntervalKey = "checkpointing-interval" + /** + * Used to screen & warn about unexpected keys + */ private val papiKeys = CommonBackendConfigurationAttributes.commonValidConfigurationAttributeKeys ++ Set( "project", "root", @@ -108,6 +112,7 @@ object PipelinesApiConfigurationAttributes "concurrent-job-limit", "request-workers", "pipeline-timeout", + "quota-attempts", "batch-requests.timeouts.read", "batch-requests.timeouts.connect", "default-runtime-attributes.bootDiskSizeGb", @@ -223,6 +228,8 @@ object PipelinesApiConfigurationAttributes val pipelineTimeout: FiniteDuration = backendConfig.getOrElse("pipeline-timeout", 7.days) + val quotaAttempts: Int = backendConfig.as[Option[Int]]("quota-attempts").getOrElse(20) + val logFlushPeriod: Option[FiniteDuration] = backendConfig.as[Option[FiniteDuration]]("log-flush-period") match { case Some(duration) if duration.isFinite => Option(duration) // "Inf" disables upload @@ -317,6 +324,7 @@ object PipelinesApiConfigurationAttributes cacheHitDuplicationStrategy = cacheHitDuplicationStrategy, requestWorkers = requestWorkers, pipelineTimeout = pipelineTimeout, + quotaAttempts = quotaAttempts, logFlushPeriod = logFlushPeriod, gcsTransferConfiguration = gcsTransferConfiguration, virtualPrivateCloudConfiguration = virtualPrivateCloudConfiguration, From 5e0bf4f89970196e1c7d602509d26c4012a9e6e7 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 19:37:42 -0400 Subject: [PATCH 05/15] Fix existing tests --- .../common/PipelinesApiBackendLifecycleActorFactorySpec.scala | 1 + .../PipelinesApiBackendCacheHitCopyingActorSpec.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala index f2ba65c65db..269ed4f362e 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactorySpec.scala @@ -30,6 +30,7 @@ class PipelinesApiBackendLifecycleActorFactorySpec cacheHitDuplicationStrategy = null, requestWorkers = refineV[Positive](1).toOption.get, pipelineTimeout = 1 second, + quotaAttempts = 3, logFlushPeriod = Option(1 second), gcsTransferConfiguration = null, virtualPrivateCloudConfiguration = null, diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala index 368b631c6ce..cc0900acc85 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/callcaching/PipelinesApiBackendCacheHitCopyingActorSpec.scala @@ -429,6 +429,7 @@ class PipelinesApiBackendCacheHitCopyingActorSpec cacheHitDuplicationStrategy = CopyCachedOutputs, requestWorkers = refineMV[Positive](1), pipelineTimeout = null, + quotaAttempts = 3, logFlushPeriod = None, gcsTransferConfiguration = null, virtualPrivateCloudConfiguration = VirtualPrivateCloudConfiguration(None, None), From b99bf2c185e80e330ad4eac8f162d1c254212630 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 19:37:57 -0400 Subject: [PATCH 06/15] New test --- .../v2beta/api/request/GetRequestHandlerSpec.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala index 7f0ae47135f..0befef54fa6 100644 --- a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala +++ b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala @@ -106,6 +106,17 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma |""".stripMargin, Failed(Status.UNAVAILABLE, None, Nil, Nil, None, None, None) ), + ("parse & classify quota fails", + """|{ + | "done": true, + | "error": { + | "code": 9, + | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" + | }, + | "name": "projects/1005074806481/locations/us-central1/operations/16958337426039071297" + |}""".stripMargin, + QuotaFailed(Status.FAILED_PRECONDITION, None, Nil, Nil, None, None, None) + ), ("check that we classify error code 10 as a preemption on a preemptible VM", """{ | "done": true, From f51dca4638ad808098f5d78b39e1e7dfee265e53 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 20:19:50 -0400 Subject: [PATCH 07/15] This is a job for another day --- .../google/pipelines/common/api/RunStatus.scala | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala index b8e14b4c336..0f2c178a28f 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala @@ -40,6 +40,16 @@ object RunStatus { } object UnsuccessfulRunStatus { + + // TODO: Dead code alert. Functional code only ever calls this with status `UNKNOWN`. + // + // Seems to have been replaced with: + // - cromwell.backend.google.pipelines.v2beta.api.request.ErrorReporter#toUnsuccessfulRunStatus + // - cromwell.backend.google.batch.models.RunStatus#fromJobStatus + // There are useful tests in `PipelinesApiAsyncBackendJobExecutionActorSpec` + // that test other things and happen to rely on this method, so eventually + // delete it with the rest of Life Sciences. GCP Batch does not use the + // `PipelinesApiAsyncBackendJobExecutionActor` at all. def apply(errorCode: Status, errorMessage: Option[String], eventList: Seq[ExecutionEvent], From eb9e50d6c2ba4b660aa43e8454ae0e6c637f06a2 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 20:55:42 -0400 Subject: [PATCH 08/15] Integration test --- .../standardTestCases/quota_fail_retry.test | 20 ++++++++++ .../quota_fail_retry/quota_fail_retry.wdl | 38 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 centaur/src/main/resources/standardTestCases/quota_fail_retry.test create mode 100644 centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry.test b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test new file mode 100644 index 00000000000..f351d5c3f11 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test @@ -0,0 +1,20 @@ +name: quota_fail_retry +testFormat: workflowfailure +backends: [Papiv2] + +files { + workflow: quota_fail_retry/quota_fail_retry.wdl +} + +# Adapted from `preemptible_and_memory_retry.test`. +# This functionality is pretty married to PAPI, it doesn't run on `GCPBatch` backend. + +metadata { + workflowName: sleepy_sleep + status: Failed + "failures.0.message": "Workflow failed" + "failures.0.causedBy.0.message": "Task sleepy_sleep.sleep:NA:3 failed. The job was stopped before the command finished. PAPI error code 9. Could not start instance custom-12-11264 due to insufficient quota. Cromwell retries exhausted, task failed. Backend info: Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" + "sleepy_sleep.sleep.-1.1.executionStatus": "RetryableFailure" + "sleepy_sleep.sleep.-1.2.executionStatus": "RetryableFailure" + "sleepy_sleep.sleep.-1.3.executionStatus": "Failed" +} diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl new file mode 100644 index 00000000000..b3e72ebbce7 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl @@ -0,0 +1,38 @@ +version 1.0 + +workflow sleepy_sleep { + + input { + Int sleep_seconds = 180 + } + + call sleep { + input: sleep_seconds = sleep_seconds + } + +} + +task sleep { + + input { + Int sleep_seconds + } + + meta { + volatile: true + } + + # `northamerica-northeast1` is Montreal, which is is geographically close (low latency), + # uses 100% carbon-free energy, and best of all... we don't use for anything else, + # so I can drop its quota to something ridiculousy low in `broad-dsde-cromwell-dev` + runtime { + cpu: 12 + docker: "ubuntu:latest" + zones: "northamerica-northeast1-a northamerica-northeast1-b northamerica-northeast1-c" + } + + command <<< + sleep ~{sleep_seconds}; + ls -la + >>> +} From ee8009308622c855895b9bc2b18da3a10c824e61 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Wed, 15 May 2024 20:59:02 -0400 Subject: [PATCH 09/15] `scalafmtAll` --- ...inesApiAsyncBackendJobExecutionActor.scala | 9 ++++++--- ...linesApiBackendLifecycleActorFactory.scala | 19 ++++++++++++++----- .../common/PreviousRetryReasons.scala | 10 +++++++--- .../v2beta/api/request/ErrorReporter.scala | 8 +++++++- .../api/request/GetRequestHandlerSpec.scala | 18 +++++++++--------- 5 files changed, 43 insertions(+), 21 deletions(-) diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala index 06f3cb479c9..ada935dd675 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala @@ -103,7 +103,8 @@ object PipelinesApiAsyncBackendJobExecutionActor { case None => "The job was stopped before the command finished." } - new Exception(s"Task $jobTag failed. $returnCodeMessage PAPI error code ${errorCode.getCode.value}. $message") with NoStackTrace + new Exception(s"Task $jobTag failed. $returnCodeMessage PAPI error code ${errorCode.getCode.value}. $message") + with NoStackTrace } } @@ -967,7 +968,8 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta val nextKvPairs = nextAttemptRetryKvPairs(p, ur, thisQuotaFailure) if (thisQuotaFailure < pipelinesConfiguration.papiAttributes.quotaAttempts) { - val retryFlavor = s"$baseMsg Cromwell will automatically retry the task. Backend info: ${runStatus.prettyPrintedError}" + val retryFlavor = + s"$baseMsg Cromwell will automatically retry the task. Backend info: ${runStatus.prettyPrintedError}" val exception = StandardException(runStatus.errorCode, retryFlavor, jobTag, None, standardPaths.error) jobLogger.info(exception.getMessage) FailedRetryableExecutionHandle( @@ -976,7 +978,8 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta Option(nextKvPairs) ) } else { - val nopeFlavor = s"$baseMsg Cromwell retries exhausted, task failed. Backend info: ${runStatus.prettyPrintedError}" + val nopeFlavor = + s"$baseMsg Cromwell retries exhausted, task failed. Backend info: ${runStatus.prettyPrintedError}" val exception = StandardException(runStatus.errorCode, nopeFlavor, jobTag, None, standardPaths.error) jobLogger.info(exception.getMessage) FailedNonRetryableExecutionHandle( diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala index dd4a02f7428..ef4bc6df528 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiBackendLifecycleActorFactory.scala @@ -4,9 +4,17 @@ import akka.actor.{ActorRef, Props} import com.google.api.client.util.ExponentialBackOff import com.typesafe.scalalogging.StrictLogging import cromwell.backend._ -import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{preemptionCountKey, quotaRetryCountKey, robustBuildAttributes, unexpectedRetryCountKey} +import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ + preemptionCountKey, + quotaRetryCountKey, + robustBuildAttributes, + unexpectedRetryCountKey +} import cromwell.backend.google.pipelines.common.authentication.PipelinesApiDockerCredentials -import cromwell.backend.google.pipelines.common.callcaching.{PipelinesApiBackendCacheHitCopyingActor, PipelinesApiBackendFileHashingActor} +import cromwell.backend.google.pipelines.common.callcaching.{ + PipelinesApiBackendCacheHitCopyingActor, + PipelinesApiBackendFileHashingActor +} import cromwell.backend.standard._ import cromwell.backend.standard.callcaching.{StandardCacheHitCopyingActor, StandardFileHashingActor} import cromwell.cloudsupport.gcp.GoogleConfiguration @@ -25,7 +33,8 @@ abstract class PipelinesApiBackendLifecycleActorFactory( protected def requiredBackendSingletonActor(serviceRegistryActor: ActorRef): Props protected val jesConfiguration: PipelinesApiConfiguration - override val requestedKeyValueStoreKeys: Seq[String] = Seq(preemptionCountKey, unexpectedRetryCountKey, quotaRetryCountKey) + override val requestedKeyValueStoreKeys: Seq[String] = + Seq(preemptionCountKey, unexpectedRetryCountKey, quotaRetryCountKey) protected val googleConfig: GoogleConfiguration = GoogleConfiguration(configurationDescriptor.globalConfig) @@ -116,9 +125,9 @@ abstract class PipelinesApiBackendLifecycleActorFactory( } object PipelinesApiBackendLifecycleActorFactory extends StrictLogging { - val preemptionCountKey = "PreemptionCount" + val preemptionCountKey = "PreemptionCount" val unexpectedRetryCountKey = "UnexpectedRetryCount" - val quotaRetryCountKey = "QuotaRetryCount" + val quotaRetryCountKey = "QuotaRetryCount" private[common] def robustBuildAttributes(buildAttributes: () => PipelinesApiConfigurationAttributes, maxAttempts: Int = 3, diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala index 71a5ca2e8d0..8797588da34 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PreviousRetryReasons.scala @@ -5,8 +5,8 @@ import cats.syntax.validated._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.PipelinesApiBackendLifecycleActorFactory.{ preemptionCountKey, - unexpectedRetryCountKey, - quotaRetryCountKey + quotaRetryCountKey, + unexpectedRetryCountKey } import cromwell.services.keyvalue.KeyValueServiceActor._ @@ -25,7 +25,11 @@ object PreviousRetryReasons { (validatedPreemptionCount, validatedUnexpectedRetryCount, validatedQuotaRetryCount) mapN PreviousRetryReasons.apply } - def apply(knownPreemptedCount: Int, knownUnexpectedRetryCount: Int, quotaCount: Int, attempt: Int): PreviousRetryReasons = { + def apply(knownPreemptedCount: Int, + knownUnexpectedRetryCount: Int, + quotaCount: Int, + attempt: Int + ): PreviousRetryReasons = { // If we have anything unaccounted for, we can top up the unexpected retry count. // NB: 'attempt' is 1-indexed, so, magic number: // NB2: for sanity's sake, I won't let this unaccounted for drop below 0, just in case... diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala index e68dbd4287a..77e53176df3 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala @@ -6,7 +6,13 @@ import com.google.api.services.lifesciences.v2beta.model._ import common.validation.ErrorOr.ErrorOr import cromwell.backend.google.pipelines.common.action.ActionLabels._ import cromwell.backend.google.pipelines.common.PipelinesApiAsyncBackendJobExecutionActor -import cromwell.backend.google.pipelines.common.api.RunStatus.{Cancelled, Failed, Preempted, QuotaFailed, UnsuccessfulRunStatus} +import cromwell.backend.google.pipelines.common.api.RunStatus.{ + Cancelled, + Failed, + Preempted, + QuotaFailed, + UnsuccessfulRunStatus +} import cromwell.backend.google.pipelines.common.errors.isQuotaMessage import cromwell.backend.google.pipelines.v2beta.api.request.RequestHandler.logger import cromwell.core.{ExecutionEvent, WorkflowId} diff --git a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala index 0befef54fa6..aac8722b9c7 100644 --- a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala +++ b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala @@ -107,15 +107,15 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma Failed(Status.UNAVAILABLE, None, Nil, Nil, None, None, None) ), ("parse & classify quota fails", - """|{ - | "done": true, - | "error": { - | "code": 9, - | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" - | }, - | "name": "projects/1005074806481/locations/us-central1/operations/16958337426039071297" - |}""".stripMargin, - QuotaFailed(Status.FAILED_PRECONDITION, None, Nil, Nil, None, None, None) + """|{ + | "done": true, + | "error": { + | "code": 9, + | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" + | }, + | "name": "projects/1005074806481/locations/us-central1/operations/16958337426039071297" + |}""".stripMargin, + QuotaFailed(Status.FAILED_PRECONDITION, None, Nil, Nil, None, None, None) ), ("check that we classify error code 10 as a preemption on a preemptible VM", """{ From 381a6b582beaf28593350a44ab610cccad1d9130 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Thu, 16 May 2024 12:31:33 -0400 Subject: [PATCH 10/15] Re-use previous low quota region --- .../main/resources/standardTestCases/quota_fail_retry.test | 3 ++- .../standardTestCases/quota_fail_retry/quota_fail_retry.wdl | 6 ++---- .../v2beta/api/request/GetRequestHandlerSpec.scala | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry.test b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test index f351d5c3f11..36e9565bb84 100644 --- a/centaur/src/main/resources/standardTestCases/quota_fail_retry.test +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry.test @@ -7,13 +7,14 @@ files { } # Adapted from `preemptible_and_memory_retry.test`. +# I set `broad-dsde-cromwell-dev` to have super low CPU quota in `us-west3` (Salt Lake City) for this test # This functionality is pretty married to PAPI, it doesn't run on `GCPBatch` backend. metadata { workflowName: sleepy_sleep status: Failed "failures.0.message": "Workflow failed" - "failures.0.causedBy.0.message": "Task sleepy_sleep.sleep:NA:3 failed. The job was stopped before the command finished. PAPI error code 9. Could not start instance custom-12-11264 due to insufficient quota. Cromwell retries exhausted, task failed. Backend info: Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" + "failures.0.causedBy.0.message": "Task sleepy_sleep.sleep:NA:3 failed. The job was stopped before the command finished. PAPI error code 9. Could not start instance custom-12-11264 due to insufficient quota. Cromwell retries exhausted, task failed. Backend info: Execution failed: allocating: selecting resources: selecting region and zone: no available zones: us-west3: 12 CPUS (10/10 available) quota too low" "sleepy_sleep.sleep.-1.1.executionStatus": "RetryableFailure" "sleepy_sleep.sleep.-1.2.executionStatus": "RetryableFailure" "sleepy_sleep.sleep.-1.3.executionStatus": "Failed" diff --git a/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl index b3e72ebbce7..9eaf7185a4d 100644 --- a/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl +++ b/centaur/src/main/resources/standardTestCases/quota_fail_retry/quota_fail_retry.wdl @@ -22,13 +22,11 @@ task sleep { volatile: true } - # `northamerica-northeast1` is Montreal, which is is geographically close (low latency), - # uses 100% carbon-free energy, and best of all... we don't use for anything else, - # so I can drop its quota to something ridiculousy low in `broad-dsde-cromwell-dev` + # I set `broad-dsde-cromwell-dev` to have super low CPU quota in `us-west3` (Salt Lake City) for this test runtime { cpu: 12 docker: "ubuntu:latest" - zones: "northamerica-northeast1-a northamerica-northeast1-b northamerica-northeast1-c" + zones: "us-west3-a us-west3-b us-west3-c" } command <<< diff --git a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala index aac8722b9c7..95fe61bf0c1 100644 --- a/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala +++ b/supportedBackends/google/pipelines/v2beta/src/test/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandlerSpec.scala @@ -111,7 +111,7 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma | "done": true, | "error": { | "code": 9, - | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: northamerica-northeast1: 12 CPUS (9/9 available) quota too low" + | "message": "Execution failed: allocating: selecting resources: selecting region and zone: no available zones: us-west3: 12 CPUS (10/10 available) quota too low" | }, | "name": "projects/1005074806481/locations/us-central1/operations/16958337426039071297" |}""".stripMargin, @@ -313,7 +313,7 @@ class GetRequestHandlerSpec extends AnyFlatSpec with CromwellTimeoutSpec with Ma None ) ), - // As of 2022-01 the zone `us-west3` in `broad-dsde-cromwell-dev` has its CPU quota purposely de-rated to 1 for testing + // As of 2022-01 the zone `us-west3` in `broad-dsde-cromwell-dev` has its CPU quota purposely de-rated to 10 for testing ("check that a job is AwaitingCloudQuota if its most recent event is quota exhaustion", """{ | "metadata": { From c6817f3376d5541fa243ca941469f0891706dba1 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Thu, 16 May 2024 14:38:50 -0400 Subject: [PATCH 11/15] Fix tests --- .../PipelinesApiAsyncBackendJobExecutionActorSpec.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala index 41e826a4dc2..53bc7b6ab8a 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala @@ -258,6 +258,13 @@ class PipelinesApiAsyncBackendJobExecutionActorSpec PipelinesApiBackendLifecycleActorFactory.unexpectedRetryCountKey ), previousUnexpectedRetries.toString + ), + PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey -> KvPair( + ScopedKey(workflowDescriptor.id, + KvJobKey(key), + PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey + ), + 0.toString // We're testing this in other ways, fake it here. ) ) val prefetchedKvEntriesUpd = if (failedRetriesCountOpt.isEmpty) { From 91241e6054bb25abc242da7219731a2cfab6bcb0 Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Thu, 16 May 2024 14:42:57 -0400 Subject: [PATCH 12/15] Not sure I agree with `scalafmtAll` here, but OK --- .../PipelinesApiAsyncBackendJobExecutionActorSpec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala index 53bc7b6ab8a..194433b1162 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActorSpec.scala @@ -261,8 +261,8 @@ class PipelinesApiAsyncBackendJobExecutionActorSpec ), PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey -> KvPair( ScopedKey(workflowDescriptor.id, - KvJobKey(key), - PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey + KvJobKey(key), + PipelinesApiBackendLifecycleActorFactory.quotaRetryCountKey ), 0.toString // We're testing this in other ways, fake it here. ) From 4730e98ec350dab6bbbfef93d48422995422826c Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Thu, 16 May 2024 15:23:16 -0400 Subject: [PATCH 13/15] Grammar fix --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f758f131e11..2c5e944ea02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](htt ### Improved handling of Life Sciences API quota errors -Users reported cases where Life Sciences jobs failed due to insufficient quota, instead of queueing them and waiting until +Users reported cases where Life Sciences jobs failed due to insufficient quota, instead of queueing and waiting until quota freed up (which is the expected behavior). Cromwell will now retry under these conditions, which present with errors such as "PAPI error code 9", "no available zones", and/or "quota too low". From 716b4e9d84c43b048f8a0cb190b43593c292cc7b Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Thu, 16 May 2024 15:24:51 -0400 Subject: [PATCH 14/15] Grammar fix --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b14a074ee21..a16dc98132e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](htt ### Improved handling of Life Sciences API quota errors Users reported cases where Life Sciences jobs failed due to insufficient quota, instead of queueing and waiting until -quota freed up (which is the expected behavior). Cromwell will now retry under these conditions, which present with errors +quota is available (which is the expected behavior). Cromwell will now retry under these conditions, which present with errors such as "PAPI error code 9", "no available zones", and/or "quota too low". ## 87 Release Notes From 0ecc2cf0cc84d86a5cb55fd36e8c1578c5af92df Mon Sep 17 00:00:00 2001 From: Adam Nichols Date: Fri, 17 May 2024 09:30:44 -0400 Subject: [PATCH 15/15] Remove irrelevant key from GCP Batch --- src/ci/resources/gcp_batch_provider_config.inc.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ci/resources/gcp_batch_provider_config.inc.conf b/src/ci/resources/gcp_batch_provider_config.inc.conf index 7445ea2b89e..9277e8e7678 100644 --- a/src/ci/resources/gcp_batch_provider_config.inc.conf +++ b/src/ci/resources/gcp_batch_provider_config.inc.conf @@ -3,8 +3,6 @@ root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" maximum-polling-interval = 600 concurrent-job-limit = 1000 -quota-attempts: 3 - batch { auth = "service_account" location = "us-central1"