Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WX-990 Make TES request backoff behavior configurable #7122

Merged
merged 4 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cromwell.example.backends/TES.conf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,22 @@ backend {
disk: "2 GB"
preemptible: false
}

# Backoff behavior for task status polling and execution retries are configurable, with defaults
# shown below. All four fields must be set for each backoff if overriding.
#
# poll-backoff {
# min: "10 seconds"
# max: "5 minutes"
# multiplier: 1.1
# randomization-factor: 0.5
# }
# execute-or-recover-backoff {
# min: "3 seconds"
# max: "30 seconds"
# multiplier: 1.1
# randomization-factor: 0.5
# }
}
}
}
Expand Down
14 changes: 14 additions & 0 deletions docs/backends/TES.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@ backend {
}
```

Cromwell uses an exponential backoff strategy to control the rate of requests sent to the TES server.
Users can override the default settings for this behavior in config, see `cromwell.example.backends/TES.conf`
for full expected format.

For example, to force a constant polling rate rather than exponential backoff:
```hocon
backend.providers.TES.config.poll-backoff {
min: "1 minute"
max: "1 minute"
multiplier: 1
randomization-factor: 0
}
```

### Supported File Systems

Currently this backend only works with files on a Local or Shared File System.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ import wom.values.WomFile
import net.ceedubs.ficus.Ficus._

import scala.concurrent.Future
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.{Failure, Success}

sealed trait TesRunStatus {
Expand Down Expand Up @@ -71,17 +69,8 @@ class TesAsyncBackendJobExecutionActor(override val standardParams: StandardAsyn

def statusEquivalentTo(thiz: StandardAsyncRunState)(that: StandardAsyncRunState): Boolean = thiz == that

override lazy val pollBackOff = SimpleExponentialBackoff(
initialInterval = 1 seconds,
maxInterval = 5 minutes,
multiplier = 1.1
)

override lazy val executeOrRecoverBackOff = SimpleExponentialBackoff(
initialInterval = 3 seconds,
maxInterval = 30 seconds,
multiplier = 1.1
)
override lazy val pollBackOff: SimpleExponentialBackoff = tesConfiguration.pollBackoff
override lazy val executeOrRecoverBackOff: SimpleExponentialBackoff = tesConfiguration.executeOrRecoverBackoff

private lazy val realDockerImageUsed: String = jobDescriptor.maybeCallCachingEligible.dockerHash.getOrElse(runtimeAttributes.dockerImage)
override lazy val dockerImageUsed: Option[String] = Option(realDockerImageUsed)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package cromwell.backend.impl.tes

import com.typesafe.config.Config
import cromwell.backend.BackendConfigurationDescriptor

import cromwell.core.retry.SimpleExponentialBackoff
import net.ceedubs.ficus.Ficus._

import scala.concurrent.duration._
import scala.language.postfixOps

class TesConfiguration(val configurationDescriptor: BackendConfigurationDescriptor) {

val endpointURL = configurationDescriptor.backendConfig.getString("endpoint")
Expand All @@ -13,8 +17,35 @@ class TesConfiguration(val configurationDescriptor: BackendConfigurationDescript
.backendConfig
.as[Option[Boolean]](TesConfiguration.useBackendParametersKey)
.getOrElse(false)

val pollBackoff =
configurationDescriptor
.backendConfig
.as[Option[Config]]("poll-backoff")
.map(SimpleExponentialBackoff(_))
.getOrElse(TesConfiguration.defaultPollBackoff)

val executeOrRecoverBackoff =
configurationDescriptor
.backendConfig
.as[Option[Config]]("execute-or-recover-backoff")
.map(SimpleExponentialBackoff(_))
.getOrElse(TesConfiguration.defaultExecOrRecoverBackoff)

}

object TesConfiguration {
final val useBackendParametersKey = "use_tes_11_preview_backend_parameters"

final val defaultPollBackoff = SimpleExponentialBackoff(
initialInterval = 10 seconds,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This initialInterval default value is different than what we were using before. I don't see any issue with changing from 1 -> 10seconds, but just confirming it's intentional.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, that is intentional! For nontrivial workflows polling every 1s seems silly.

maxInterval = 5 minutes,
multiplier = 1.1
)

final val defaultExecOrRecoverBackoff = SimpleExponentialBackoff(
initialInterval = 3 seconds,
maxInterval = 30 seconds,
multiplier = 1.1
)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package cromwell.backend.impl.tes

import com.typesafe.config.{Config, ConfigException, ConfigFactory}
import cromwell.backend.BackendConfigurationDescriptor
import cromwell.core.retry.SimpleExponentialBackoff
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.concurrent.duration._
import scala.language.postfixOps

class TesConfigurationSpec extends AnyFlatSpec with Matchers {

behavior of "TesConfiguration"

def makeTesConfig(backendConfig: Config) = new TesConfiguration(
BackendConfigurationDescriptor(
backendConfig,
ConfigFactory.empty()
)
)

def backoffsAreEquivalent(expectedBackoff: SimpleExponentialBackoff, actualBackoff: SimpleExponentialBackoff): Boolean = {
val b1 = expectedBackoff.googleBackoff
val b2 = actualBackoff.googleBackoff
b1.getInitialIntervalMillis == b2.getInitialIntervalMillis &&
b1.getMaxIntervalMillis == b2.getMaxIntervalMillis &&
b1.getMultiplier == b2.getMultiplier &&
b1.getRandomizationFactor == b2.getRandomizationFactor
}

it should "use default backoffs when no custom config provided" in {
val tesConfig = makeTesConfig(TesTestConfig.backendConfig)
backoffsAreEquivalent(TesConfiguration.defaultPollBackoff, tesConfig.pollBackoff) shouldBe true
backoffsAreEquivalent(TesConfiguration.defaultExecOrRecoverBackoff, tesConfig.executeOrRecoverBackoff) shouldBe true
}

it should "use configured backoffs if they exist" in {
val tesConfig = makeTesConfig(TesTestConfig.backendConfigWithBackoffs)
backoffsAreEquivalent(SimpleExponentialBackoff(5 seconds, 1 minute, 2.5, .7), tesConfig.pollBackoff) shouldBe true
backoffsAreEquivalent(SimpleExponentialBackoff(3 minutes, 1 hours, 5, .1), tesConfig.executeOrRecoverBackoff) shouldBe true
}

it should "fail if user defines an invalid backoff" in {
assertThrows[ConfigException.Missing](makeTesConfig(TesTestConfig.backendConfigWithInvalidBackoffs))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,70 @@ object TesTestConfig {
|""".stripMargin

val backendConfigWithBackendParams = ConfigFactory.parseString(backendConfigStringWithBackendParams)

private val backendConfigStringWithBackoffs =
"""
|root = "local-cromwell-executions"
|dockerRoot = "/cromwell-executions"
|endpoint = "http://127.0.0.1:9000/v1/jobs"
|
|default-runtime-attributes {
| cpu: 1
| failOnStderr: false
| continueOnReturnCode: 0
| memory: "2 GB"
| disk: "2 GB"
| preemptible: false
| # The keys below have been commented out as they are optional runtime attributes.
| # dockerWorkingDir
| # docker
|}
|
|poll-backoff {
| min: "5 seconds"
| max: "1 minute"
| multiplier: 2.5
| randomization-factor: .7
|}
|
|execute-or-recover-backoff {
| min: "3 minutes"
| max: "1 hours"
| multiplier: 5
| randomization-factor: .1
|}
|
|""".stripMargin

val backendConfigWithBackoffs = ConfigFactory.parseString(backendConfigStringWithBackoffs)

private val backendConfigStringWithInvalidBackoffs =
"""
|root = "local-cromwell-executions"
|dockerRoot = "/cromwell-executions"
|endpoint = "http://127.0.0.1:9000/v1/jobs"
|
|default-runtime-attributes {
| cpu: 1
| failOnStderr: false
| continueOnReturnCode: 0
| memory: "2 GB"
| disk: "2 GB"
| preemptible: false
| # The keys below have been commented out as they are optional runtime attributes.
| # dockerWorkingDir
| # docker
|}
|
|poll-backoff {
| min: "5 seconds"
| max: "1 minute"
| multiplier: 2.5
| # missing randomization-factor
|}
|
|""".stripMargin

val backendConfigWithInvalidBackoffs = ConfigFactory.parseString(backendConfigStringWithInvalidBackoffs)
}