broadinstitute · mcovarr · Aug 28, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
@@ -20,6 +20,7 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional
 
 - The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](https://cromwell.readthedocs.io/en/stable/backends/GCPBatch/) for more information.
 - Fixes a bug that prevented jobs from being recovered on Cromwell restart.
+- Fixes machine type selection to match the Google Cloud Life Sciences backend, including default n1 non-shared-core machine types and correct handling of `cpuPlatform` to select n2 or n2d machine types as appropriate.
 - Fixes the preemption error handling: the correct error message is now printed, and the other potential exit codes are handled as well.
 - Fixes error message reporting for failed jobs.
 - Fixes the "retry with more memory" feature.

@@ -237,6 +237,7 @@ lazy val googlePipelinesV2Beta = (project in backendRoot / "google" / "pipelines
 
 lazy val googleBatch = (project in backendRoot / "google" / "batch")
   .withLibrarySettings("cromwell-google-batch-backend")
+  .dependsOn(core)
   .dependsOn(backend)
   .dependsOn(gcsFileSystem)
   .dependsOn(drsFileSystem)

@@ -1,6 +1,7 @@
 name: papi_cpu_platform
 testFormat: workflowsuccess
-backends: [Papiv2]
+backendsMode: any
+backends: [Papiv2, GCPBATCH]
 
 files {
   workflow: papi_cpu_platform/papi_cpu_platform.wdl

@@ -53,7 +53,7 @@
         backendSingletonActor ! BatchApiRequestManager.BatchRunCreationRequest(
           request.workflowId,
           self,
-          requestFactory.submitRequest(request)
+          requestFactory.submitRequest(request, jobLogger)
         )
         val newPromise = Promise[StandardAsyncJob]()
         runCreationClientPromise = Option(newPromise)

@@ -6,13 +6,14 @@ import cromwell.backend.google.batch.io.GcpBatchAttachedDisk
 import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.VirtualPrivateCloudConfiguration
 import cromwell.backend.google.batch.models._
 import cromwell.backend.google.batch.monitoring.{CheckpointingConfiguration, MonitoringImage}
+import cromwell.core.logging.JobLogger
 import cromwell.core.path.Path
 import wom.runtime.WomOutputRuntimeExtractor
 
 import scala.concurrent.duration.FiniteDuration
 
 trait GcpBatchRequestFactory {
-  def submitRequest(data: GcpBatchRequest): CreateJobRequest
+  def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest
 
   def queryRequest(jobName: JobName): GetJobRequest
 

@@ -22,7 +22,8 @@
 import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration
 import cromwell.backend.google.batch.models.{GcpBatchRequest, VpcAndSubnetworkProjectLabelValues}
 import cromwell.backend.google.batch.runnable._
-import cromwell.backend.google.batch.util.BatchUtilityConversions
+import cromwell.backend.google.batch.util.{BatchUtilityConversions, GcpBatchMachineConstraints}
+import cromwell.core.logging.JobLogger
 
 import scala.jdk.CollectionConverters._
 
@@ -74,14 +75,16 @@
   private def createInstancePolicy(cpuPlatform: String,
                                    spotModel: ProvisioningModel,
                                    accelerators: Option[Accelerator.Builder],
-                                   attachedDisks: List[AttachedDisk]
+                                   attachedDisks: List[AttachedDisk],
+                                   machineType: String
   ): InstancePolicy.Builder = {
 
     // set GPU count to 0 if not included in workflow
     val gpuAccelerators = accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) // TODO: Driver version
 
     val instancePolicy = InstancePolicy.newBuilder
       .setProvisioningModel(spotModel)
+      .setMachineType(machineType)
       .addAllDisks(attachedDisks.asJava)
       .setMinCpuPlatform(cpuPlatform)
       .buildPartial()
@@ -154,7 +157,7 @@
     }
   }
 
-  override def submitRequest(data: GcpBatchRequest): CreateJobRequest = {
+  override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = {
 
     val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes
     val createParameters = data.createParameters
@@ -224,7 +227,13 @@
     val computeResource = createComputeResource(cpuCores, memory, gcpBootDiskSizeMb)
     val taskSpec = createTaskSpec(sortedRunnables, computeResource, retryCount, durationInSeconds, allVolumes)
     val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec)
-    val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks)
+    val machineType = GcpBatchMachineConstraints.machineType(runtimeAttributes.memory,
+                                                             runtimeAttributes.cpu,
+                                                             cpuPlatformOption = runtimeAttributes.cpuPlatform,
+                                                             googleLegacyMachineSelection = false,
+                                                             jobLogger = jobLogger
+    )
+    val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks, machineType)
     val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build
     val allocationPolicy =
       createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators)

@@ -77,6 +77,7 @@
   private val cpuPlatformValidationInstance = new StringRuntimeAttributesValidation(CpuPlatformKey).optional
   // via `gcloud compute zones describe us-central1-a`
   val CpuPlatformIntelCascadeLakeValue = "Intel Cascade Lake"
+  val CpuPlatformIntelIceLakeValue = "Intel Ice Lake"
   val CpuPlatformAMDRomeValue = "AMD Rome"
 
   val UseDockerImageCacheKey = "useDockerImageCache"

@@ -6,9 +6,9 @@
   N2CustomMachineType,
   N2DCustomMachineType
 }
+import cromwell.core.logging.JobLogger
 import eu.timepit.refined.api.Refined
 import eu.timepit.refined.numeric.Positive
-import org.slf4j.Logger
 import wdl4s.parser.MemoryUnit
 import wom.format.MemorySize
 
@@ -17,16 +17,17 @@
                   cpu: Int Refined Positive,
                   cpuPlatformOption: Option[String],
                   googleLegacyMachineSelection: Boolean,
-                  jobLogger: Logger
+                  jobLogger: JobLogger
   ): String =
     if (googleLegacyMachineSelection) {
       s"predefined-$cpu-${memory.to(MemoryUnit.MB).amount.intValue()}"
     } else {
-      // If someone requests Intel Cascade Lake as their CPU platform then switch the machine type to n2.
+      // If someone requests Intel Cascade Lake or Intel Ice Lake as their CPU platform then switch the machine type to n2.
       // Similarly, CPU platform of AMD Rome corresponds to the machine type n2d.
       val customMachineType =
         cpuPlatformOption match {
           case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) => N2CustomMachineType
+          case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelIceLakeValue) => N2CustomMachineType
           case Some(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) => N2DCustomMachineType
           case _ => N1CustomMachineType
         }

@@ -132,7 +132,7 @@ class GcpBatchAsyncBackendJobExecutionActorSpec
     val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(configuration)
 
     val requestFactory: GcpBatchRequestFactory = new GcpBatchRequestFactory {
-      override def submitRequest(data: GcpBatchRequest): CreateJobRequest = null
+      override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = null
 
       override def queryRequest(jobName: JobName): GetJobRequest = null
 

@@ -1,14 +1,15 @@
 package cromwell.backend.google.batch.util
 
 import common.assertion.CromwellTimeoutSpec
+import common.mock.MockSugar.mock
 import cromwell.backend.google.batch.models.GcpBatchRuntimeAttributes
+import cromwell.core.logging.JobLogger
 import eu.timepit.refined.numeric.Positive
 import eu.timepit.refined.refineMV
 import org.scalatest.flatspec.AnyFlatSpec
 import org.scalatest.matchers.should.Matchers
 import org.scalatest.prop.TableDrivenPropertyChecks._
 import org.scalatest.prop.Tables.Table
-import org.slf4j.helpers.NOPLogger
 import wdl4s.parser.MemoryUnit
 import wom.format.MemorySize
 
@@ -83,7 +84,7 @@ class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpe
         cpu = cpu,
         cpuPlatformOption = cpuPlatformOption,
         googleLegacyMachineSelection = googleLegacyMachineSelection,
-        jobLogger = NOPLogger.NOP_LOGGER
+        jobLogger = mock[JobLogger]
       ) shouldBe expected
     }
   }