diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index f82a5020ef3..98185b3d1b3 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -24,7 +24,11 @@ jobs: #Each will be launched on its own runner so they can occur in parallel. #Friendly names are displayed on the Github UI and aren't used anywhere else. matrix: + # Batch test fixes to land later include: + # - build_type: centaurGcpBatch + # build_mysql: 5.7 + # friendly_name: Centaur GCP Batch with MySQL 5.7 - build_type: centaurPapiV2beta build_mysql: 5.7 friendly_name: Centaur Papi V2 Beta with MySQL 5.7 diff --git a/build.sbt b/build.sbt index 6b2641e024f..9bdd086d731 100644 --- a/build.sbt +++ b/build.sbt @@ -233,6 +233,19 @@ lazy val googlePipelinesV2Beta = (project in backendRoot / "google" / "pipelines .dependsOn(core % "test->test") .dependsOn(common % "test->test") +lazy val googleBatch = (project in backendRoot / "google" / "batch") + .withLibrarySettings("cromwell-google-batch-backend") + .dependsOn(backend) + .dependsOn(gcsFileSystem) + .dependsOn(drsFileSystem) + .dependsOn(sraFileSystem) + .dependsOn(httpFileSystem) + .dependsOn(backend % "test->test") + .dependsOn(gcsFileSystem % "test->test") + .dependsOn(services % "test->test") + .dependsOn(common % "test->test") + .dependsOn(core % "test->test") + lazy val awsBackend = (project in backendRoot / "aws") .withLibrarySettings("cromwell-aws-backend") .dependsOn(backend) @@ -392,6 +405,7 @@ lazy val server = project .dependsOn(engine) .dependsOn(googlePipelinesV2Alpha1) .dependsOn(googlePipelinesV2Beta) + .dependsOn(googleBatch) .dependsOn(awsBackend) .dependsOn(tesBackend) .dependsOn(cromwellApiClient) @@ -431,6 +445,7 @@ lazy val root = (project in file(".")) .aggregate(googlePipelinesCommon) .aggregate(googlePipelinesV2Alpha1) .aggregate(googlePipelinesV2Beta) + .aggregate(googleBatch) .aggregate(httpFileSystem) .aggregate(languageFactoryCore) .aggregate(perf) diff --git a/centaur/src/main/resources/standardTestCases/draft3_read_file_limits.test b/centaur/src/main/resources/standardTestCases/draft3_read_file_limits.test index 4a9af0c8813..4bcbdd38db7 100644 --- a/centaur/src/main/resources/standardTestCases/draft3_read_file_limits.test +++ b/centaur/src/main/resources/standardTestCases/draft3_read_file_limits.test @@ -2,6 +2,7 @@ name: draft3_read_file_limits testFormat: workflowfailure workflowType: WDL workflowTypeVersion: 1.0 +tags: [batchexclude] files { workflow: wdl_draft3/read_file_limits/read_file_limits.wdl diff --git a/centaur/src/main/resources/standardTestCases/long_cmd.test b/centaur/src/main/resources/standardTestCases/long_cmd.test index 40b6110b629..cef5fda2177 100644 --- a/centaur/src/main/resources/standardTestCases/long_cmd.test +++ b/centaur/src/main/resources/standardTestCases/long_cmd.test @@ -9,6 +9,7 @@ name: long_cmd testFormat: workflowsuccess +tags: [batchexclude] files { workflow: long_cmd/long_cmd.wdl diff --git a/centaur/src/main/resources/standardTestCases/read_file_limits.test b/centaur/src/main/resources/standardTestCases/read_file_limits.test index 0079401812a..734ab809b92 100644 --- a/centaur/src/main/resources/standardTestCases/read_file_limits.test +++ b/centaur/src/main/resources/standardTestCases/read_file_limits.test @@ -1,5 +1,6 @@ name: read_file_limits testFormat: workflowfailure +tags: [batchexclude] files { workflow: read_file_limits/read_file_limits.wdl diff --git 
a/centaur/src/main/resources/standardTestCases/relative_output_paths_colliding.test b/centaur/src/main/resources/standardTestCases/relative_output_paths_colliding.test index 82b01c6399d..2a6fca6793e 100644 --- a/centaur/src/main/resources/standardTestCases/relative_output_paths_colliding.test +++ b/centaur/src/main/resources/standardTestCases/relative_output_paths_colliding.test @@ -1,5 +1,6 @@ name: relative_output_paths_colliding testFormat: workflowfailure +tags: [batchexclude] files { workflow: relative_output_paths_colliding/workflow_output_paths_colliding.wdl diff --git a/centaur/src/main/resources/standardTestCases/standard_output_paths_colliding_prevented.test b/centaur/src/main/resources/standardTestCases/standard_output_paths_colliding_prevented.test index 6c5a5b51476..d8d37b4b2d0 100644 --- a/centaur/src/main/resources/standardTestCases/standard_output_paths_colliding_prevented.test +++ b/centaur/src/main/resources/standardTestCases/standard_output_paths_colliding_prevented.test @@ -1,5 +1,6 @@ name: standard_output_paths_colliding_prevented testFormat: workflowsuccess +tags: [batchexclude] files { workflow: standard_output_paths_colliding_prevented/workflow_output_paths_colliding.wdl diff --git a/cromwell.example.backends/GCPBATCH.conf b/cromwell.example.backends/GCPBATCH.conf new file mode 100644 index 00000000000..ba554e3322d --- /dev/null +++ b/cromwell.example.backends/GCPBATCH.conf @@ -0,0 +1,104 @@ +# This is an example of how you can use the Google Cloud Batch backend +# provider. *This is not a complete configuration file!* The +# content here should be copy pasted into the backend -> providers section +# of cromwell.example.backends/cromwell.examples.conf in the root of the repository. +# You should uncomment lines that you want to define, and read carefully to customize +# the file. + +# Documentation +# https://cromwell.readthedocs.io/en/stable/backends/Google/ + +backend { + default = GCPBATCH + + providers { + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory" + config { + # Google project + project = "my-cromwell-workflows" + + # Base bucket for workflow executions + root = "gs://my-cromwell-workflows-bucket" + + # Polling for completion backs-off gradually for slower-running jobs. + # This is the maximum polling interval (in seconds): + maximum-polling-interval = 600 + + # Optional Dockerhub Credentials. Can be used to access private docker images. + dockerhub { + # account = "" + # token = "" + } + + # Optional configuration to use high security network (Virtual Private Cloud) for running jobs. + # See https://cromwell.readthedocs.io/en/stable/backends/Google/ for more details. + # virtual-private-cloud { + # network-label-key = "network-key" + # auth = "application-default" + # } + + # Global pipeline timeout + # Defaults to 7 days; max 30 days + # batch-timeout = 7 days + + genomics { + # A reference to an auth defined in the `google` stanza at the top. This auth is used to create + # Batch Jobs and manipulate auth JSONs. + auth = "application-default" + + + // alternative service account to use on the launched compute instance + // NOTE: If combined with service account authorization, both that service account and this service account + // must be able to read and write to the 'root' GCS path + compute-service-account = "default" + + # Location to submit jobs to Batch and store job metadata. 
+ location = "us-central1" + + # Specifies the minimum file size for `gsutil cp` to use parallel composite uploads during delocalization. + # Parallel composite uploads can result in a significant improvement in delocalization speed for large files + # but may introduce complexities in downloading such files from GCS, please see + # https://cloud.google.com/storage/docs/gsutil/commands/cp#parallel-composite-uploads for more information. + # + # If set to 0 parallel composite uploads are turned off. The default Cromwell configuration turns off + # parallel composite uploads, this sample configuration turns it on for files of 150M or larger. + parallel-composite-upload-threshold="150M" + } + + filesystems { + gcs { + # A reference to a potentially different auth for manipulating files via engine functions. + auth = "application-default" + # Google project which will be billed for the requests + project = "google-billing-project" + + caching { + # When a cache hit is found, the following duplication strategy will be followed to use the cached outputs + # Possible values: "copy", "reference". Defaults to "copy" + # "copy": Copy the output files + # "reference": DO NOT copy the output files but point to the original output files instead. + # Will still make sure than all the original output files exist and are accessible before + # going forward with the cache hit. + duplication-strategy = "copy" + } + } + } + + default-runtime-attributes { + cpu: 1 + failOnStderr: false + continueOnReturnCode: 0 + memory: "2048 MB" + bootDiskSizeGb: 10 + # Allowed to be a String, or a list of Strings + disks: "local-disk 10 SSD" + noAddress: false + preemptible: 0 + zones: ["us-central1-a", "us-central1-b"] + } + + } + } + } +} diff --git a/docs/backends/GCPBatch.md b/docs/backends/GCPBatch.md new file mode 100644 index 00000000000..766719af8c5 --- /dev/null +++ b/docs/backends/GCPBatch.md @@ -0,0 +1,473 @@ +**Google Cloud Backend** + +[//]: +Google Cloud Batch is a fully managed service that lets you schedule, queue, and execute batch processing workloads on Google Cloud resources. Batch provisions resources and manages capacity on your behalf, allowing your batch workloads to run at scale. + +This section offers detailed configuration instructions for using Cromwell with the Batch API in all supported +authentication modes. Before reading further in this section please see the +[Getting started on Google Batch API](../tutorials/Batch101) for instructions common to all authentication modes +and detailed instructions for the application default authentication scheme in particular. +The instructions below assume you have created a Google Cloud Storage bucket and a Google project enabled for the appropriate APIs. + +**Configuring Authentication** + +The `google` stanza in the Cromwell configuration file defines how to authenticate to Google. There are four different +authentication schemes that might be used: + +* `application_default` (default, recommended) - Use [application default](https://developers.google.com/identity/protocols/application-default-credentials) credentials. +* `service_account` - Use a specific service account and key file (in PEM format) to authenticate. +* `user_account` - Authenticate as a user. +* `user_service_account` - Authenticate each individual workflow using service account credentials supplied in the workflow options. 
+ +The `auths` block in the `google` stanza defines the authentication schemes within a Cromwell deployment: + + +```hocon +google { + application-name = "cromwell" + auths = [ + { + name = "application-default" + scheme = "application_default" + }, + { + name = "service-account" + scheme = "service_account" + service-account-id = "my-service-account" + pem-file = "/path/to/file.pem" + }, + { + name = "user-service-account" + scheme = "user_service_account" + } + ] +} +``` + +These authentication schemes can be referenced by name within other portions of the configuration file. For example, both +the `GCPBATCH` and `filesystems.gcs` sections within a Google configuration block must reference an auth defined in this block. +The auth for the `GCPBATCH` section governs the interactions with Google itself, while `filesystems.gcs` governs the localization +of data into and out of GCE VMs. + +**Application Default Credentials** + +By default, application default credentials will be used. Only `name` and `scheme` are required for application default credentials. + +To authenticate, run the following commands from your command line (requires [gcloud](https://cloud.google.com/sdk/gcloud/)): + +``` +$ gcloud auth login +$ gcloud config set project my-project +``` + +**Service Account** + +First create a new service account through the [API Credentials](https://console.developers.google.com/apis/credentials) page. Go to **Create credentials -> Service account key**. Then in the **Service account** dropdown select **New service account**. Fill in a name (e.g. `my-account`), and select key type of JSON. + +Creating the account will cause the JSON file to be downloaded. The structure of this file is roughly like this (account name is `my-account`): + +``` +{ + "type": "service_account", + "project_id": "my-project", + "private_key_id": "OMITTED", + "private_key": "-----BEGIN PRIVATE KEY-----\nBASE64 ENCODED KEY WITH \n TO REPRESENT NEWLINES\n-----END PRIVATE KEY-----\n", + "client_email": "my-account@my-project.iam.gserviceaccount.com", + "client_id": "22377410244549202395", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://accounts.google.com/o/oauth2/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/my-account%40my-project.iam.gserviceaccount.com" +} +``` + +Most importantly, the value of the `client_email` field should go into the `service-account-id` field in the configuration (see below). The +`private_key` portion needs to be pulled into its own file (e.g. `my-key.pem`). The `\n`s in the string need to be converted to newline characters. + +While technically not part of Service Account authentication mode, one can also override the default service account that the compute VM is started with via the configuration option `GCPBATCH.config.genomics.compute-service-account` or through the workflow options parameter `google_compute_service_account`. The service account you provide must have been granted Service Account Actor role to Cromwell's primary service account. 
As this only affects Google Batch API and not GCS, it's important that this service account, and the service account specified in `GCPBATCH.config.genomics.auth` can both read/write the location specified by `GCPBATCH.config.root` + +**User Service Account** + +A [JSON key file for the service account](../wf_options/Google.md) must be passed in via the `user_service_account_json` field in the [Workflow Options](../wf_options/Google.md) when submitting the job. Omitting this field will cause the workflow to fail. The JSON should be passed as a string and will need to have no newlines and all instances of `"` and `\n` escaped. + +[//]: # (TODO: is jes_gcs_root the correct workflow option?) +In the likely event that this service account does not have access to Cromwell's default google project the `google_project` workflow option must be set. In the similarly likely case that this service account can not access Cromwell's default google bucket, the `jes_gcs_root` workflow option should be set appropriately. + +For information on the interaction of `user_service_account_json` with private Docker images please see the `Docker` section below. + +**Docker** + +It's possible to reference private Docker images to which only particular Docker Hub accounts have access: + +``` +task mytask { + command { + ... + } + runtime { + docker: "private_repo/image" + memory: "8 GB" + cpu: "1" + } + ... +} +``` + +In order for a private image to be used the appropriate Docker configuration must be provided. If the Docker images being used +are public there is no need to add this configuration. + +For Batch + +[//]: # (TODO: Is this the correct way to configure Docker for batch?) +[//]: # (5-4-23: Leave alone for now) +``` +backend { + default = GCPBATCH + providers { + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory" + config { + dockerhub { + token = "base64-encoded-docker-hub-username:password" + key-name = "name/of/the/kms/key/used/for/encrypting/and/decrypting/the/docker/hub/token" + auth = "reference-to-the-auth-cromwell-should-use-for-kms-encryption" + } + } + } + } +} +``` + +`token` is the standard base64-encoded username:password for the appropriate Docker Hub account. + +`key-name` is the name of the Google KMS key Cromwell should use for encrypting the Docker `token` before including it +in the PAPI job execution request. This `key-name` will also be included in the PAPI job execution +request and will be used by Batch to decrypt the Docker token used by `docker login` to enable access to the private Docker image. + +`auth` is a reference to the name of an authorization in the `auths` block of Cromwell's `google` config. +Cromwell will use this authorization for encrypting the Google KMS key. + +The equivalents of `key-name`, `token` and `auth` can also be specified in workflow options which take +precedence over values specified in configuration. The corresponding workflow options are named `docker_credentials_key_name`, +`docker_credentials_token`, and `user_service_account_json`. While the config value `auth` refers to an auth defined in the +`google.auths` stanza elsewhere in Cromwell's +configuration, `user_service_account_json` is expected to be a literal escaped Google service account auth JSON. +See the `User Service Account` section above for more information on using user service accounts. +If the key, token or auth value is provided in workflow options then the corresponding private Docker configuration value +is not required, and vice versa. 
Also note that for the `user_service_account_json` workflow option to work an auth of type `user_service_account` +must be defined in Cromwell's `google.auths` stanza; more details in the `User Service Account` section above. + +Example Batch workflow options for private Docker configuration: + +``` +{ + "docker_credentials_key_name": "name/of/the/kms/key/used/for/encrypting/and/decrypting/the/docker/hub/token", + "docker_credentials_token": "base64_username:password", + "user_service_account_json": "" +} +``` + +Important + +If any of the three private Docker configuration values of key name, auth, or Docker token are missing, Batch will not perform a `docker login`. +If the Docker image to be pulled is not public the `docker pull` will fail which will cause the overall job to fail. + +If using any of these private Docker workflow options it is advisable to add +them to the `workflow-options.encrypted-fields` list in Cromwell configuration. + + +**Monitoring** + +In order to monitor metrics (CPU, Memory, Disk usage...) about the VM during Call Runtime, a workflow option can be used to specify the path to a script that will run in the background and write its output to a log file. + +``` +{ + "monitoring_script": "gs://cromwell/monitoring/script.sh" +} +``` + +The output of this script will be written to a `monitoring.log` file that will be available in the call gcs bucket when the call completes. This feature is meant to run a script in the background during long-running processes. It's possible that if the task is very short that the log file does not flush before de-localization happens and you will end up with a zero byte file. + +**Google Cloud Storage Filesystem** + +On the Google Batch backend the GCS (Google Cloud Storage) filesystem is used for the root of the workflow execution. +On the Local, SGE, and associated backends any GCS URI will be downloaded locally. For the Google backend the `jes_gcs_root` [Workflow Option](../wf_options/Google) will take +precedence over the `root` specified at `backend.providers.JES.config.root` in the configuration file. Google Cloud Storage URIs are the only acceptable values for `File` inputs for +workflows using the Google backend. + +**Batch timeout** + +Google sets a default pipeline timeout of 7 days, after which the pipeline will abort. Setting `batch-timeout` overrides this limit to a maximum of 30 days. + +```hocon +backend.providers.GCPBATCH.config { + batch-timeout: 14 days +} +``` + +#### Google Labels + +Every call run on the Batch API backend is given certain labels by default, so that Google resources can be queried by these labels later. +The current default label set automatically applied is: + +| Key | Value | Example | Notes | +|-----|-------|---------|-------| +| cromwell-workflow-id | The Cromwell ID given to the root workflow (i.e. the ID returned by Cromwell on submission) | cromwell-d4b412c5-bf3d-4169-91b0-1b635ce47a26 | To fit the required [format](#label-format), we prefix with 'cromwell-' | +| cromwell-sub-workflow-name | The name of this job's sub-workflow | my-sub-workflow | Only present if the task is called in a subworkflow. | +| wdl-task-name | The name of the WDL task | my-task | | +| wdl-call-alias | The alias of the WDL call that created this job | my-task-1 | Only present if the task was called with an alias. | + +Any custom labels provided as '`google_labels`' in the [workflow options](../wf_options/Google) are also applied to Google resources by the Batch API. 
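+
+For example, custom labels might be supplied in a workflow options file like the one below; the label keys and values here are illustrative only and must still satisfy the required [label format](#label-format):
+
+```json
+{
+  "google_labels": {
+    "cost-center": "genomics-dev",
+    "pipeline-name": "germline-calling"
+  }
+}
+```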
+ +### Virtual Private Network + +Cromwell can arrange for jobs to run in specific GCP private networks via the `config.virtual-private-cloud` stanza of a Batch backend. +There are two ways of specifying private networks: + +* [Literal network and subnetwork values](#virtual-private-network-via-literals) that will apply to all projects +* [Google project labels](#virtual-private-network-via-labels) whose values in a particular Google project will specify the network and subnetwork + +#### Virtual Private Network via Literals + +```hocon +backend { + ... + providers { + ... + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchLifecycleActorFactory" + config { + ... + virtual-private-cloud { + network-name = "vpc-network" + subnetwork-name = "vpc-subnetwork" + } + ... + } + } + } +} +``` + +The `network-name` and `subnetwork-name` should reference the name of your private network and subnetwork within that +network respectively. The `subnetwork-name` is an optional config. + +For example, if your `virtual-private-cloud` config looks like the one above, then Cromwell will use the value of the +configuration key, which is `vpc-network` here, as the name of private network and run the jobs on this network. +If the network name is not present in the config Cromwell will fall back to trying to run jobs on the default network. + +If the `network-name` or `subnetwork-name` values contain the string `${projectId}` then that value will be replaced +by Cromwell with the name of the project running the Batch API. + +If the `network-name` does not contain a `/` then it will be prefixed with `projects/${projectId}/global/networks/`. + +Cromwell will then pass the network and subnetwork values to the Batch API. See the documentation for the +[Batch API](https://cloud.google.com/batch/docs/networking-overview) +for more information on the various formats accepted for `network` and `subnetwork`. + +#### Virtual Private Network via Labels + +```hocon +backend { + ... + providers { + ... + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchLifecycleActorFactory" + config { + ... + virtual-private-cloud { + network-label-key = "my-private-network" + subnetwork-label-key = "my-private-subnetwork" + auth = "reference-to-auth-scheme" + } + ... + } + } + } +} +``` + + +The `network-label-key` and `subnetwork-label-key` should reference the keys in your project's labels whose value is the name of your private network +and subnetwork within that network respectively. `auth` should reference an auth scheme in the `google` stanza which will be used to get the project metadata from Google Cloud. +The `subnetwork-label-key` is an optional config. + +For example, if your `virtual-private-cloud` config looks like the one above, and one of the labels in your project is + +``` +"my-private-network" = "vpc-network" +``` + +Cromwell will get labels from the project's metadata and look for a label whose key is `my-private-network`. +Then it will use the value of the label, which is `vpc-network` here, as the name of private network and run the jobs on this network. +If the network key is not present in the project's metadata Cromwell will fall back to trying to run jobs using literal +network labels, and then fall back to running on the default network. + +### Custom Google Cloud SDK container + +[//]: # (TODO: need to test this section as well) +Cromwell can't use Google's container registry if VPC Perimeter is used in project. 
+Own repository can be used by adding `cloud-sdk-image-url` reference to used container: + +``` +google { + ... + cloud-sdk-image-url = "eu.gcr.io/your-project-id/cloudsdktool/cloud-sdk:354.0.0-alpine" + cloud-sdk-image-size-gb = 1 +} +``` + +### Parallel Composite Uploads + +[//]: # (TODO: Need to test parallel composite uploads) + +Cromwell can be configured to use GCS parallel composite uploads which can greatly improve delocalization performance. This feature +is turned off by default but can be enabled backend-wide by specifying a `gsutil`-compatible memory specification for the key +`genomics.parallel-composite-upload-threshold` in backend configuration. This memory value represents the minimum size an output file +must have to be a candidate for `gsutil` parallel composite uploading: + +``` +backend { + ... + providers { + ... + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchLifecycleActorFactory" + config { + ... + genomics { + ... + parallel-composite-upload-threshold = 150M + ... + } + ... + } + } + } +} +``` + +Alternatively this threshold can be specified in workflow options using the key `parallel-composite-upload-threshold`, +which takes precedence over a setting in configuration. The default setting for this threshold is `0` which turns off +parallel composite uploads; a value of `0` can also be used in workflow options to turn off parallel composite uploads +in a Cromwell deployment where they are turned on in config. + +#### Issues with composite files + +Please see the [Google documentation](https://cloud.google.com/storage/docs/gsutil/commands/cp#parallel-composite-uploads) +describing the benefits and drawbacks of parallel composite uploads. + +The actual error message observed when attempting to download a composite file on a system without a compiled `crcmod` +looks like the following: + +``` +/ # gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp gs://my-bucket/composite.bam . +Copying gs://my-bucket/composite.bam... +==> NOTE: You are downloading one or more large file(s), which would +run significantly faster if you enabled sliced object downloads. This +feature is enabled by default but requires that compiled crcmod be +installed (see "gsutil help crcmod"). + +CommandException: +Downloading this composite object requires integrity checking with CRC32c, +but your crcmod installation isn't using the module's C extension, so the +hash computation will likely throttle download performance. For help +installing the extension, please see "gsutil help crcmod". + +To download regardless of crcmod performance or to skip slow integrity +checks, see the "check_hashes" option in your boto config file. + +NOTE: It is strongly recommended that you not disable integrity checks. Doing so +could allow data corruption to go undetected during uploading/downloading. +/ # +``` + +As the message states, the best option would be to have a compiled `crcmod` installed on the system. +Turning off integrity checks on downloads does get around this issue but really isn't a great idea. + +#### Parallel composite uploads and call caching + +Because the parallel composite upload threshold is not considered part of the hash used for call caching purposes, calls +which would be expected to generate non-composite outputs may call cache to results that did generate composite +outputs. Calls which are executed and not cached will always honor the parallel composite upload setting at the time of +their execution. 
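+
+As noted earlier in this section, the threshold can also be set per workflow through workflow options rather than in configuration. A minimal sketch of such a workflow options file (the `150M` value is just an example of a `gsutil`-compatible memory specification):
+
+```json
+{
+  "parallel-composite-upload-threshold": "150M"
+}
+```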
+ + +### Migration from Google Cloud Genomics v2alpha1 to Google Cloud Life Sciences v2beta + +1. If you currently run your workflows using Cloud Genomics v2beta and would like to switch to Google Batch, you will need to do a few changes to your configuration file: `actor-factory` value should be changed +from `cromwell.backend.google.pipelines.v2beta.PipelinesApiLifecycleActorFactory` to `cromwell.backend.google.batch.GcpBatchLifecycleActorFactory`. + +2. You will need to remove the parameter `genomics.endpoint-url` and generate a new config file. + +3. Google Batch is now available in a variety of regions. Please see the [Batch Locations](https://cloud.google.com/batch/docs/locations) for a list of supported regions + + +### Reference Disk Support + +[//]: # (TODO: follow up later) + +Cromwell 55 and later support mounting reference disks from prebuilt GCP disk images as an alternative to localizing large +input reference files on Batch. Please note the configuration of reference disk manifests has changed starting with +Cromwell 57 and now uses the format documented below. + +Within the `config` stanza of a Batch backend the `reference-disk-localization-manifests` +key specifies an array of reference disk manifests: + +```hocon +backend { + ... + providers { + ... + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchLifecycleActorFactory" + config { + ... + reference-disk-localization-manifests = [ + { + "imageIdentifier" : "projects/broad-dsde-cromwell-dev/global/images/broad-references-disk-image", + "diskSizeGb" : 500, + "files" : [ { + "path" : "gcp-public-data--broad-references/Homo_sapiens_assembly19_1000genomes_decoy/Homo_sapiens_assembly19_1000genomes_decoy.fasta.nhr", + "crc32c" : 407769621 + }, { + "path" : "gcp-public-data--broad-references/Homo_sapiens_assembly19_1000genomes_decoy/Homo_sapiens_assembly19_1000genomes_decoy.fasta.sa", + "crc32c" : 1902048083 + }, + ... + }, + ... + ] + ... + } + } + } +} +``` + +Reference disk usage is an opt-in feature, so workflow submissions must specify this workflow option: + +```json +{ + ... + "use_reference_disks": true, + ... +} +``` + +Using the first file in the manifest above as an example, assume a Batch backend is configured to use this manifest and the appropriate +`use_reference_disks` workflow option is set to `true` in the workflow submission. If a call in that workflow +specifies the input `gs://my-references/enormous_reference.bam` and because that input matches the path of a file on the +reference image without the leading `gs://`, Cromwell would +arrange for a reference disk based on this image to be mounted and for the call's input to refer to the +copy of the file on the reference disk, bypassing localization of the input. + +The Cromwell git repository includes a Java-based tool to facilitate the creation of manifests called +[CromwellRefdiskManifestCreatorApp](https://github.com/broadinstitute/cromwell/tree/develop/CromwellRefdiskManifestCreator). +Please see the help command of that tool for more details. + +Alternatively for public data stored under `gs://gcp-public-data--broad-references` there exists a shell script to +extract reference data to a new disk and then convert that disk to a public image. For more information see +[create_images.sh](https://github.com/broadinstitute/cromwell/tree/develop/scripts/reference_disks/create_images.sh). 
+ diff --git a/docs/tutorials/Batch101.md b/docs/tutorials/Batch101.md new file mode 100644 index 00000000000..4c31d910ab1 --- /dev/null +++ b/docs/tutorials/Batch101.md @@ -0,0 +1,227 @@ +## Getting started on Google Cloud with Batch + +## Batch + +### Basic Information + +Google Cloud Batch is a fully managed service that lets you schedule, queue, and execute batch processing workloads on Google Cloud resources. +Batch provisions resources and manages capacity on your behalf, allowing your batch workloads to run at scale. + +### Setting up Batch + +#### Permissions: + +### Prerequisites + +This tutorial page relies on completing the previous tutorial: + +* [Downloading Prerequisites](FiveMinuteIntro.md) + +### Goals + +At the end of this tutorial you'll have run your first workflow against the Google Batch API. + +### Let's get started! + + +**Configuring a Google Project** + +Install the Google Cloud SDK. +Create a Google Cloud Project and give it a project id (e.g. sample-project). We’ll refer to this as `` and your user login (e.g. username@gmail.com) as ``. + +On your Google project, open up the API Manager and enable the following APIs: + +* Google Compute Engine API +* Cloud Storage +* Google Cloud Batch API + +Authenticate to Google Cloud Platform +`gcloud auth login ` + +Set your default account (will require to login again) +`gcloud auth application-default login` + +Set your default project +`gcloud config set project ` + +Create a Google Cloud Storage (GCS) bucket to hold Cromwell execution directories. +We will refer to this bucket as `google-bucket-name`, and the full identifier as `gs://google-bucket-name`. +`gsutil mb gs://` + + +**Workflow Source Files** + +Copy over the sample `hello.wdl` and `hello.inputs` files to the same directory as the Cromwell jar. +This workflow takes a string value as specified in the inputs file and writes it to stdout. + + +***hello.wdl*** +``` +task hello { + String addressee + command { + echo "Hello ${addressee}! Welcome to Cromwell . . . on Google Cloud!" + } + output { + String message = read_string(stdout()) + } + runtime { + docker: "ubuntu:latest" + } +} + +workflow wf_hello { + call hello + + output { + hello.message + } +} +``` + +***hello.inputs*** +``` +{ + "wf_hello.hello.addressee": "World" +} +``` + +**Google Configuration File** + +Copy over the sample `google.conf` file utilizing Application Default credentials to the same directory that contains your sample WDL, inputs and Cromwell jar. +Replace `` and ``in the configuration file with the project id and bucket name. Replace `` with the project id that has to be billed for the request (more information for Requester Pays can be found at: +Requester Pays) + +***google.conf*** +``` +include required(classpath("application")) + +google { + + application-name = "cromwell" + + auths = [ + { + name = "application-default" + scheme = "application_default" + } + ] +} + +engine { + filesystems { + gcs { + auth = "application-default" + project = "" + } + } +} + +backend { + default = batch + + providers { + batch { + actor-factory = "cromwell.backend.google.pipelines.batch.GcpBatchBackendLifecycleActorFactory" + config { + # Google project + project = "my-cromwell-workflows" + + # Base bucket for workflow executions + root = "gs://my-cromwell-workflows-bucket" + + # Polling for completion backs-off gradually for slower-running jobs. + # This is the maximum polling interval (in seconds): + maximum-polling-interval = 600 + + # Optional Dockerhub Credentials. 
Can be used to access private docker images. + dockerhub { + # account = "" + # token = "" + } + + # Optional configuration to use high security network (Virtual Private Cloud) for running jobs. + # See https://cromwell.readthedocs.io/en/stable/backends/Google/ for more details. + # virtual-private-cloud { + # network-label-key = "network-key" + # auth = "application-default" + # } + + # Global pipeline timeout + # Defaults to 7 days; max 30 days + # batch-timeout = 7 days + + genomics { + # A reference to an auth defined in the `google` stanza at the top. This auth is used to create + # Batch Jobs and manipulate auth JSONs. + auth = "application-default" + + + // alternative service account to use on the launched compute instance + // NOTE: If combined with service account authorization, both that service account and this service account + // must be able to read and write to the 'root' GCS path + compute-service-account = "default" + + # Location to submit jobs to Batch and store job metadata. + location = "us-central1" + + # Specifies the minimum file size for `gsutil cp` to use parallel composite uploads during delocalization. + # Parallel composite uploads can result in a significant improvement in delocalization speed for large files + # but may introduce complexities in downloading such files from GCS, please see + # https://cloud.google.com/storage/docs/gsutil/commands/cp#parallel-composite-uploads for more information. + # + # If set to 0 parallel composite uploads are turned off. The default Cromwell configuration turns off + # parallel composite uploads, this sample configuration turns it on for files of 150M or larger. + parallel-composite-upload-threshold="150M" + } + + filesystems { + gcs { + # A reference to a potentially different auth for manipulating files via engine functions. + auth = "application-default" + # Google project which will be billed for the requests + project = "google-billing-project" + + caching { + # When a cache hit is found, the following duplication strategy will be followed to use the cached outputs + # Possible values: "copy", "reference". Defaults to "copy" + # "copy": Copy the output files + # "reference": DO NOT copy the output files but point to the original output files instead. + # Will still make sure than all the original output files exist and are accessible before + # going forward with the cache hit. + duplication-strategy = "copy" + } + } + } + } + } + } +} +``` + +**Run Workflow** + +`java -Dconfig.file=google.conf -jar cromwell-67.jar run hello.wdl -i hello.inputs` + +**Outputs** + +The end of your workflow logs should report the workflow outputs. + +``` +[info] SingleWorkflowRunnerActor workflow finished with status 'Succeeded'. +{ + "outputs": { + "wf_hello.hello.message": "Hello World! Welcome to Cromwell . . . on Google Cloud!" + }, + "id": "08213b40-bcf5-470d-b8b7-1d1a9dccb10e" +} +``` + +Success! 
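+
+If you would like to inspect what Cromwell wrote to your bucket, a `gsutil` listing along the following lines should show the call's logs and outputs. The layout follows Cromwell's usual execution directory structure (workflow name, then workflow id, then call name); substitute the workflow id reported for your own run:
+
+```
+gsutil ls gs://google-bucket-name/wf_hello/08213b40-bcf5-470d-b8b7-1d1a9dccb10e/call-hello/
+```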
+ +### Next steps + +You might find the following tutorials interesting to tackle next: + +* [Persisting Data Between Restarts](PersistentServer) +* [Server Mode](ServerMode.md) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 1a7681da1e3..b2e94cc7470 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -55,6 +55,7 @@ object Dependencies { private val googleGenomicsServicesV2Alpha1ApiV = "v2alpha1-rev20210811-1.32.1" private val googleHttpClientApacheV = "2.1.2" private val googleHttpClientV = "1.42.3" + private val googleCloudBatchV1 = "0.18.0" // latest date via: https://mvnrepository.com/artifact/com.google.apis/google-api-services-lifesciences private val googleLifeSciencesServicesV2BetaApiV = "v2beta-rev20220916-2.0.0" private val googleOauth2V = "1.5.3" @@ -373,6 +374,12 @@ object Dependencies { exclude("com.google.guava", "guava-jdk5") ) + private val googleBatchv1Dependency = List( + "com.google.cloud" % "google-cloud-batch" % googleCloudBatchV1, + "com.google.api.grpc" % "proto-google-cloud-batch-v1" % googleCloudBatchV1, + "com.google.api.grpc" % "proto-google-cloud-resourcemanager-v3" % "1.17.0" + ) + /* Used instead of `"org.lerch" % "s3fs" % s3fsV exclude("org.slf4j", "jcl-over-slf4j")` org.lerch:s3fs:1.0.1 depends on a preview release of software.amazon.awssdk:s3. @@ -417,7 +424,7 @@ object Dependencies { "com.google.apis" % "google-api-services-cloudkms" % googleCloudKmsV exclude("com.google.guava", "guava-jdk5"), "org.glassfish.hk2.external" % "jakarta.inject" % jakartaInjectV, - ) ++ googleGenomicsV2Alpha1Dependency ++ googleLifeSciencesV2BetaDependency + ) ++ googleGenomicsV2Alpha1Dependency ++ googleLifeSciencesV2BetaDependency ++ googleBatchv1Dependency private val dbmsDependencies = List( "org.hsqldb" % "hsqldb" % hsqldbV, @@ -621,11 +628,12 @@ object Dependencies { "org.lz4" % "lz4-java" % lz4JavaV ) val scalaTest = "org.scalatest" %% "scalatest" % scalatestV + val testDependencies: List[ModuleID] = List( - scalaTest, + "org.scalatest" %% "scalatest" % scalatestV, // Use mockito Java DSL directly instead of the numerous and often hard to keep updated Scala DSLs. // See also scaladoc in common.mock.MockSugar and that trait's various usages. - "org.mockito" % "mockito-core" % mockitoV, + "org.mockito" % "mockito-core" % mockitoV ) ++ slf4jBindingDependencies // During testing, add an slf4j binding for _all_ libraries. 
val kindProjectorPlugin = "org.typelevel" % "kind-projector" % kindProjectorV cross CrossVersion.full diff --git a/src/ci/bin/testCentaurGcpBatch.sh b/src/ci/bin/testCentaurGcpBatch.sh new file mode 100755 index 00000000000..ad5da44eacb --- /dev/null +++ b/src/ci/bin/testCentaurGcpBatch.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -o errexit -o nounset -o pipefail +export CROMWELL_BUILD_REQUIRES_SECURE=true +# import in shellcheck / CI / IntelliJ compatible ways +# shellcheck source=/dev/null +source "${BASH_SOURCE%/*}/test.inc.sh" || source test.inc.sh +# shellcheck source=/dev/null +source "${BASH_SOURCE%/*}/test_gcpbatch.inc.sh" || source test_gcpbatch.inc.sh + +cromwell::build::setup_common_environment + +cromwell::build::setup_centaur_environment + +cromwell::build::batch::setup_batch_centaur_environment + +cromwell::build::assemble_jars + +cromwell::build::run_centaur \ + -p 100 \ + -e localdockertest \ + -e relative_output_paths \ + -e relative_output_paths_colliding \ + -e standard_output_paths_colliding_prevented \ + -e papi_v2alpha1_gcsa \ + +cromwell::build::generate_code_coverage diff --git a/src/ci/bin/test_gcpbatch.inc.sh b/src/ci/bin/test_gcpbatch.inc.sh new file mode 100644 index 00000000000..f75764b6006 --- /dev/null +++ b/src/ci/bin/test_gcpbatch.inc.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -o errexit -o nounset -o pipefail +# import in shellcheck / CI / IntelliJ compatible ways +# shellcheck source=/dev/null +source "${BASH_SOURCE%/*}/test.inc.sh" || source test.inc.sh + +# A set of common Gcp Batch functions for use in other scripts. +# +# Functions: +# +# - cromwell::build::batch::* +# Functions for use in other Papi scripts +# +# - cromwell::private::batch::batch::* +# Functions for use only within this file by cromwell::build::batch::* functions +# + +cromwell::private::batch::setup_batch_gcloud() { + CROMWELL_BUILD_BATCH_AUTH_JSON="${CROMWELL_BUILD_RESOURCES_DIRECTORY}/cromwell-service-account.json" + CROMWELL_BUILD_BATCH_CLIENT_EMAIL="$(jq --exit-status --raw-output .client_email "${CROMWELL_BUILD_BATCH_AUTH_JSON}")" + CROMWELL_BUILD_BATCH_PROJECT_ID="$(jq --exit-status --raw-output .project_id "${CROMWELL_BUILD_BATCH_AUTH_JSON}")" + CROMWELL_BUILD_BATCH_GCR_IMAGES="${CROMWELL_BUILD_RESOURCES_DIRECTORY}/cromwell_build_batch_gcloud_images_temporary.$$" + CROMWELL_BUILD_BATCH_CLOUDSDK_CONFIG="${CROMWELL_BUILD_RESOURCES_DIRECTORY}/cromwell_build_batch_gcloud_config.$$" + + export CROMWELL_BUILD_BATCH_AUTH_JSON + export CROMWELL_BUILD_BATCH_CLIENT_EMAIL + export CROMWELL_BUILD_BATCH_CLOUDSDK_CONFIG + export CROMWELL_BUILD_BATCH_GCR_IMAGES + export CROMWELL_BUILD_BATCH_PROJECT_ID + + if [[ "${CROMWELL_BUILD_PROVIDER}" == "${CROMWELL_BUILD_PROVIDER_TRAVIS}" ]]; then + cromwell::private::batch::install_gcloud + fi + + # All `gcloud` commands should use this configuration directory. 
+ # https://stackoverflow.com/questions/34883810/how-to-authenticate-google-apis-with-different-service-account-credentials + # https://github.com/googleapis/google-auth-library-java/issues/58 + export CLOUDSDK_CONFIG="${CROMWELL_BUILD_BATCH_CLOUDSDK_CONFIG}" + + cromwell::build::add_exit_function cromwell::private::batch::teardown_batch_gcloud + + gcloud auth activate-service-account --key-file="${CROMWELL_BUILD_BATCH_AUTH_JSON}" + export GOOGLE_APPLICATION_CREDENTIALS="${CROMWELL_BUILD_BATCH_AUTH_JSON}" + gcloud config set account "${CROMWELL_BUILD_BATCH_CLIENT_EMAIL}" + gcloud config set project "${CROMWELL_BUILD_BATCH_PROJECT_ID}" +} + +cromwell::private::batch::teardown_batch_gcloud() { + cromwell::build::delete_docker_images cromwell::private::batch::gcr_image_delete "${CROMWELL_BUILD_BATCH_GCR_IMAGES}" +} + +cromwell::private::batch::install_gcloud() { + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ + | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list + sudo apt-get install -y apt-transport-https ca-certificates + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ + | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - + sudo apt-get update + sudo apt-get install -y google-cloud-sdk +} + +cromwell::private::batch::gcr_image_push() { + local executable_name + local docker_image + + executable_name="${1:?gcr_image_push called without an executable_name}" + docker_image="${2:?gcr_image_push called without an docker_image}" + shift + shift + + cromwell::build::build_docker_image "${executable_name}" "${docker_image}" + echo "${docker_image}" >> "${CROMWELL_BUILD_BATCH_GCR_IMAGES}" + # Use cat to quiet docker: https://github.com/moby/moby/issues/36655#issuecomment-375136087 + docker push "${docker_image}" | cat +} + +cromwell::private::batch::gcr_image_delete() { + local docker_image_name + docker_image_name="${1:?gcr_image_delete called without a docker_image_name}" + shift + gcloud container images delete "${docker_image_name}" --force-delete-tags --quiet +} + +cromwell::private::batch::setup_batch_gcr() { + # Build a DOS/DRS localizer image from source, or for faster local debugging use an already provided image + if [[ -n "${CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS:+set}" ]]; then + # If CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS is already set then use that image + echo "Using CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS='${CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS}'" + elif command -v docker; then + # Upload images built from this commit + gcloud auth configure-docker --quiet + CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS="gcr.io/${CROMWELL_BUILD_BATCH_PROJECT_ID}/cromwell-drs-localizer:${CROMWELL_BUILD_DOCKER_TAG}-batch" + cromwell::private::batch::gcr_image_push cromwell-drs-localizer "${CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS}" + export CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS + else + echo "Error: BA-6546 The environment variable CROMWELL_BUILD_BATCH_DOCKER_IMAGE_DRS must be set/export pointing to a valid docker image" >&2 + exit 1 + fi +} + +cromwell::private::batch::setup_batch_service_account() { + CROMWELL_BUILD_BATCH_AUTH_MODE="service-account" + export CROMWELL_BUILD_BATCH_AUTH_MODE +} + +cromwell::build::batch::setup_batch_centaur_environment() { + cromwell::private::batch::setup_batch_gcloud + if [[ "${CROMWELL_BUILD_PROVIDER}" != "${CROMWELL_BUILD_PROVIDER_JENKINS}" ]] + then + cromwell::private::batch::setup_batch_gcr + fi + cromwell::private::batch::setup_batch_service_account +} + 
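+# Conformance runs only need the service account auth mode; no gcloud or GCR setup is performed here.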
+cromwell::build::batch::setup_batch_conformance_environment() { + cromwell::private::batch::setup_batch_service_account +} diff --git a/src/ci/bin/test_papi.inc.sh b/src/ci/bin/test_papi.inc.sh index 51f33a459b3..e3c253d7455 100644 --- a/src/ci/bin/test_papi.inc.sh +++ b/src/ci/bin/test_papi.inc.sh @@ -89,7 +89,7 @@ cromwell::private::papi::setup_papi_gcr() { elif command -v docker; then # Upload images built from this commit gcloud auth configure-docker --quiet - CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS="gcr.io/${CROMWELL_BUILD_PAPI_PROJECT_ID}/cromwell-drs-localizer:${CROMWELL_BUILD_DOCKER_TAG}" + CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS="gcr.io/${CROMWELL_BUILD_PAPI_PROJECT_ID}/cromwell-drs-localizer:${CROMWELL_BUILD_DOCKER_TAG}-papi" cromwell::private::papi::gcr_image_push cromwell-drs-localizer "${CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS}" export CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS else diff --git a/src/ci/resources/gcp_batch_application.conf b/src/ci/resources/gcp_batch_application.conf new file mode 100644 index 00000000000..61c6ffee36b --- /dev/null +++ b/src/ci/resources/gcp_batch_application.conf @@ -0,0 +1,12 @@ +include "gcp_batch_shared_application.inc.conf" + +backend { + providers { + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory" + config { + include "gcp_batch_provider_config.inc.conf" + } + } + } +} diff --git a/src/ci/resources/gcp_batch_application.inc.conf.ctmpl b/src/ci/resources/gcp_batch_application.inc.conf.ctmpl new file mode 100644 index 00000000000..aebdd7ea377 --- /dev/null +++ b/src/ci/resources/gcp_batch_application.inc.conf.ctmpl @@ -0,0 +1,73 @@ +backend { + providers { + Local.config.filesystems.gcs.auth = "service_account" + } +} + +engine { + filesystems { + gcs { + auth = "service_account" + project = "broad-dsde-cromwell-dev" + } + drs { + auth = "service_account" + } + } +} + +google { + application-name = "cromwell" + json-dir = "Error: BA-6546 The environment variable CROMWELL_BUILD_RESOURCES_DIRECTORY must be set/export pointing to a valid path such as '${YOUR_CROMWELL_DIR}/target/ci/resources'" + json-dir = ${?CROMWELL_BUILD_RESOURCES_DIRECTORY} + auths = [ + { + name = "service_account" + scheme = "service_account" + json-file = ${google.json-dir}/cromwell-centaur-service-account.json + } + { + name = "requester_pays_service_account" + scheme = "service_account" + # This service account does have billing permission and can be used for requester pays + json-file = ${google.json-dir}/cromwell-centaur-requester-pays-service-account.json + } + { + name = "google_compute_service_account" + scheme = "service_account" + # This service account has only: + # 1. The role "Genomics Pipelines Runner" + # 2. 
Has been added a "Service Account User" on cromwell@broad-dsde-cromwell-dev.iam.gserviceaccount.com + json-file = ${google.json-dir}/cromwell-centaur-google-compute-service-account.json + } + { + name = "user_service_account" + scheme = "user_service_account" + } + ] +} + +services { + HealthMonitor { + class = "cromwell.services.healthmonitor.impl.HealthMonitorServiceActor" + config { + + //check-papi-backends: [ Papi ] + check-gcs: true + check-engine-database: true + check-dockerhub: true + + google-auth-name = "service_account" + gcs-bucket-to-check = "cloud-cromwell-dev" + } + } +} + +filesystems.drs.global.config.resolver.url = "https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve" + +drs { + localization { + docker-image = "Error: BA-6546 The environment variable CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS must be set/export pointing to a valid docker image" + docker-image = ${?CROMWELL_BUILD_PAPI_DOCKER_IMAGE_DRS} + } +} diff --git a/src/ci/resources/gcp_batch_provider_config.inc.conf b/src/ci/resources/gcp_batch_provider_config.inc.conf new file mode 100644 index 00000000000..a7f92e41e8c --- /dev/null +++ b/src/ci/resources/gcp_batch_provider_config.inc.conf @@ -0,0 +1,20 @@ +project = "broad-dsde-cromwell-dev" +root = "gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/ci" +maximum-polling-interval = 600 +concurrent-job-limit = 1000 + +genomics { + auth = "service_account" + location = "us-central1" +} + +filesystems { + gcs { + auth = "service_account" + project = "broad-dsde-cromwell-dev" + } +} + +slow-job-warning-time: 20 minutes + +checkpointing-interval: "10 seconds" diff --git a/src/ci/resources/gcp_batch_shared_application.inc.conf b/src/ci/resources/gcp_batch_shared_application.inc.conf new file mode 100644 index 00000000000..09124a5bb28 --- /dev/null +++ b/src/ci/resources/gcp_batch_shared_application.inc.conf @@ -0,0 +1,34 @@ +include required(classpath("application.conf")) +include "build_application.inc.conf" +include "gcp_batch_application.inc.conf" + +services { + HealthMonitor.config { + check-gcpbatch-backends: [ + "GCPBATCH", + ] + } +} + +backend { + default = "GCPBATCH" + enabled = ["GCPBATCH"] + providers { + # Default gcp batch backend + GCPBATCH { + actor-factory = "cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory" + config { + # When importing: Remember to also include an appropriate provider_config.inc.conf here. + + # TODO: Should not need because already included. testing. + include "gcp_batch_provider_config.inc.conf" + + include "dockerhub_provider_config_v2.inc.conf" + # This SA does not have permission to bill this project when accessing RP buckets. 
+ # This is on purpose so that we can assert the failure (see requester_pays_localization_negative) + genomics.compute-service-account = "centaur@broad-dsde-cromwell-dev.iam.gserviceaccount.com" + filesystems.http {} + } + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/GcpBatchBackendLifecycleActorFactory.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/GcpBatchBackendLifecycleActorFactory.scala new file mode 100644 index 00000000000..1bf708e8f8c --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/GcpBatchBackendLifecycleActorFactory.scala @@ -0,0 +1,115 @@ +package cromwell.backend.google.batch + +import akka.actor.{ActorRef, Props} +import com.google.api.client.util.ExponentialBackOff +import com.typesafe.scalalogging.StrictLogging +import cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory.{preemptionCountKey, robustBuildAttributes, unexpectedRetryCountKey} +import cromwell.backend.google.batch.actors._ +import cromwell.backend.google.batch.api.{GcpBatchApiRequestHandler, GcpBatchRequestFactoryImpl} +import cromwell.backend.google.batch.models.{GcpBatchConfiguration, GcpBatchConfigurationAttributes} +import cromwell.backend.google.batch.callcaching.{BatchBackendCacheHitCopyingActor, BatchBackendFileHashingActor} +import cromwell.backend.standard._ +import cromwell.backend.standard.callcaching.{StandardCacheHitCopyingActor, StandardFileHashingActor} +import cromwell.backend.{BackendConfigurationDescriptor, BackendInitializationData, BackendWorkflowDescriptor, JobExecutionMap} +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.core.CallOutputs +import wom.graph.CommandCallNode + +import scala.util.{Failure, Try} + +class GcpBatchBackendLifecycleActorFactory(override val name: String, override val configurationDescriptor: BackendConfigurationDescriptor) + extends StandardLifecycleActorFactory { + + override val requestedKeyValueStoreKeys: Seq[String] = Seq(preemptionCountKey, unexpectedRetryCountKey) + + override def jobIdKey: String = "__gcp_batch" + protected val googleConfig: GoogleConfiguration = GoogleConfiguration(configurationDescriptor.globalConfig) + + override lazy val initializationActorClass: Class[_ <: StandardInitializationActor] = classOf[GcpBatchInitializationActor] + + override def asyncExecutionActorClass: Class[_ <: StandardAsyncExecutionActor] = + classOf[GcpBatchAsyncBackendJobExecutionActor] + + override lazy val finalizationActorClassOption: Option[Class[_ <: StandardFinalizationActor]] = + Option(classOf[GcpBatchFinalizationActor]) + + + + protected val batchAttributes: GcpBatchConfigurationAttributes = { + def defaultBuildAttributes()= + GcpBatchConfigurationAttributes(googleConfig, configurationDescriptor.backendConfig, "batchConfig") + robustBuildAttributes(defaultBuildAttributes) + } + + val batchConfiguration = new GcpBatchConfiguration(configurationDescriptor, googleConfig, batchAttributes) + + override def workflowInitializationActorParams( + + workflowDescriptor: BackendWorkflowDescriptor, + ioActor: ActorRef, + calls: Set[CommandCallNode], + serviceRegistryActor: ActorRef, + restart: Boolean): StandardInitializationActorParams = { + GcpBatchInitializationActorParams(workflowDescriptor, ioActor , calls, batchConfiguration, serviceRegistryActor, restart) + } + + override def workflowFinalizationActorParams( + workflowDescriptor: BackendWorkflowDescriptor, + ioActor: ActorRef, + 
//batchConfiguration: GcpBatchConfiguration, + calls: Set[CommandCallNode], + jobExecutionMap: JobExecutionMap, + workflowOutputs: CallOutputs, + initializationDataOption: Option[BackendInitializationData]): StandardFinalizationActorParams = { + GcpBatchFinalizationActorParams(workflowDescriptor, ioActor, batchConfiguration, calls, jobExecutionMap, workflowOutputs, initializationDataOption) + } + + override lazy val cacheHitCopyingActorClassOption: Option[Class[_ <: StandardCacheHitCopyingActor]] = { + Option(classOf[BatchBackendCacheHitCopyingActor]) + } + + override lazy val fileHashingActorClassOption: Option[Class[_ <: StandardFileHashingActor]] = Option(classOf[BatchBackendFileHashingActor]) + + override def backendSingletonActorProps(serviceRegistryActor: ActorRef): Option[Props] = { + val requestHandler = new GcpBatchApiRequestHandler + val requestFactory = new GcpBatchRequestFactoryImpl()(batchConfiguration.batchAttributes.gcsTransferConfiguration) + Option(GcpBatchBackendSingletonActor.props(requestFactory, serviceRegistryActor = serviceRegistryActor)(requestHandler)) + } +} + +object GcpBatchBackendLifecycleActorFactory extends StrictLogging { + val preemptionCountKey = "PreemptionCount" + val unexpectedRetryCountKey = "UnexpectedRetryCount" + + + private [batch] def robustBuildAttributes(buildAttributes: () => GcpBatchConfigurationAttributes, + maxAttempts: Int = 3, + initialIntervalMillis: Int = 5000, + maxIntervalMillis: Int = 10000, + multiplier: Double = 1.5, + randomizationFactor: Double = 0.5): GcpBatchConfigurationAttributes = { + val backoff = new ExponentialBackOff.Builder() + .setInitialIntervalMillis(initialIntervalMillis) + .setMaxIntervalMillis(maxIntervalMillis) + .setMultiplier(multiplier) + .setRandomizationFactor(randomizationFactor) + .build() + + // `attempt` is 1-based + def build(attempt: Int): Try[GcpBatchConfigurationAttributes] = { + Try { + buildAttributes() + } recoverWith { + // Try again if this was an Exception (as opposed to an Error) and we have not hit maxAttempts + case ex: Exception if attempt < maxAttempts => + logger + .warn(s"Failed to build GcpBatchConfigurationAttributes on attempt $attempt of $maxAttempts, retrying.", ex) + Thread.sleep(backoff.nextBackOffMillis()) + build(attempt + 1) + case e => Failure(new RuntimeException(s"Failed to build GcpBatchConfigurationAttributes on attempt $attempt of $maxAttempts", e)) + } + } + // This intentionally throws if the final result of `build` is a `Failure`. + build(attempt = 1).get + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiAbortClient.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiAbortClient.scala new file mode 100644 index 00000000000..f025ae05282 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiAbortClient.scala @@ -0,0 +1,23 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.{Actor, ActorLogging, ActorRef} +import com.google.cloud.batch.v1.JobName +import cromwell.backend.google.batch.monitoring.BatchInstrumentation + +trait BatchApiAbortClient { this: Actor with ActorLogging with BatchInstrumentation => + + def abortJob(jobName: JobName, backendSingletonActor: ActorRef): Unit = { + backendSingletonActor ! 
GcpBatchBackendSingletonActor.Action.AbortJob(jobName) + } + + def abortActorClientReceive: Actor.Receive = { + case GcpBatchBackendSingletonActor.Event.JobAbortRequestSent(job) => + log.info(s"Job aborted on GCP: ${job.getName}") + abortSuccess() + + case GcpBatchBackendSingletonActor.Event.ActionFailed(jobName, cause) => + val msg = s"Failed to abort job ($jobName) from GCP" + log.error(cause, msg) + abortFailed() + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiFetchJobClient.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiFetchJobClient.scala new file mode 100644 index 00000000000..e502bb18c2e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiFetchJobClient.scala @@ -0,0 +1,47 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.{Actor, ActorLogging, ActorRef} +import com.google.cloud.batch.v1.{Job, JobName} +import cromwell.backend.google.batch.monitoring.BatchInstrumentation + +import scala.concurrent.{Future, Promise} +import scala.util.{Failure, Success, Try} + +/** + * Allows fetching a job + */ +trait BatchApiFetchJobClient { this: Actor with ActorLogging with BatchInstrumentation => + + private var pollingActorClientPromise: Option[Promise[Job]] = None + + // handles messages produced from GcpBatchBackendSingletonActor + def pollingActorClientReceive: Actor.Receive = { + case GcpBatchBackendSingletonActor.Event.JobStatusRetrieved(job) => + log.info(s"Job retrieved from GCP: ${job.getName}: ${job.getStatus}") + pollSuccess() + completePromise(Success(job)) + + case GcpBatchBackendSingletonActor.Event.ActionFailed(jobName, cause) => + val msg = s"Failed to query job ($jobName) from GCP" + log.error(cause, msg) + pollFailed() + completePromise(Failure(cause)) + } + + private def completePromise(result: Try[Job]): Unit = { + pollingActorClientPromise foreach { _.complete(result) } + pollingActorClientPromise = None + } + + def fetchJob(jobName: JobName, backendSingletonActor: ActorRef): Future[Job] = { + pollingActorClientPromise match { + case Some(p) => p.future + case None => + backendSingletonActor ! 
GcpBatchBackendSingletonActor.Action.QueryJob(jobName) + + val newPromise = Promise[Job]() + pollingActorClientPromise = Option(newPromise) + newPromise.future + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala new file mode 100644 index 00000000000..0763369dc50 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala @@ -0,0 +1,50 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.{Actor, ActorLogging, ActorRef} +import cromwell.backend.google.batch.models.GcpBatchRequest +import cromwell.backend.google.batch.monitoring.BatchInstrumentation +import cromwell.backend.standard.StandardAsyncJob + +import scala.concurrent.{Future, Promise} +import scala.util.{Failure, Success, Try} + +/** + * Handles the flow for submitting a single job to GCP, we can't do anything when that fails + */ +trait BatchApiRunCreationClient { this: Actor with ActorLogging with BatchInstrumentation => + private var runCreationClientPromise: Option[Promise[StandardAsyncJob]] = None + + // handles messages produced from GcpBatchBackendSingletonActor + def runCreationClientReceive: Actor.Receive = { + case GcpBatchBackendSingletonActor.Event.JobSubmitted(job) => + log.info(s"Job submitted to GCP: ${job.getName}") + runSuccess() + completePromise(Success(StandardAsyncJob(job.getName))) + + case GcpBatchBackendSingletonActor.Event.ActionFailed(jobName, cause) => + val msg = s"Failed to submit job ($jobName) to GCP" + log.error(cause, msg) + runFailed() + completePromise(Failure(cause)) + } + + private def completePromise(job: Try[StandardAsyncJob]): Unit = { + runCreationClientPromise.foreach { + _.complete(job) + } + runCreationClientPromise = None + } + + def runBatchJob(request: GcpBatchRequest, backendSingletonActor: ActorRef): Future[StandardAsyncJob] = { + runCreationClientPromise match { + case Some(p) => + p.future + case None => + log.info(s"Asking singleton actor to submit a job: ${request.jobName}") + backendSingletonActor ! 
GcpBatchBackendSingletonActor.Action.SubmitJob(request) + val newPromise = Promise[StandardAsyncJob]() + runCreationClientPromise = Option(newPromise) + newPromise.future + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala new file mode 100644 index 00000000000..b2b10d92afb --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala @@ -0,0 +1,1086 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.{ActorLogging, ActorRef} +import akka.http.scaladsl.model.{ContentType, ContentTypes} +import akka.pattern.AskSupport +import cats.data.NonEmptyList +import cats.data.Validated.Valid +import cats.implicits._ +import com.google.cloud.batch.v1.JobName +import com.google.cloud.storage.contrib.nio.CloudStorageOptions +import common.util.StringUtil._ +import common.validation.ErrorOr.ErrorOr +import cromwell.backend._ +import cromwell.backend.async.{ExecutionHandle, PendingExecutionHandle} +import cromwell.backend.google.batch.api.GcpBatchRequestFactory._ +import cromwell.backend.google.batch.io._ +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.GcpBatchJobPaths.GcsTransferLibraryName +import cromwell.backend.google.batch.models.RunStatus.TerminalRunStatus +import cromwell.backend.google.batch.models._ +import cromwell.backend.google.batch.monitoring.{BatchInstrumentation, CheckpointingConfiguration, MonitoringImage} +import cromwell.backend.google.batch.runnable.WorkflowOptionKeys +import cromwell.backend.google.batch.util.{GcpBatchReferenceFilesMappingOperations, RuntimeOutputMapping} +import cromwell.filesystems.gcs.GcsPathBuilder +import cromwell.filesystems.gcs.GcsPathBuilder.ValidFullGcsPath +import java.io.FileNotFoundException +import cromwell.backend.standard.{StandardAdHocValue, StandardAsyncExecutionActor, StandardAsyncExecutionActorParams, StandardAsyncJob} +import cromwell.core._ +import cromwell.core.io.IoCommandBuilder +import cromwell.core.path.{DefaultPathBuilder, Path} +import cromwell.core.retry.SimpleExponentialBackoff +import cromwell.filesystems.drs.{DrsPath, DrsResolver} +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder +import cromwell.filesystems.gcs.GcsPath +import cromwell.filesystems.http.HttpPath +import cromwell.filesystems.sra.SraPath +import cromwell.services.instrumentation.CromwellInstrumentation +import cromwell.services.metadata.CallMetadataKeys +import mouse.all._ +import shapeless.Coproduct +import org.apache.commons.codec.digest.DigestUtils +import org.apache.commons.csv.{CSVFormat, CSVPrinter} +import org.apache.commons.io.output.ByteArrayOutputStream +import wdl4s.parser.MemoryUnit +import wom.callable.Callable.OutputDefinition +import wom.callable.MetaValueElement.{MetaValueElementBoolean, MetaValueElementObject} +import wom.callable.{AdHocValue, RuntimeEnvironment} +import wom.core.FullyQualifiedName +import wom.expression.{FileEvaluation, NoIoFunctionSet} +import wom.format.MemorySize +import wom.values._ +import java.io.OutputStreamWriter +import java.nio.charset.Charset +import java.util.Base64 +import scala.concurrent.Future +import scala.concurrent.duration._ +import scala.io.Source +import scala.language.postfixOps 
+import scala.util.{Failure, Success, Try} +import scala.util.control.NoStackTrace + +object GcpBatchAsyncBackendJobExecutionActor { + + // GCS path regexes comments: + // - The (?s) option at the start makes '.' expression to match any symbol, including '\n' + // - All valid GCS paths start with gs:// + // - Bucket names: + // - The bucket name is matched inside a set of '()'s so it can be used later. + // - The bucket name must start with a letter or number (https://cloud.google.com/storage/docs/naming) + // - Then, anything that is not a '/' is part of the bucket name + // - Allow zero or more subdirectories, with (/[^/]+)* + // - Then, for files: + // - There must be at least one '/', followed by some content in the file name. + // - Or, then, for directories: + // - If we got this far, we already have a valid directory path. Allow it to optionally end with a `/` character. + private val gcsFilePathMatcher = "(?s)^gs://([a-zA-Z0-9][^/]+)(/[^/]+)*/[^/]+$".r + private val gcsDirectoryPathMatcher = "(?s)^gs://([a-zA-Z0-9][^/]+)(/[^/]+)*/?$".r + + val GcpBatchOperationIdKey = "__gcp_batch_operation_id" + + type GcpBatchPendingExecutionHandle = PendingExecutionHandle[StandardAsyncJob, Run, RunStatus] + + val plainTextContentType: Option[ContentType.WithCharset] = Option(ContentTypes.`text/plain(UTF-8)`) + + private[batch] def groupParametersByGcsBucket[T <: BatchParameter](parameters: List[T]): Map[String, NonEmptyList[T]] = { + parameters.map { param => + def pathTypeString = if (param.isFileParameter) "File" else "Directory" + val regexToUse = if (param.isFileParameter) gcsFilePathMatcher else gcsDirectoryPathMatcher + + param.cloudPath.pathAsString match { + case regexToUse(bucket) => Map(bucket -> NonEmptyList.of(param)) + case regexToUse(bucket, _) => Map(bucket -> NonEmptyList.of(param)) + case other => + throw new Exception(s"$pathTypeString path '$other' did not match the expected regex: ${regexToUse.pattern.toString}") with NoStackTrace + } + } combineAll + } + + private[batch] def generateDrsLocalizerManifest(inputs: List[GcpBatchInput]): String = { + val outputStream = new ByteArrayOutputStream() + val csvPrinter = new CSVPrinter(new OutputStreamWriter(outputStream), CSVFormat.DEFAULT) + val drsFileInputs = inputs collect { + case drsInput@GcpBatchFileInput(_, drsPath: DrsPath, _, _) => (drsInput, drsPath) + } + drsFileInputs foreach { case (drsInput, drsPath) => + csvPrinter.printRecord(drsPath.pathAsString, drsInput.containerPath.pathAsString) + } + csvPrinter.close(true) + outputStream.toString(Charset.defaultCharset()) + } + +} + +class GcpBatchAsyncBackendJobExecutionActor(override val standardParams: StandardAsyncExecutionActorParams) + extends BackendJobLifecycleActor + with StandardAsyncExecutionActor + with BatchApiRunCreationClient + with BatchApiFetchJobClient + with BatchApiAbortClient + with AskSupport + with GcpBatchJobCachingActorHelper + with GcpBatchReferenceFilesMappingOperations + with BatchInstrumentation + with ActorLogging + with CromwellInstrumentation { + + import GcpBatchAsyncBackendJobExecutionActor._ + override lazy val ioCommandBuilder: IoCommandBuilder = GcsBatchCommandBuilder + + lazy val workflowId: WorkflowId = jobDescriptor.workflowDescriptor.id + + /** The type of the run info when a job is started. */ + override type StandardAsyncRunInfo = Run + + /** The type of the run status returned during each poll. 
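+   * (For example, a RunStatus.Succeeded value once the underlying Batch job reaches a terminal state.)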
*/ + override type StandardAsyncRunState = RunStatus + + override def receive: Receive = runCreationClientReceive orElse pollingActorClientReceive orElse abortActorClientReceive orElse kvClientReceive orElse super.receive + + /** Should return true if the status contained in `thiz` is equivalent to `that`, delta any other data that might be carried around + * in the state type. + */ + def statusEquivalentTo(thiz: StandardAsyncRunState)(that: StandardAsyncRunState): Boolean = thiz == that + + protected lazy val cmdInput: GcpBatchFileInput = + GcpBatchFileInput(GcpBatchJobPaths.BatchExecParamName, gcpBatchCallPaths.script, DefaultPathBuilder.get(gcpBatchCallPaths.scriptFilename), workingDisk) + + private lazy val jobDockerImage = jobDescriptor.maybeCallCachingEligible.dockerHash + .getOrElse(runtimeAttributes.dockerImage) + + override def dockerImageUsed: Option[String] = Option(jobDockerImage) + + //noinspection ActorMutableStateInspection + + // Need to add previousRetryReasons and preemptible in order to get preemptible to work in the tests + protected val previousRetryReasons: ErrorOr[PreviousRetryReasons] = PreviousRetryReasons.tryApply(jobDescriptor.prefetchedKvStoreEntries, jobDescriptor.key.attempt) + + lazy val preemptible: Boolean = previousRetryReasons match { + case Valid(PreviousRetryReasons(p, _)) => p < maxPreemption + case _ => false + } + + override def tryAbort(job: StandardAsyncJob): Unit = abortJob(JobName.parse(job.jobId), backendSingletonActor) + + val backendSingletonActor: ActorRef = standardParams.backendSingletonActorOption + .getOrElse(throw new RuntimeException("GCP Batch actor cannot exist without its backend singleton 2")) + + + /** + * Takes two arrays of remote and local WOM File paths and generates the necessary `GcpBatchInput`s. + */ + protected def gcpBatchInputsFromWomFiles(inputName: String, + remotePathArray: Seq[WomFile], + localPathArray: Seq[WomFile], + jobDescriptor: BackendJobDescriptor): Iterable[GcpBatchInput] = { + (remotePathArray zip localPathArray) flatMap { + case (remotePath: WomMaybeListedDirectory, localPath) => + maybeListedDirectoryToBatchParameters(inputName, remotePath, localPath.valueString) + case (remotePath: WomUnlistedDirectory, localPath) => + Seq(GcpBatchDirectoryInput(inputName, getPath(remotePath.valueString).get, DefaultPathBuilder.get(localPath.valueString), workingDisk)) + case (remotePath: WomMaybePopulatedFile, localPath) => + maybePopulatedFileToBatchParameters(inputName, remotePath, localPath.valueString) + case (remotePath, localPath) => + Seq(GcpBatchFileInput(inputName, getPath(remotePath.valueString).get, DefaultPathBuilder.get(localPath.valueString), workingDisk)) + } + } + + private def maybePopulatedFileToBatchParameters(inputName: String, maybePopulatedFile: WomMaybePopulatedFile, localPath: String) = { + val secondaryFiles = maybePopulatedFile.secondaryFiles.flatMap({ secondaryFile => + gcpBatchInputsFromWomFiles(secondaryFile.valueString, List(secondaryFile), List(relativeLocalizationPath(secondaryFile)), jobDescriptor) + }) + + Seq(GcpBatchFileInput(inputName, getPath(maybePopulatedFile.valueString).get, DefaultPathBuilder.get(localPath), workingDisk)) ++ secondaryFiles + } + + /** + * Turns WomFiles into relative paths. These paths are relative to the working disk. 
+ * + * relativeLocalizationPath("foo/bar.txt") -> "foo/bar.txt" + * relativeLocalizationPath("gs://some/bucket/foo.txt") -> "some/bucket/foo.txt" + */ + override protected def relativeLocalizationPath(file: WomFile): WomFile = { + file.mapFile(value => + getPath(value) match { + case Success(drsPath: DrsPath) => DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync() + case Success(path) => path.pathWithoutScheme + case _ => value + } + ) + } + + lazy val localMonitoringImageScriptPath: Path = + DefaultPathBuilder.get(gcpBatchCallPaths.batchMonitoringImageScriptFilename) + + override protected def fileName(file: WomFile): WomFile = { + file.mapFile(value => + getPath(value) match { + case Success(drsPath: DrsPath) => DefaultPathBuilder + .get(DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync()).name + case Success(path) => path.name + case _ => value + } + ) + } + + override lazy val inputsToNotLocalize: Set[WomFile] = { + val localizeOptional = jobDescriptor.findInputFilesByParameterMeta { + case MetaValueElementObject(values) => values.get("localization_optional").contains(MetaValueElementBoolean(true)) + case _ => false + } + val localizeSkipped = localizeOptional.filter(canSkipLocalize) + val localizeMapped = localizeSkipped.map(cloudResolveWomFile) + localizeSkipped ++ localizeMapped + } + + private def canSkipLocalize(womFile: WomFile): Boolean = { + var canSkipLocalize = true + womFile.mapFile { value => + getPath(value) match { + case Success(drsPath: DrsPath) => + val gsUriOption = DrsResolver.getSimpleGsUri(drsPath).unsafeRunSync() + if (gsUriOption.isEmpty) { + canSkipLocalize = false + } + case _ => /* ignore */ + } + value + } + canSkipLocalize + } + + // The original implementation recursively finds all non directory files, in V2 we can keep directory as is + protected lazy val callInputFiles: Map[FullyQualifiedName, Seq[WomFile]] = { + + // NOTE: This causes the tests to fail + jobDescriptor.localInputs map { + case (key, womFile) => + key -> womFile.collectAsSeq({ + case womFile: WomFile if !inputsToNotLocalize.contains(womFile) => womFile + }) + } + } + + private lazy val gcsTransferLibrary = + Source.fromInputStream(Thread.currentThread.getContextClassLoader.getResourceAsStream("gcs_transfer.sh")).mkString + + private def gcsLocalizationTransferBundle[T <: GcpBatchInput](gcsTransferConfiguration: GcsTransferConfiguration)(bucket: String, inputs: NonEmptyList[T]): String = { + val project = inputs.head.cloudPath.asInstanceOf[GcsPath].projectId + val maxAttempts = gcsTransferConfiguration.transferAttempts + + // Split files and directories out so files can possibly benefit from a `gsutil -m cp -I ...` optimization + // on a per-container-parent-directory basis. + val (files, directories) = inputs.toList partition { + _.isInstanceOf[GcpBatchFileInput] + } + + // Files with different names between cloud and container are not eligible for bulk copying. + val (filesWithSameNames, filesWithDifferentNames) = files partition { f => + f.cloudPath.asInstanceOf[GcsPath].nioPath.getFileName.toString == f.containerPath.getFileName.toString + } + + val filesByContainerParentDirectory = filesWithSameNames.groupBy(_.containerPath.parent.toString) + // Deduplicate any inputs since parallel localization can't deal with this. 
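+    // For example (hypothetical path), if the same gs://bucket/inputs/sample.bam is referenced twice for one container
+    // parent directory, converting to a Set keeps a single copy so the bulk `gsutil -m cp -I` transfer below is not
+    // asked to localize the same object twice.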
+ val uniqueFilesByContainerParentDirectory = filesByContainerParentDirectory map { case (p, fs) => p -> fs.toSet } + + val filesWithSameNamesTransferBundles: List[String] = uniqueFilesByContainerParentDirectory.toList map { case (containerParent, filesWithSameParent) => + val arrayIdentifier = s"files_to_localize_" + DigestUtils.md5Hex(bucket + containerParent) + val entries = filesWithSameParent.map(_.cloudPath) mkString("\"", "\"\n| \"", "\"") + + s""" + |# Localize files from source bucket '$bucket' to container parent directory '$containerParent'. + |$arrayIdentifier=( + | "$project" # project to use if requester pays + | "$maxAttempts" # max transfer attempts + | "${containerParent.ensureSlashed}" # container parent directory + | $entries + |) + | + |localize_files "$${$arrayIdentifier[@]}" + """.stripMargin + } + + val filesWithDifferentNamesTransferBundles = filesWithDifferentNames map { f => + val arrayIdentifier = s"singleton_file_to_localize_" + DigestUtils.md5Hex(f.cloudPath.pathAsString + f.containerPath.pathAsString) + s""" + |# Localize singleton file '${f.cloudPath.pathAsString}' to '${f.containerPath.pathAsString}'. + |$arrayIdentifier=( + | "$project" + | "$maxAttempts" + | "${f.cloudPath}" + | "${f.containerPath}" + |) + | + |localize_singleton_file "$${$arrayIdentifier[@]}" + """.stripMargin + } + + // Only write a transfer bundle for directories if there are directories to be localized. Emptiness isn't a concern + // for files since there is always at least the command script to be localized. + val directoryTransferBundle = if (directories.isEmpty) "" else { + val entries = directories flatMap { i => List(i.cloudPath, i.containerPath) } mkString("\"", "\"\n| \"", "\"") + + val arrayIdentifier = s"directories_to_localize_" + DigestUtils.md5Hex(bucket) + + s""" + |# Directories from source bucket '$bucket'. + |$arrayIdentifier=( + | "$project" # project to use if requester pays + | "$maxAttempts" # max transfer attempts + | $entries + |) + | + |localize_directories "$${$arrayIdentifier[@]}" + """.stripMargin + } + + (directoryTransferBundle :: (filesWithSameNamesTransferBundles ++ filesWithDifferentNamesTransferBundles)) mkString "\n\n" + } + + private def gcsDelocalizationTransferBundle[T <: GcpBatchOutput](transferConfiguration: GcsTransferConfiguration)(bucket: String, outputs: NonEmptyList[T]): String = { + val project = outputs.head.cloudPath.asInstanceOf[GcsPath].projectId + val maxAttempts = transferConfiguration.transferAttempts + + val transferItems = outputs.toList.flatMap { output => + val kind = output match { + case o: GcpBatchFileOutput if o.secondary => "file_or_directory" // if secondary the type is unknown + case _: GcpBatchFileOutput => "file" // a primary file + case _: GcpBatchDirectoryOutput => "directory" // a primary directory + } + + val optional = Option(output) collectFirst { case o: GcpBatchFileOutput if o.secondary || o.optional => "optional" } getOrElse "required" + val contentType = output.contentType.map(_.toString).getOrElse("") + + List(kind, output.cloudPath.toString, output.containerPath.toString, optional, contentType) + } mkString("\"", "\"\n| \"", "\"") + + val parallelCompositeUploadThreshold = jobDescriptor.workflowDescriptor.workflowOptions.getOrElse( + "parallel_composite_upload_threshold", transferConfiguration.parallelCompositeUploadThreshold) + + // Use a digest as bucket names can contain characters that are not legal in bash identifiers. 
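+    // For illustration only (hypothetical values), a bucket such as my-bucket.example produces a bash array roughly like:
+    //   delocalize_<md5 of bucket>=( "my-project" "3" "150M" "file" "gs://my-bucket.example/out.txt" "/cromwell_root/out.txt" "required" "" )
+    //   delocalize "${delocalize_<md5 of bucket>[@]}"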
+ val arrayIdentifier = s"delocalize_" + DigestUtils.md5Hex(bucket) + s""" + |# $bucket + |$arrayIdentifier=( + | "$project" # project + | "$maxAttempts" # max attempts + | "$parallelCompositeUploadThreshold" # parallel composite upload threshold, will not be used for directory types + | $transferItems + |) + | + |delocalize "$${$arrayIdentifier[@]}" + """.stripMargin + } + + private def bracketTransfersWithMessages(activity: String)(transferBody: String): String = { + List( + s"timestamped_message '$activity script execution started...'", + transferBody, + s"timestamped_message '$activity script execution complete.'" + ) mkString "\n" + } + + def uploadDrsLocalizationManifest(createParameters: CreateBatchJobParameters, cloudPath: Path): Future[Unit] = { + val content = generateDrsLocalizerManifest(createParameters.inputOutputParameters.fileInputParameters) + if (content.nonEmpty) + asyncIo.writeAsync(cloudPath, content, Seq(CloudStorageOptions.withMimeType("text/plain"))) + else + Future.unit + } + + def uploadScriptFile(): Future[Unit] = { + commandScriptContents + .fold( + errors => Future + .failed(new RuntimeException(errors + .toList + .mkString(", "))), + asyncIo + .writeAsync(jobPaths + .script, _, Seq(CloudStorageOptions.withMimeType("text/plain"))) + ) + } + + def sendGoogleLabelsToMetadata(customLabels: Seq[GcpLabel]): Unit = { + lazy val backendLabelEvents: Map[String, String] = ((backendLabels ++ customLabels) map { l => s"${CallMetadataKeys.BackendLabels}:${l.key}" -> l.value }).toMap + tellMetadata(backendLabelEvents) + } + + protected val useReferenceDisks: Boolean = { + val optionName = WorkflowOptions.UseReferenceDisks.name + workflowDescriptor.workflowOptions.getBoolean(optionName) match { + case Success(value) => value + case Failure(OptionNotFoundException(_)) => false + case Failure(f) => + // Should not happen, this case should have been screened for and fast-failed during workflow materialization. + log.error(f, s"Programmer error: unexpected failure attempting to read value for workflow option '$optionName' as a Boolean") + false + } + } + + + def getReferenceInputsToMountedPathsOpt(createParameters: CreateBatchJobParameters): Option[Map[GcpBatchInput, String]] = { + if (useReferenceDisks) { + batchAttributes + .referenceFileToDiskImageMappingOpt + .map(getReferenceInputsToMountedPathMappings(_, createParameters.inputOutputParameters.fileInputParameters)) + } else { + None + } + } + + private def generateGcsLocalizationScript(inputs: List[GcpBatchInput], + referenceInputsToMountedPathsOpt: Option[Map[GcpBatchInput, String]]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + // Generate a mapping of reference inputs to their mounted paths and a section of the localization script to + // "faux localize" these reference inputs with symlinks to their locations on mounted reference disks. 
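+    // For example (hypothetical paths), an input backed by a mounted reference disk might be "localized" with a line such as:
+    //   mkdir -p '/cromwell_root/inputs/ref' && ln -s '/mnt/ref-disk/hg38.fasta' '/cromwell_root/inputs/ref/hg38.fasta'
+    // instead of being copied down from GCS.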
+ import cromwell.backend.google.batch.runnable.RunnableUtils.shellEscaped + + val referenceFilesLocalizationScript = { + val symlinkCreationCommandsOpt = referenceInputsToMountedPathsOpt map { referenceInputsToMountedPaths => + referenceInputsToMountedPaths map { + case (input, absolutePathOnRefDisk) => + s"mkdir -p ${shellEscaped(input.containerPath.parent.pathAsString)} && ln -s ${shellEscaped(absolutePathOnRefDisk)} ${shellEscaped(input.containerPath.pathAsString)}" + } + } + + if (symlinkCreationCommandsOpt.exists(_.nonEmpty)) { + s""" + |# Faux-localizing reference files (if any) by creating symbolic links to the files located on the mounted reference disk + |${symlinkCreationCommandsOpt.get.mkString("\n")} + |""".stripMargin + } else { + "\n# No reference disks mounted / no symbolic links created since no matching reference files found in the inputs to this call.\n" + } + } + + val maybeReferenceFilesLocalizationScript = + if (useReferenceDisks) { + referenceFilesLocalizationScript + } else { + "\n# No reference disks mounted since not requested in workflow options.\n" + } + + val regularFilesLocalizationScript = { + val regularFiles = referenceInputsToMountedPathsOpt.map(maybeReferenceInputsToMountedPaths => + inputs diff maybeReferenceInputsToMountedPaths.keySet.toList + ).getOrElse(inputs) + if (regularFiles.nonEmpty) { + val bundleFunction = (gcsLocalizationTransferBundle(gcsTransferConfiguration) _).tupled + generateGcsTransferScript(regularFiles, bundleFunction) + } else { + "" + } + } + + val combinedLocalizationScript = + s""" + |$maybeReferenceFilesLocalizationScript + | + |$regularFilesLocalizationScript + |""".stripMargin + + combinedLocalizationScript |> bracketTransfersWithMessages("Localization") + } + + private def generateGcsDelocalizationScript(outputs: List[GcpBatchOutput])(implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + val bundleFunction = (gcsDelocalizationTransferBundle(gcsTransferConfiguration) _).tupled + generateGcsTransferScript(outputs, bundleFunction) |> bracketTransfersWithMessages("Delocalization") + } + + private def generateGcsTransferScript[T <: BatchParameter](items: List[T], bundleFunction: ((String, NonEmptyList[T])) => String): String = { + val gcsItems = items collect { case i if i.cloudPath.isInstanceOf[GcsPath] => i } + groupParametersByGcsBucket(gcsItems) map bundleFunction mkString "\n" + } + + def uploadGcsLocalizationScript(createParameters: CreateBatchJobParameters, + cloudPath: Path, + transferLibraryContainerPath: Path, + gcsTransferConfiguration: GcsTransferConfiguration, + referenceInputsToMountedPathsOpt: Option[Map[GcpBatchInput, String]]): Future[Unit] = { + val content = generateGcsLocalizationScript(createParameters.inputOutputParameters.fileInputParameters, referenceInputsToMountedPathsOpt)(gcsTransferConfiguration) + asyncIo.writeAsync(cloudPath, s"source '$transferLibraryContainerPath'\n\n" + content, Seq(CloudStorageOptions.withMimeType("text/plain"))) + } + + def uploadGcsDelocalizationScript(createParameters: CreateBatchJobParameters, + cloudPath: Path, + transferLibraryContainerPath: Path, + gcsTransferConfiguration: GcsTransferConfiguration): Future[Unit] = { + val content = generateGcsDelocalizationScript(createParameters.inputOutputParameters.fileOutputParameters)(gcsTransferConfiguration) + asyncIo.writeAsync(cloudPath, s"source '$transferLibraryContainerPath'\n\n" + content, Seq(CloudStorageOptions.withMimeType("text/plain"))) + } + + + // TAG DISK + private def 
createBatchParameters(inputOutputParameters: InputOutputParameters, + customLabels: Seq[GcpLabel], + ): CreateBatchJobParameters = { + standardParams.backendInitializationDataOption match { + case Some(data: GcpBackendInitializationData) => + val dockerKeyAndToken: Option[CreateBatchDockerKeyAndToken] = for { + key <- data.privateDockerEncryptionKeyName + token <- data.privateDockerEncryptedToken + } yield CreateBatchDockerKeyAndToken(key, token) + + /* + * Right now this doesn't cost anything, because sizeOption returns the size if it was previously already fetched + * for some reason (expression evaluation for instance), but otherwise does not retrieve it and returns None. + * In CWL-land we tend to be aggressive in pre-fetching the size in order to be able to evaluate JS expressions, + * but less in WDL as we can get it last minute and on demand because size is a WDL function, whereas in CWL + * we don't inspect the JS to know if size is called and therefore always pre-fetch it. + * + * We could decide to call withSize before in which case we would retrieve the size for all files and have + * a guaranteed more accurate total size, but there might be performance impacts ? + */ + val inputFileSize = Option(callInputFiles.values.flatMap(_.flatMap(_.sizeOption)).sum) + + // Attempt to adjust the disk size by taking into account the size of input files + val adjustedSizeDisks = inputFileSize.map(size => MemorySize.apply(size.toDouble, MemoryUnit.Bytes).to(MemoryUnit.GB)) map { inputFileSizeInformation => + runtimeAttributes.disks.adjustWorkingDiskWithNewMin( + inputFileSizeInformation, + jobLogger.info(s"Adjusted working disk size to ${inputFileSizeInformation.amount} GB to account for input files") + ) + } getOrElse runtimeAttributes.disks + + val inputFilePaths = inputOutputParameters.jobInputParameters.map(_.cloudPath.pathAsString).toSet + + val referenceDisksToMount = + batchAttributes.referenceFileToDiskImageMappingOpt.map(getReferenceDisksToMount(_, inputFilePaths)) + + val dockerhubCredentials: (String, String) = { + new String(Base64.getDecoder.decode(batchAttributes.dockerhubToken), "UTF-8").split(":", 2) match { + case Array(username, password) => (username, password) + case _ => ("", "") + } + } + + val workflowOptions = workflowDescriptor.workflowOptions + + val monitoringImage = new MonitoringImage( + jobDescriptor = jobDescriptor, + workflowOptions = workflowOptions, + workflowPaths = workflowPaths, + commandDirectory = commandDirectory, + workingDisk = workingDisk, + localMonitoringImageScriptPath = localMonitoringImageScriptPath, + ) + + val checkpointingConfiguration = + new CheckpointingConfiguration( + jobDescriptor = jobDescriptor, + workflowPaths = workflowPaths, + commandDirectory = commandDirectory, + batchConfiguration.batchAttributes.checkpointingInterval + ) + + val enableSshAccess = workflowOptions.getBoolean(WorkflowOptionKeys.EnableSSHAccess).toOption.contains(true) + + // if the `memory_retry_multiplier` is not present in the workflow options there is no need to check whether or + // not the `stderr` file contained memory retry error keys + val retryWithMoreMemoryKeys: Option[List[String]] = memoryRetryFactor.flatMap(_ => memoryRetryErrorKeys) + + CreateBatchJobParameters( + jobDescriptor = jobDescriptor, + runtimeAttributes = runtimeAttributes, + dockerImage = jobDockerImage, + cloudWorkflowRoot = workflowPaths.workflowRoot, + cloudCallRoot = callRootPath, + commandScriptContainerPath = cmdInput.containerPath, + logGcsPath = gcpBatchLogPath, + 
inputOutputParameters = inputOutputParameters, + projectId = googleProject(jobDescriptor.workflowDescriptor), + computeServiceAccount = computeServiceAccount(jobDescriptor.workflowDescriptor), + googleLabels = backendLabels ++ customLabels, + preemptible = preemptible, + batchTimeout = batchConfiguration.batchTimeout, + jobShell = batchConfiguration.jobShell, + privateDockerKeyAndEncryptedToken = dockerKeyAndToken, + womOutputRuntimeExtractor = jobDescriptor.workflowDescriptor.outputRuntimeExtractor, + adjustedSizeDisks = adjustedSizeDisks, + virtualPrivateCloudConfiguration = batchAttributes.virtualPrivateCloudConfiguration, + retryWithMoreMemoryKeys = retryWithMoreMemoryKeys, + fuseEnabled = fuseEnabled(jobDescriptor.workflowDescriptor), + referenceDisksForLocalizationOpt = referenceDisksToMount, + monitoringImage = monitoringImage, + checkpointingConfiguration, + enableSshAccess = enableSshAccess, + vpcNetworkAndSubnetworkProjectLabels = data.vpcNetworkAndSubnetworkProjectLabels, + dockerhubCredentials = dockerhubCredentials + ) + case Some(other) => + throw new RuntimeException(s"Unexpected initialization data: $other") + case None => + throw new RuntimeException("No batch backend initialization data found?") + } + } + + protected def relativePathAndAttachedDisk(path: String, disks: Seq[GcpBatchAttachedDisk]): (Path, GcpBatchAttachedDisk) = { + val absolutePath = DefaultPathBuilder.get(path) match { + case p if !p.isAbsolute => GcpBatchWorkingDisk.MountPoint.resolve(p) + case p => p + } + + disks.find(d => absolutePath.startsWith(d.mountPoint)) match { + case Some(disk) => (disk.mountPoint.relativize(absolutePath), disk) + case None => + throw new Exception(s"Absolute path $path doesn't appear to be under any mount points: ${disks.map(_.toString).mkString(", ")}") + } + } + + protected def makeSafeReferenceName(referenceName: String): String = { + if (referenceName.length <= 127) referenceName else referenceName.md5Sum + } + + // De-localize the glob directory as a GcpBatchDirectoryOutput instead of using * pattern match + protected def generateGlobFileOutputs(womFile: WomGlobFile): List[GcpBatchOutput] = { + val globName = GlobFunctions.globName(womFile.value) + val globDirectory = globName + "/" + val globListFile = globName + ".list" + val gcsGlobDirectoryDestinationPath = callRootPath.resolve(globDirectory) + val gcsGlobListFileDestinationPath = callRootPath.resolve(globListFile) + + val (_, globDirectoryDisk) = relativePathAndAttachedDisk(womFile.value, runtimeAttributes.disks) + + // We need both the glob directory and the glob list: + List( + // The glob directory: + GcpBatchDirectoryOutput(makeSafeReferenceName(globDirectory), gcsGlobDirectoryDestinationPath, DefaultPathBuilder.get(globDirectory), globDirectoryDisk, optional = false, secondary = false), + // The glob list file: + GcpBatchFileOutput(makeSafeReferenceName(globListFile), gcsGlobListFileDestinationPath, DefaultPathBuilder.get(globListFile), globDirectoryDisk, optional = false, secondary = false) + ) + } + + lazy val batchMonitoringParamName: String = GcpBatchJobPaths.BatchMonitoringKey + lazy val localMonitoringLogPath: Path = DefaultPathBuilder.get(gcpBatchCallPaths.batchMonitoringLogFilename) + lazy val localMonitoringScriptPath: Path = DefaultPathBuilder.get(gcpBatchCallPaths.batchMonitoringScriptFilename) + + lazy val monitoringScript: Option[GcpBatchFileInput] = { + gcpBatchCallPaths.workflowPaths.monitoringScriptPath map { path => + GcpBatchFileInput(s"$batchMonitoringParamName-in", path, 
localMonitoringScriptPath, workingDisk) + } + } + + private val DockerMonitoringLogPath: Path = GcpBatchWorkingDisk.MountPoint.resolve(gcpBatchCallPaths.batchMonitoringLogFilename) + private val DockerMonitoringScriptPath: Path = GcpBatchWorkingDisk.MountPoint.resolve(gcpBatchCallPaths.batchMonitoringScriptFilename) + + override def scriptPreamble: String = { + if (monitoringOutput.isDefined) { + s"""|touch $DockerMonitoringLogPath + |chmod u+x $DockerMonitoringScriptPath + |$DockerMonitoringScriptPath > $DockerMonitoringLogPath &""".stripMargin + } else "" + } + + private[actors] def generateInputs(jobDescriptor: BackendJobDescriptor): Set[GcpBatchInput] = { + // We need to tell PAPI about files that were created as part of command instantiation (these need to be defined + // as inputs that will be localized down to the VM). Make up 'names' for these files that are just the short + // md5's of their paths. + val writeFunctionFiles = instantiatedCommand.createdFiles map { f => f.file.value.md5SumShort -> List(f) } toMap + + val writeFunctionInputs = writeFunctionFiles flatMap { + case (name, files) => gcpBatchInputsFromWomFiles(name, files.map(_.file), files.map(localizationPath), jobDescriptor) + } + + val callInputInputs = callInputFiles flatMap { + case (name, files) => gcpBatchInputsFromWomFiles(name, files, files.map(relativeLocalizationPath), jobDescriptor) + } + + (writeFunctionInputs ++ callInputInputs).toSet + } + + // Simply create a GcpBatchDirectoryOutput instead of globbing + protected def generateUnlistedDirectoryOutputs(unlistedDirectory: WomUnlistedDirectory, fileEvaluation: FileEvaluation): List[GcpBatchOutput] = { + val destination = callRootPath.resolve(unlistedDirectory.value.stripPrefix("/")) + val (relpath, disk) = relativePathAndAttachedDisk(unlistedDirectory.value, runtimeAttributes.disks) + val directoryOutput = GcpBatchDirectoryOutput(makeSafeReferenceName(unlistedDirectory.value), destination, relpath, disk, fileEvaluation.optional, fileEvaluation.secondary) + List(directoryOutput) + } + + private def maybeListedDirectoryToBatchParameters(inputName: String, womMaybeListedDirectory: WomMaybeListedDirectory, localPath: String) = womMaybeListedDirectory match { + // If there is a path, simply localize as a directory + case WomMaybeListedDirectory(Some(path), _, _, _) => + List(GcpBatchDirectoryInput(inputName, getPath(path).get, DefaultPathBuilder.get(localPath), workingDisk)) + + // If there is a listing, recurse and call gcpBatchInputsFromWomFiles on all the listed files + case WomMaybeListedDirectory(_, Some(listing), _, _) if listing.nonEmpty => + listing.flatMap({ + case womFile: WomFile if isAdHocFile(womFile) => + gcpBatchInputsFromWomFiles(makeSafeReferenceName(womFile.valueString), List(womFile), List(fileName(womFile)), jobDescriptor) + case womFile: WomFile => + gcpBatchInputsFromWomFiles(makeSafeReferenceName(womFile.valueString), List(womFile), List(relativeLocalizationPath(womFile)), jobDescriptor) + }) + case _ => List.empty + } + + def generateSingleFileOutputs(womFile: WomSingleFile, fileEvaluation: FileEvaluation): List[GcpBatchFileOutput] = { + val (relpath, disk) = relativePathAndAttachedDisk(womFile.value, runtimeAttributes.disks) + // If the file is on a custom mount point, resolve it so that the full mount path will show up in the cloud path + // For the default one (cromwell_root), the expectation is that it does not appear + val mountedPath = if (!disk.mountPoint.isSamePathAs(GcpBatchWorkingDisk.Default.mountPoint)) 
disk.mountPoint.resolve(relpath) else relpath + // Normalize the local path (to get rid of ".." and "."). Also strip any potential leading / so that it gets appended to the call root + val normalizedPath = mountedPath.normalize().pathAsString.stripPrefix("/") + val destination = callRootPath.resolve(normalizedPath) + val batchFileOutput = GcpBatchFileOutput(makeSafeReferenceName(womFile.value), destination, relpath, disk, fileEvaluation.optional, fileEvaluation.secondary) + List(batchFileOutput) + } + + private[actors] def generateOutputs(jobDescriptor: BackendJobDescriptor): Set[GcpBatchOutput] = { + def evaluateFiles(output: OutputDefinition): List[FileEvaluation] = { + Try( + output.expression.evaluateFiles(jobDescriptor.localInputs, NoIoFunctionSet, output.womType).map(_.toList) + ).getOrElse(List.empty[FileEvaluation].validNel) + .getOrElse(List.empty) + } + + def relativeFileEvaluation(evaluation: FileEvaluation): FileEvaluation = { + evaluation.copy(file = relativeLocalizationPath(evaluation.file)) + } + + val womFileOutputs = jobDescriptor.taskCall.callable.outputs.flatMap(evaluateFiles) map relativeFileEvaluation + + val outputs: Seq[GcpBatchOutput] = womFileOutputs.distinct flatMap { fileEvaluation => + fileEvaluation.file.flattenFiles flatMap { + case unlistedDirectory: WomUnlistedDirectory => generateUnlistedDirectoryOutputs(unlistedDirectory, fileEvaluation) + case singleFile: WomSingleFile => generateSingleFileOutputs(singleFile, fileEvaluation) + case globFile: WomGlobFile => generateGlobFileOutputs(globFile) // Assumes optional = false for globs. + } + } + + val additionalGlobOutput = jobDescriptor.taskCall.callable.additionalGlob.toList.flatMap(generateGlobFileOutputs).toSet + + outputs.toSet ++ additionalGlobOutput + } + + protected def uploadGcsTransferLibrary(createBatchParameters: CreateBatchJobParameters, + cloudPath: Path, + gcsTransferConfiguration: GcsTransferConfiguration): Future[Unit] = { + + asyncIo.writeAsync(cloudPath, gcsTransferLibrary, Seq(CloudStorageOptions.withMimeType("text/plain"))) + } + + + lazy val monitoringOutput: Option[GcpBatchFileOutput] = monitoringScript map { _ => + GcpBatchFileOutput(s"$batchMonitoringParamName-out", + gcpBatchCallPaths.batchMonitoringLogPath, localMonitoringLogPath, workingDisk, optional = false, secondary = false, + contentType = plainTextContentType) + } + + override lazy val commandDirectory: Path = GcpBatchWorkingDisk.MountPoint + + + // Primary entry point for cromwell to run GCP Batch job + override def executeAsync(): Future[ExecutionHandle] = { + + // Want to force runtimeAttributes to evaluate so we can fail quickly now if we need to: + def evaluateRuntimeAttributes = Future.fromTry(Try(runtimeAttributes)) + + def generateInputOutputParameters: Future[InputOutputParameters] = Future.fromTry(Try { + val rcFileOutput = GcpBatchFileOutput(returnCodeFilename, returnCodeGcsPath, DefaultPathBuilder.get(returnCodeFilename), workingDisk, optional = false, secondary = false, + contentType = plainTextContentType) + + val memoryRetryRCFileOutput = GcpBatchFileOutput( + memoryRetryRCFilename, + memoryRetryRCGcsPath, + DefaultPathBuilder.get(memoryRetryRCFilename), + workingDisk, + optional = true, + secondary = false, + contentType = plainTextContentType + ) + + case class StandardStream(name: String, f: StandardPaths => Path) { + val filename: String = f(gcpBatchCallPaths.standardPaths).name + } + + val standardStreams = List( + StandardStream("stdout", _.output), + StandardStream("stderr", _.error) + ) map { s => + 
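+        // Each standard stream becomes a required plain-text GcpBatchFileOutput delocalized next to the rc file,
+        // e.g. the container's stdout file is uploaded beside the return-code file under the call's execution directory.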
GcpBatchFileOutput(s.name, returnCodeGcsPath.sibling(s.filename), DefaultPathBuilder.get(s.filename), + workingDisk, optional = false, secondary = false, uploadPeriod = batchAttributes.logFlushPeriod, contentType = plainTextContentType) + } + + InputOutputParameters( + DetritusInputParameters( + executionScriptInputParameter = cmdInput, + monitoringScriptInputParameter = monitoringScript + ), + generateInputs(jobDescriptor).toList, + standardStreams ++ generateOutputs(jobDescriptor).toList, + DetritusOutputParameters( + monitoringScriptOutputParameter = monitoringOutput, + rcFileOutputParameter = rcFileOutput, + memoryRetryRCFileOutputParameter = memoryRetryRCFileOutput + ), + List.empty + ) + + }) + + + val gcpBatchParameters = CreateGcpBatchParameters( + jobDescriptor = jobDescriptor, + runtimeAttributes = runtimeAttributes, + batchAttributes = batchAttributes, + projectId = batchAttributes.project, + region = batchAttributes.location) + + val runBatchResponse = for { + _ <- evaluateRuntimeAttributes + _ <- uploadScriptFile() + customLabels <- Future.fromTry(GcpLabel.fromWorkflowOptions(workflowDescriptor.workflowOptions)) + _ = customLabels.foreach(x => println(s"ZZZ Custom Labels - $x")) + batchParameters <- generateInputOutputParameters + _ = batchParameters.fileInputParameters.foreach(x => println(s"ZZZ File InputParameters - $x")) + _ = batchParameters.jobInputParameters.foreach(x => println(s"ZZZ InputParameters - $x")) + _ = batchParameters.fileOutputParameters.foreach(x => println(s"ZZZ File OutputParameters - $x")) + _ = batchParameters.jobOutputParameters.foreach(x => println(s"ZZZ OutputParameters - $x")) + createParameters = createBatchParameters(batchParameters, customLabels) + drsLocalizationManifestCloudPath = jobPaths.callExecutionRoot / GcpBatchJobPaths.DrsLocalizationManifestName + _ <- uploadDrsLocalizationManifest(createParameters, drsLocalizationManifestCloudPath) + gcsTransferConfiguration = initializationData.gcpBatchConfiguration.batchAttributes.gcsTransferConfiguration + gcsTransferLibraryCloudPath = jobPaths.callExecutionRoot / GcpBatchJobPaths.GcsTransferLibraryName + transferLibraryContainerPath = createParameters.commandScriptContainerPath.sibling(GcsTransferLibraryName) + _ <- uploadGcsTransferLibrary(createParameters, gcsTransferLibraryCloudPath, gcsTransferConfiguration) + gcsLocalizationScriptCloudPath = jobPaths.callExecutionRoot / GcpBatchJobPaths.GcsLocalizationScriptName + referenceInputsToMountedPathsOpt = getReferenceInputsToMountedPathsOpt(createParameters) + _ <- uploadGcsLocalizationScript(createParameters, gcsLocalizationScriptCloudPath, transferLibraryContainerPath, gcsTransferConfiguration, referenceInputsToMountedPathsOpt) + gcsDelocalizationScriptCloudPath = jobPaths.callExecutionRoot / GcpBatchJobPaths.GcsDelocalizationScriptName + _ <- uploadGcsDelocalizationScript(createParameters, gcsDelocalizationScriptCloudPath, transferLibraryContainerPath, gcsTransferConfiguration) + _ = createParameters.privateDockerKeyAndEncryptedToken.isDefined + jobName = "job-" + java.util.UUID.randomUUID.toString + request = GcpBatchRequest(workflowId, createParameters, jobName = jobName, gcpBatchParameters) + response <- runBatchJob(request = request, backendSingletonActor = backendSingletonActor) + _ = sendGoogleLabelsToMetadata(customLabels) + _ = sendIncrementMetricsForReferenceFiles(referenceInputsToMountedPathsOpt.map(_.keySet)) + + } yield response + + // TODO: Handle when the job gets aborted before it starts being processed + runBatchResponse.map { 
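+      // runId is the StandardAsyncJob wrapping the name of the submitted Batch job; it is stored in the
+      // PendingExecutionHandle so that later polls can parse it back into a JobName.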
runId => + PendingExecutionHandle( + jobDescriptor = jobDescriptor, + pendingJob = runId, + runInfo = Option(Run(runId)), + previousState = None) + } + } + + override def reconnectAsync(jobId: StandardAsyncJob): Future[ExecutionHandle] = { + log.info("reconnect async runs") // in for debugging remove later + val handle = PendingExecutionHandle[StandardAsyncJob, StandardAsyncRunInfo, StandardAsyncRunState](jobDescriptor, jobId, Option(Run(jobId)), previousState = None) + Future.successful(handle) + } + + override lazy val pollBackOff: SimpleExponentialBackoff = SimpleExponentialBackoff(5 + .second, 5 + .minutes, 1.1) + + override lazy val executeOrRecoverBackOff: SimpleExponentialBackoff = SimpleExponentialBackoff( + initialInterval = 5 + .seconds, maxInterval = 20 + .seconds, multiplier = 1.1) + + override lazy val runtimeEnvironment: RuntimeEnvironment = { + RuntimeEnvironmentBuilder(jobDescriptor.runtimeAttributes, GcpBatchWorkingDisk.MountPoint, GcpBatchWorkingDisk.MountPoint)(standardParams.minimumRuntimeSettings) + } + + protected def sendIncrementMetricsForReferenceFiles(referenceInputFilesOpt: Option[Set[GcpBatchInput]]): Unit = { + referenceInputFilesOpt match { + case Some(referenceInputFiles) => + referenceInputFiles.foreach { referenceInputFile => + increment(NonEmptyList.of("referencefiles", referenceInputFile.relativeHostPath.pathAsString)) + } + case _ => + // do nothing - reference disks feature is either not configured in Cromwell or disabled in workflow options + } + } + + protected def sendIncrementMetricsForDockerImageCache(dockerImageCacheDiskOpt: Option[String], + dockerImageAsSpecifiedByUser: String, + isDockerImageCacheUsageRequested: Boolean): Unit = { + (isDockerImageCacheUsageRequested, dockerImageCacheDiskOpt) match { + case (true, None) => increment(NonEmptyList("docker", List("image", "cache", "image_not_in_cache", dockerImageAsSpecifiedByUser))) + case (true, Some(_)) => increment(NonEmptyList("docker", List("image", "cache", "used_image_from_cache", dockerImageAsSpecifiedByUser))) + case (false, Some(_)) => increment(NonEmptyList("docker", List("image", "cache", "cached_image_not_used", dockerImageAsSpecifiedByUser))) + case _ => // docker image cache not requested and image is not in cache anyway - do nothing + } + } + + override def pollStatusAsync(handle: GcpBatchPendingExecutionHandle): Future[RunStatus] = { + // yes, we use the whole jobName as the id + val jobNameStr = handle.pendingJob.jobId + + for { + _ <- Future.unit // trick to get into a future context + _ = log.info(s"started polling for $jobNameStr") + jobName = JobName.parse(jobNameStr) + job <- fetchJob(jobName, backendSingletonActor) + } yield RunStatus.fromJobStatus(job.getStatus.getState) + } + + override def isTerminal(runStatus: RunStatus): Boolean = { + runStatus match { + case _: RunStatus.TerminalRunStatus => + log.info(s"isTerminal match terminal run status with $runStatus") + true + case other => + log.info(f"isTerminal match _ running with status $other") + false + } + } + + override def isDone(runStatus: RunStatus): Boolean = { + runStatus match { + case _: RunStatus.Succeeded => + log.info("GCP batch job succeeded matched isDone") + true + case _: RunStatus.UnsuccessfulRunStatus => + log.info("GCP batch job unsuccessful matched isDone") + false + case _ => + log.info(s"did not match isDone: $runStatus") + throw new RuntimeException(s"Cromwell programmer blunder: isDone was called on an incomplete RunStatus ($runStatus).") + } + } + + override def getTerminalEvents(runStatus: 
RunStatus): Seq[ExecutionEvent] = { + runStatus match { + case t: RunStatus.TerminalRunStatus => + log.warning(s"Tried to get terminal events on a terminal status without events: $runStatus") + t.eventList + + case unknown => + throw new RuntimeException(s"handleExecutionSuccess not called with RunStatus.Success. Instead got $unknown") + } + } + + override lazy val startMetadataKeyValues: Map[String, Any] = super[GcpBatchJobCachingActorHelper].startMetadataKeyValues + + // TODO: review sending machine and Instance type + override def getTerminalMetadata(runStatus: RunStatus): Map[String, Any] = { + runStatus match { + case _: TerminalRunStatus => Map() + case unknown => throw new RuntimeException(s"Attempt to get terminal metadata from non terminal status: $unknown") + } + } + + override def mapOutputWomFile(womFile: WomFile): WomFile = { + womFileToGcsPath(generateOutputs(jobDescriptor))(womFile) + } + + + override def globParentDirectory(womGlobFile: WomGlobFile): Path = { + val (_, disk) = relativePathAndAttachedDisk(womGlobFile.value, runtimeAttributes.disks) + disk.mountPoint + } + + + protected def googleProject(descriptor: BackendWorkflowDescriptor): String = { + descriptor.workflowOptions.getOrElse(WorkflowOptionKeys.GoogleProject, batchAttributes.project) + } + + protected def computeServiceAccount(descriptor: BackendWorkflowDescriptor): String = { + descriptor.workflowOptions.getOrElse(WorkflowOptionKeys.GoogleComputeServiceAccount, batchAttributes.computeServiceAccount) + } + + protected def fuseEnabled(descriptor: BackendWorkflowDescriptor): Boolean = { + descriptor.workflowOptions.getBoolean(WorkflowOptionKeys.EnableFuse).toOption.getOrElse(batchAttributes.enableFuse) + } + + protected def useDockerImageCache(descriptor: BackendWorkflowDescriptor): Boolean = { + descriptor.workflowOptions.getBoolean(WorkflowOptionKeys.UseDockerImageCache).getOrElse(false) + } + + override def cloudResolveWomFile(womFile: WomFile): WomFile = { + womFile.mapFile { value => + getPath(value) match { + case Success(drsPath: DrsPath) => DrsResolver.getSimpleGsUri(drsPath).unsafeRunSync().getOrElse(value) + case Success(path) => path.pathAsString + case _ => value + } + } + } + + override def mapCommandLineWomFile(womFile: WomFile): WomFile = { + womFile.mapFile { value => + (getPath(value), asAdHocFile(womFile)) match { + case (Success(gcsPath: GcsPath), Some(adHocFile)) => + // Ad hoc files will be placed directly at the root ("/cromwell_root/ad_hoc_file.txt") unlike other input files + // for which the full path is being propagated ("/cromwell_root/path/to/input_file.txt") + workingDisk.mountPoint.resolve(adHocFile.alternativeName.getOrElse(gcsPath.name)).pathAsString + case (Success(path@(_: GcsPath | _: HttpPath)), _) => + workingDisk.mountPoint.resolve(path.pathWithoutScheme).pathAsString + case (Success(drsPath: DrsPath), _) => + val filePath = DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync() + workingDisk.mountPoint.resolve(filePath).pathAsString + case (Success(sraPath: SraPath), _) => + workingDisk.mountPoint.resolve(s"sra-${sraPath.accession}/${sraPath.pathWithoutScheme}").pathAsString + case _ => value + } + } + } + + override def mapCommandLineJobInputWomFile(womFile: WomFile): WomFile = { + womFile.mapFile(value => + getPath(value) match { + case Success(gcsPath: GcsPath) => workingDisk.mountPoint.resolve(gcsPath.pathWithoutScheme).pathAsString + case Success(drsPath: DrsPath) => + val filePath = DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync() + 
workingDisk.mountPoint.resolve(filePath).pathAsString + case _ => value + } + ) + } + + def womFileToGcsPath(batchOutputs: Set[GcpBatchOutput])(womFile: WomFile): WomFile = { + womFile mapFile { path => + batchOutputs collectFirst { + case batchOutput if batchOutput.name == makeSafeReferenceName(path) => + val pathAsString = batchOutput.cloudPath.pathAsString + if (batchOutput.isFileParameter && !batchOutput.cloudPath.exists) { + // This is not an error if the path represents a `File?` optional output (the Batch delocalization script + // should have failed if this file output was not optional but missing). Throw to produce the correct "empty + // optional" value for a missing optional file output. + throw new FileNotFoundException(s"GCS output file not found: $pathAsString") + } + pathAsString + } getOrElse { + GcsPathBuilder.validateGcsPath(path) match { + case _: ValidFullGcsPath => path + + /* + * Strip the prefixes in RuntimeOutputMapping.prefixFilters from the path, one at a time. + * For instance + * file:///cromwell_root/bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-A/file.txt will progressively become + * + * /cromwell_root/bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-A/file.txt + * bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-A/file.txt + * call-A/file.txt + * + * This code is called as part of a path mapper that will be applied to the WOMified cwl.output.json. + * The cwl.output.json when it's being read by Cromwell from the bucket still contains local paths + * (as they were created by the cwl tool). + * In order to keep things working we need to map those local paths to where they were actually delocalized, + * which is determined in cromwell.backend.google.pipelines.v2beta.api.Delocalization. + */ + case _ => (callRootPath / + RuntimeOutputMapping + .prefixFilters(workflowPaths.workflowRoot) + .foldLeft(path)({ + case (newPath, prefix) => newPath.stripPrefix(prefix) + }) + ).pathAsString + } + } + } + } + + // No need for Cromwell-performed localization in the PAPI backend, ad hoc values are localized directly from GCS to the VM by PAPI. 
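+  // (The note above carries over from the PAPI backends; the same pass-through behaviour is used here for GCP Batch.)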
+ override lazy val localizeAdHocValues: List[AdHocValue] => ErrorOr[List[StandardAdHocValue]] = _.map(Coproduct[StandardAdHocValue](_)).validNel +} + + + + + diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchBackendSingletonActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchBackendSingletonActor.scala new file mode 100644 index 00000000000..66e74b3f2c8 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchBackendSingletonActor.scala @@ -0,0 +1,122 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.{Actor, ActorLogging, ActorRef, Props, Timers} +import com.google.cloud.batch.v1.JobName +import com.google.longrunning.Operation +import cromwell.backend.BackendSingletonActorAbortWorkflow +import cromwell.backend.google.batch.api.{GcpBatchApiRequestHandler, GcpBatchRequestFactory} +import cromwell.backend.google.batch.models.GcpBatchRequest +import cromwell.backend.google.batch.monitoring.BatchInstrumentation +import cromwell.core.Dispatcher.BackendDispatcher +import cromwell.services.instrumentation.CromwellInstrumentationScheduler + +import scala.concurrent.{ExecutionContext, Future} +import scala.util.{Failure, Success} + +object GcpBatchBackendSingletonActor { + def props(requestFactory: GcpBatchRequestFactory, serviceRegistryActor: ActorRef)(implicit requestHandler: GcpBatchApiRequestHandler): Props = { + Props(new GcpBatchBackendSingletonActor(requestFactory, serviceRegistryActor = serviceRegistryActor)) + .withDispatcher(BackendDispatcher) + } + + // This is the only type of messages that can be processed by this actor from this actor + sealed trait Action extends Product with Serializable + object Action { + final case class SubmitJob(request: GcpBatchRequest) extends Action + final case class QueryJob(jobName: JobName) extends Action + final case class AbortJob(jobName: JobName) extends Action + } + + // This is the only type of messages produced from this actor while reacting to received messages + sealed trait Event extends Product with Serializable + object Event { + final case class JobSubmitted(job: com.google.cloud.batch.v1.Job) extends Event + final case class JobStatusRetrieved(job: com.google.cloud.batch.v1.Job) extends Event + final case class JobAbortRequestSent(operation: Operation) extends Event + final case class ActionFailed(jobName: String, cause: Throwable) extends Event + } + +} + +final class GcpBatchBackendSingletonActor(requestFactory: GcpBatchRequestFactory, override val serviceRegistryActor: ActorRef)(implicit requestHandler: GcpBatchApiRequestHandler) + extends Actor + with ActorLogging + with BatchInstrumentation + with CromwellInstrumentationScheduler + with Timers { + + import GcpBatchBackendSingletonActor._ + + private implicit val ec: ExecutionContext = context.dispatcher + + override def preStart() = { + startInstrumentationTimer() + super.preStart() + } + + private def normalReceive: Receive = { + case Action.SubmitJob(request) => + val replyTo = sender() + log.info(s"Submitting job (${request.jobName}) to GCP, workflowId = ${request.workflowId}") + Future { + requestHandler.submit(requestFactory.submitRequest(request)) + }.onComplete { + case Failure(exception) => + log.error(exception, s"Failed to submit job (${request.jobName}) to GCP, workflowId = ${request.workflowId}") + replyTo ! 
Event.ActionFailed(request.jobName, exception) + + case Success(job) => + log.info(s"Job (${request.jobName}) submitted to GCP, workflowId = ${request.workflowId}, id = ${job.getUid}") + replyTo ! Event.JobSubmitted(job) + } + + case Action.QueryJob(jobName) => + val replyTo = sender() + + Future { + requestHandler.query(requestFactory.queryRequest(jobName)) + }.onComplete { + case Success(job) => + log.info(s"Job ($jobName) retrieved from GCP, state = ${job.getStatus.getState}") + replyTo ! Event.JobStatusRetrieved(job) + + case Failure(exception) => + log.error(exception, s"Failed to query job status ($jobName) from GCP") + replyTo ! Event.ActionFailed(jobName.toString, exception) + } + + case Action.AbortJob(jobName) => + val replyTo = sender() + + Future { + requestHandler.abort(requestFactory.abortRequest(jobName)) + }.onComplete { + case Success(operation) => + log.info(s"Job ($jobName) aborted from GCP") + replyTo ! Event.JobAbortRequestSent(operation) + + case Failure(exception) => + log.error(exception, s"Failed to abort job ($jobName) from GCP") + replyTo ! Event.ActionFailed(jobName.toString, exception) + } + + // Cromwell sends this message when a workflow is aborted + case BackendSingletonActorAbortWorkflow(workflowId) => + // It seems that AbortJob(jobName) is processed before this message, so we don't need to do anything else here. + // If that ever becomes necessary, we'll need to link submitted jobs to their workflow ids, which requires us + // to be cautious because Batch deletes jobs instead of canceling them, so we should not delete jobs + // that are already in a final state. + log.info(s"Cromwell requested to abort workflow $workflowId") + + case other => log.error(s"Unexpected message from {} to ${this.getClass.getSimpleName}: {}", sender().path.name, other) + } + + override def receive = instrumentationReceive(loadMetricHandler _).orElse(normalReceive) + + private def loadMetricHandler() = { + // TODO: Implement this once we have details to expose +// val load = if (workQueue.size > LoadConfig.PAPIThreshold) HighLoad else NormalLoad +// serviceRegistryActor ! 
LoadMetric("PAPIQueryManager", load) +// updateQueueSize(workQueue.size) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchFinalizationActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchFinalizationActor.scala new file mode 100644 index 00000000000..dd57edc8b4e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchFinalizationActor.scala @@ -0,0 +1,35 @@ +package cromwell.backend.google.batch.actors + +import akka.actor.ActorRef +import cromwell.backend._ +import cromwell.backend.google.batch.models.GcpBatchConfiguration +import cromwell.backend.standard.{StandardFinalizationActor, StandardFinalizationActorParams} +import cromwell.core.CallOutputs +import cromwell.core.io.AsyncIoActorClient +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder +import wom.graph.CommandCallNode + +case class GcpBatchFinalizationActorParams +( + workflowDescriptor: BackendWorkflowDescriptor, + ioActor: ActorRef, + batchConfiguration: GcpBatchConfiguration, + calls: Set[CommandCallNode], + jobExecutionMap: JobExecutionMap, + workflowOutputs: CallOutputs, + initializationDataOption: Option[BackendInitializationData] +) extends StandardFinalizationActorParams { + override def configurationDescriptor: BackendConfigurationDescriptor = batchConfiguration.configurationDescriptor +} + +class GcpBatchFinalizationActor(val batchParams: GcpBatchFinalizationActorParams) extends StandardFinalizationActor(batchParams) with AsyncIoActorClient { + + lazy val batchConfiguration: GcpBatchConfiguration = batchParams.batchConfiguration + + override lazy val ioCommandBuilder = GcsBatchCommandBuilder + override def ioActor: ActorRef = batchParams.ioActor +} + +object GcpBatchFinalizationActor { + +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActor.scala new file mode 100644 index 00000000000..466a4f20d7e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActor.scala @@ -0,0 +1,254 @@ +package cromwell.backend.google.batch.actors + +import _root_.io.circe.Decoder +import _root_.io.circe.generic.semiauto.deriveDecoder +import _root_.io.circe.parser.decode +import akka.actor.ActorRef +import com.google.api.client.http.{HttpRequest, HttpResponse} +import com.google.api.services.cloudkms.v1.model.EncryptRequest +import com.google.api.services.cloudkms.v1.{CloudKMS, CloudKMSScopes} +import com.google.api.services.cloudresourcemanager.{CloudResourceManager, CloudResourceManagerScopes} +import com.google.api.services.genomics.v2alpha1.GenomicsScopes +import com.google.api.services.lifesciences.v2beta.CloudLifeSciencesScopes +import com.google.api.services.storage.StorageScopes +import com.google.auth.Credentials +import com.google.auth.http.HttpCredentialsAdapter +import com.google.auth.oauth2.OAuth2Credentials +import cromwell.backend.google.batch._ +import cromwell.backend.google.batch.actors.GcpBatchInitializationActor._ +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.{VirtualPrivateCloudConfiguration, VirtualPrivateCloudLabels, VirtualPrivateCloudLiterals} +import cromwell.backend.google.batch.models._ +import 
cromwell.backend.google.batch.runnable.WorkflowOptionKeys +import cromwell.backend.standard.{StandardInitializationActor, StandardInitializationActorParams, StandardValidatedRuntimeAttributesBuilder} +import cromwell.backend.{BackendConfigurationDescriptor, BackendInitializationData, BackendWorkflowDescriptor} +import cromwell.cloudsupport.gcp.auth.GoogleAuthMode.{httpTransport, jsonFactory} +import cromwell.cloudsupport.gcp.auth.{GoogleAuthMode, UserServiceAccountMode} +import cromwell.core.io.AsyncIoActorClient +import cromwell.core.{Dispatcher, WorkflowOptions} +import cromwell.filesystems.gcs.GoogleUtil._ +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder +import org.apache.commons.codec.binary.Base64 +import org.apache.commons.lang3.exception.ExceptionUtils +import spray.json.{JsObject, JsString} +import wom.graph.CommandCallNode + +import scala.concurrent.Future +import scala.util.Try +import scala.util.control.NonFatal + +case class GcpBatchInitializationActorParams +( + workflowDescriptor: BackendWorkflowDescriptor, + ioActor: ActorRef, + calls: Set[CommandCallNode], + batchConfiguration: GcpBatchConfiguration, + serviceRegistryActor: ActorRef, + restarting: Boolean +) extends StandardInitializationActorParams { + override val configurationDescriptor: BackendConfigurationDescriptor = batchConfiguration.configurationDescriptor + +} +class GcpBatchInitializationActor(batchParams: GcpBatchInitializationActorParams) extends StandardInitializationActor(batchParams) with AsyncIoActorClient { + + override lazy val ioActor: ActorRef = batchParams.ioActor + protected val gcpBatchConfiguration: GcpBatchConfiguration = batchParams.batchConfiguration + protected val workflowOptions: WorkflowOptions = workflowDescriptor.workflowOptions + private lazy val ioEc = context.system.dispatchers.lookup(Dispatcher.IoDispatcher) + + override lazy val runtimeAttributesBuilder: StandardValidatedRuntimeAttributesBuilder = + GcpBatchRuntimeAttributes + .runtimeAttributesBuilder(gcpBatchConfiguration) + + // Credentials object for the GCS API + private lazy val gcsCredentials: Future[Credentials] = gcpBatchConfiguration.batchAttributes.auths.gcs + .retryCredentials(workflowOptions, List(StorageScopes.DEVSTORAGE_FULL_CONTROL)) + + // Credentials object for the Genomics API + private lazy val genomicsCredentials: Future[Credentials] = gcpBatchConfiguration.batchAttributes.auths.genomics + .retryCredentials(workflowOptions, List( + CloudLifeSciencesScopes + .CLOUD_PLATFORM, + GenomicsScopes.GENOMICS + )) + + val privateDockerEncryptionKeyName: Option[String] = { + val optionsEncryptionKey = workflowOptions.get(GoogleAuthMode.DockerCredentialsEncryptionKeyNameKey).toOption + optionsEncryptionKey.orElse(gcpBatchConfiguration.dockerEncryptionKeyName) + } + + val privateDockerToken: Option[String] = { + val optionsDockerToken = workflowOptions.get(GoogleAuthMode.DockerCredentialsTokenKey).toOption + optionsDockerToken.orElse(gcpBatchConfiguration.dockerToken) + } + + lazy val privateDockerEncryptedToken: Option[String] = { + val effectiveAuth: Option[GoogleAuthMode] = { + // Use the user service account if a user service account value is provided in the workflow options and there's + // a user service account auth in the list of auths. + // TODO is it okay that this would silently ignore user auths if there isn't one defined in the config list of auths? + // That doesn't seem great but it's effectively what the existing code around user service accounts appears to be doing. 
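+ // (Note: the user service account is only picked up when both the workflow option and a UserServiceAccountMode + // auth in the configured auth list are present; see the for-comprehension just below.)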
+ val userServiceAccountAuth: Option[GoogleAuthMode] = for { + _ <- workflowOptions.get(GoogleAuthMode.UserServiceAccountKey).toOption + usaAuth <- gcpBatchConfiguration.googleConfig.authsByName.values collectFirst { case u: UserServiceAccountMode => u } + } yield usaAuth + + def encryptionAuthFromConfig: Option[GoogleAuthMode] = gcpBatchConfiguration.dockerEncryptionAuthName.flatMap { name => + gcpBatchConfiguration.googleConfig.auth(name).toOption + } + // If there's no user service account auth in the workflow options fall back to an auth specified in config. + userServiceAccountAuth orElse encryptionAuthFromConfig + } + + val unencrypted: Option[String] = privateDockerToken flatMap { dockerToken => + new String(Base64.decodeBase64(dockerToken)).split(':') match { + case Array(username, password) => + // unencrypted tokens are base64-encoded username:password + Option(JsObject( + Map( + "username" -> JsString(username), + "password" -> JsString(password) + )).compactPrint) + case _ => throw new RuntimeException(s"provided dockerhub token '$dockerToken' is not a base64-encoded username:password") + } + } + + for { + plain <- unencrypted + auth <- effectiveAuth + key <- privateDockerEncryptionKeyName + credentials = auth.credentials(workflowOptions.get(_).get, List(CloudKMSScopes.CLOUD_PLATFORM)) + encrypted = encryptKms(key, credentials, plain) + } yield encrypted + } + + private def vpcNetworkAndSubnetworkProjectLabelsFuture(): Future[Option[VpcAndSubnetworkProjectLabelValues]] = { + def googleProject(descriptor: BackendWorkflowDescriptor): String = { + descriptor.workflowOptions.getOrElse(WorkflowOptionKeys.GoogleProject, batchParams.batchConfiguration.batchAttributes.project) + } + + def projectMetadataRequest(vpcConfig: VirtualPrivateCloudLabels): Future[HttpRequest] = { + Future { + val credentials = vpcConfig.auth.credentials(workflowOptions.get(_).getOrElse(throw new RuntimeException("Unable to find the necessary workflow option for auth credentials")), List(CloudResourceManagerScopes.CLOUD_PLATFORM)) + + val httpCredentialsAdapter = new HttpCredentialsAdapter(credentials) + val cloudResourceManagerBuilder = new CloudResourceManager + .Builder(GoogleAuthMode.httpTransport, GoogleAuthMode.jsonFactory, httpCredentialsAdapter) + .setApplicationName(gcpBatchConfiguration.googleConfig.applicationName) + .build() + + val project = cloudResourceManagerBuilder.projects().get(googleProject(workflowDescriptor)) + + project.buildHttpRequest() + } + } + + def projectMetadataResponseToLabels(httpResponse: HttpResponse): Future[ProjectLabels] = { + implicit val googleProjectMetadataLabelDecoder: Decoder[ProjectLabels] = deriveDecoder + Future.fromTry(decode[ProjectLabels](httpResponse.parseAsString()).toTry).recoverWith { + case NonFatal(e) => Future.failed(new RuntimeException(s"Failed to parse labels from project metadata response from Google Cloud Resource Manager API. 
" + + s"${ExceptionUtils.getMessage(e)}", e)) + } + } + + def networkLabelsFromProjectLabels(vpcConfig: VirtualPrivateCloudLabels, + projectLabels: ProjectLabels, + ): Option[VpcAndSubnetworkProjectLabelValues] = { + projectLabels.labels.get(vpcConfig.network) map { vpcNetworkLabelValue => + val subnetworkLabelOption = vpcConfig.subnetwork.flatMap { s => + projectLabels.labels.collectFirst { + case (labelName, labelValue) if labelName.equals(s) => labelValue + } + } + + VpcAndSubnetworkProjectLabelValues(vpcNetworkLabelValue, subnetworkLabelOption) + } + } + + def fetchVpcLabelsFromProjectMetadata(vpcConfig: VirtualPrivateCloudLabels + ): Future[Option[VpcAndSubnetworkProjectLabelValues]] = { + for { + projectMetadataResponse <- projectMetadataRequest(vpcConfig).map(_.executeAsync().get()) + projectLabels <- projectMetadataResponseToLabels(projectMetadataResponse) + } yield networkLabelsFromProjectLabels(vpcConfig, projectLabels) + } + + /* + First, try to fetch the network information from labels, where that fetch may still return None. + Then, if we did not discover a network via labels for whatever reason try to look for literal values. + */ + def fetchVpcLabels(vpcConfig: VirtualPrivateCloudConfiguration + ): Future[Option[VpcAndSubnetworkProjectLabelValues]] = { + // Added explicit types to hopefully help future devs who stumble across this two-step code + val fetchedFromLabels: Future[Option[VpcAndSubnetworkProjectLabelValues]] = vpcConfig.labelsOption match { + case Some(labels: VirtualPrivateCloudLabels) => fetchVpcLabelsFromProjectMetadata(labels) + case None => Future.successful(None) + } + fetchedFromLabels map { + _ orElse { + vpcConfig.literalsOption map { literals: VirtualPrivateCloudLiterals => + VpcAndSubnetworkProjectLabelValues(literals.network, literals.subnetwork) + } + } + } + } + + val vpcConfig: VirtualPrivateCloudConfiguration = + gcpBatchConfiguration.batchAttributes.virtualPrivateCloudConfiguration + fetchVpcLabels(vpcConfig) + } + + override lazy val workflowPaths: Future[GcpBatchWorkflowPaths] = for { + gcsCred <- gcsCredentials + genomicsCred <- genomicsCredentials + validatedPathBuilders <- pathBuilders + } yield new GcpBatchWorkflowPaths( + workflowDescriptor, gcsCred, genomicsCred, gcpBatchConfiguration, validatedPathBuilders, standardStreamNameToFileNameMetadataMapper)(ioEc) + + + override lazy val initializationData: Future[GcpBackendInitializationData] = for { + batchWorkflowPaths <- workflowPaths + gcsCreds <- gcsCredentials + vpcNetworkAndSubnetworkProjectLabels <- vpcNetworkAndSubnetworkProjectLabelsFuture() + } yield models.GcpBackendInitializationData( + workflowPaths = batchWorkflowPaths, + runtimeAttributesBuilder = runtimeAttributesBuilder, + gcpBatchConfiguration = gcpBatchConfiguration, + gcsCredentials = gcsCreds, + privateDockerEncryptionKeyName = privateDockerEncryptionKeyName, + privateDockerEncryptedToken = privateDockerEncryptedToken, + vpcNetworkAndSubnetworkProjectLabels = vpcNetworkAndSubnetworkProjectLabels + ) + + override def validateWorkflowOptions(): Try[Unit] = GcpLabel.fromWorkflowOptions(workflowOptions).map(_ => ()) + + override def beforeAll(): Future[Option[BackendInitializationData]] = { + for { + paths <- workflowPaths + _ = publishWorkflowRoot(paths.workflowRoot.pathAsString) + data <- initializationData + } yield Option(data) + } + + def standardStreamNameToFileNameMetadataMapper(gcpBatchJobPaths: GcpBatchJobPaths, streamName: String): String = + 
GcpBatchInitializationActor.defaultStandardStreamNameToFileNameMetadataMapper(gcpBatchJobPaths, streamName) + + override lazy val ioCommandBuilder: GcsBatchCommandBuilder.type = GcsBatchCommandBuilder + +} + +object GcpBatchInitializationActor { + // For metadata publishing purposes default to using the name of a standard stream as the stream's filename. + def defaultStandardStreamNameToFileNameMetadataMapper(gcpBatchJobPaths: GcpBatchJobPaths, streamName: String): String = streamName + + def encryptKms(keyName: String, credentials: OAuth2Credentials, plainText: String): String = { + val httpCredentialsAdapter = new HttpCredentialsAdapter(credentials) + val kms = new CloudKMS.Builder(httpTransport, jsonFactory, httpCredentialsAdapter) + .setApplicationName("cromwell") + .build() + + val request = new EncryptRequest().encodePlaintext(plainText.toCharArray.map(_.toByte)) + val response = kms.projects.locations.keyRings.cryptoKeys.encrypt(keyName, request).execute + response.getCiphertext + } + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchJobCachingActorHelper.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchJobCachingActorHelper.scala new file mode 100644 index 00000000000..e86404bef1e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchJobCachingActorHelper.scala @@ -0,0 +1,87 @@ +package cromwell.backend.google.batch.actors + +import cromwell.backend.google.batch.io.{GcpBatchAttachedDisk, GcpBatchWorkingDisk} +import cromwell.backend.google.batch.runnable.{GcpBatchMetadataKeys, WorkflowOptionKeys} +import cromwell.backend.google.batch.models._ +import cromwell.backend.standard.StandardCachingActorHelper +import cromwell.core.labels.Labels +import cromwell.core.logging.JobLogging +import cromwell.core.path.Path +import cromwell.services.metadata.CallMetadataKeys + +import scala.language.postfixOps + +trait GcpBatchJobCachingActorHelper extends StandardCachingActorHelper { + this: GcpBatchAsyncBackendJobExecutionActor with JobLogging => + + lazy val initializationData: GcpBackendInitializationData = { + backendInitializationDataAs[GcpBackendInitializationData] + } + lazy val batchConfiguration: GcpBatchConfiguration = initializationData.gcpBatchConfiguration + + lazy val gcpBatchCallPaths: GcpBatchJobPaths = jobPaths.asInstanceOf[GcpBatchJobPaths] + + lazy val runtimeAttributes = GcpBatchRuntimeAttributes( + validatedRuntimeAttributes, + batchConfiguration + .runtimeConfig + ) + + lazy val maxPreemption: Int = runtimeAttributes.preemptible + + + lazy val workingDisk: GcpBatchAttachedDisk = runtimeAttributes.disks.find(_.name == GcpBatchWorkingDisk.Name).get + + lazy val callRootPath: Path = gcpBatchCallPaths.callExecutionRoot + lazy val returnCodeFilename: String = gcpBatchCallPaths.returnCodeFilename + lazy val returnCodeGcsPath: Path = gcpBatchCallPaths.returnCode + lazy val gcpBatchLogPath: Path = gcpBatchCallPaths.batchLogPath + lazy val memoryRetryRCFilename: String = gcpBatchCallPaths.memoryRetryRCFilename + lazy val memoryRetryRCGcsPath: Path = gcpBatchCallPaths.memoryRetryRC + + lazy val batchAttributes: GcpBatchConfigurationAttributes = batchConfiguration.batchAttributes + + lazy val defaultLabels: Labels = { + val workflow = jobDescriptor.workflowDescriptor + val call = jobDescriptor.taskCall + val subWorkflow = workflow.callable + val subWorkflowLabels = if (!subWorkflow.equals(workflow.rootWorkflow)) + 
Labels("cromwell-sub-workflow-name" -> subWorkflow.name) + else + Labels.empty + + val alias = call.localName + val aliasLabels = if (!alias.equals(call.callable.name)) + Labels("wdl-call-alias" -> alias) + else + Labels.empty + + Labels( + "cromwell-workflow-id" -> s"cromwell-${workflow.rootWorkflowId}", + "wdl-task-name" -> call.callable.name + ) ++ subWorkflowLabels ++ aliasLabels + } + + + lazy val originalLabels: Labels = defaultLabels + + lazy val backendLabels: Seq[GcpLabel] = GcpLabel.safeLabels(originalLabels.asTuple: _*) + + lazy val originalLabelEvents: Map[String, String] = originalLabels.value map { l => s"${CallMetadataKeys.Labels}:${l.key}" -> l.value } toMap + + override protected def nonStandardMetadata: Map[String, Any] = { + val googleProject = initializationData + .workflowPaths + .workflowDescriptor + .workflowOptions + .get(WorkflowOptionKeys.GoogleProject) + .getOrElse(batchAttributes.project) + + Map[String, Any]( + GcpBatchMetadataKeys.GoogleProject -> googleProject, + GcpBatchMetadataKeys.ExecutionBucket -> initializationData.workflowPaths.executionRootString, + "preemptible" -> preemptible + ) ++ originalLabelEvents + } + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchApiRequestHandler.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchApiRequestHandler.scala new file mode 100644 index 00000000000..1a877558942 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchApiRequestHandler.scala @@ -0,0 +1,34 @@ +package cromwell.backend.google.batch.api + +import com.google.api.gax.rpc.FixedHeaderProvider +import com.google.cloud.batch.v1._ +import com.google.common.collect.ImmutableMap +import com.google.longrunning.Operation + +class GcpBatchApiRequestHandler { + def submit(request: CreateJobRequest): Job = withClient { client => + client.createJobCallable + .call(request) + } + + def query(request: GetJobRequest): Job = withClient { client => + client.getJob(request) + } + + def abort(request: DeleteJobRequest): Operation = withClient { client => + client.deleteJobCallable().call(request) + } + + private def withClient[T](f: BatchServiceClient => T): T = { + // set user agent to cromwell so requests can be differentiated on batch + val headers = ImmutableMap.of("user-agent", "cromwell") + val headerProvider = FixedHeaderProvider.create(headers) + val batchSettings = BatchServiceSettings.newBuilder.setHeaderProvider(headerProvider).build + val client = BatchServiceClient.create(batchSettings) + try { + f(client) + } finally { + client.close() + } + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala new file mode 100644 index 00000000000..6976bd09867 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala @@ -0,0 +1,102 @@ +package cromwell.backend.google.batch.api + +import com.google.cloud.batch.v1.{CreateJobRequest, DeleteJobRequest, GetJobRequest, JobName} +import cromwell.backend.BackendJobDescriptor +import cromwell.backend.google.batch.io.GcpBatchAttachedDisk +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.VirtualPrivateCloudConfiguration +import cromwell.backend.google.batch.models._ +import 
cromwell.backend.google.batch.monitoring.{CheckpointingConfiguration, MonitoringImage} +import cromwell.core.path.Path +import wom.runtime.WomOutputRuntimeExtractor + +import scala.concurrent.duration.FiniteDuration + +trait GcpBatchRequestFactory { + def submitRequest(data: GcpBatchRequest): CreateJobRequest + + def queryRequest(jobName: JobName): GetJobRequest + + def abortRequest(jobName: JobName): DeleteJobRequest + +} + +object GcpBatchRequestFactory { + + type MountsToEnv = List[String] => Map[String, String] + + /** + * Input parameters that are not strictly needed by the user's command but are Cromwell byproducts. + */ + case class DetritusInputParameters( + executionScriptInputParameter: GcpBatchFileInput, + monitoringScriptInputParameter: Option[GcpBatchFileInput] + ) { + def all: List[GcpBatchFileInput] = List(executionScriptInputParameter) ++ monitoringScriptInputParameter + } + + /** + * Output parameters that are not produced by the user's command but are Cromwell byproducts. + */ + case class DetritusOutputParameters( + monitoringScriptOutputParameter: Option[GcpBatchFileOutput], + rcFileOutputParameter: GcpBatchFileOutput, + memoryRetryRCFileOutputParameter: GcpBatchFileOutput + ) { + def all: List[GcpBatchFileOutput] = memoryRetryRCFileOutputParameter :: List(rcFileOutputParameter) ++ monitoringScriptOutputParameter + } + + /** + * Bundle containing all input and output parameters to a PAPI job + * Detrituses and actual inputs / outputs to the job are separated for more clarity and to leave open the possibility + * to treat them differently. + */ + case class InputOutputParameters( + detritusInputParameters: DetritusInputParameters, + jobInputParameters: List[GcpBatchInput], + jobOutputParameters: List[GcpBatchOutput], + detritusOutputParameters: DetritusOutputParameters, + literalInputParameters: List[GcpBatchLiteralInput] + ) { + lazy val fileInputParameters: List[GcpBatchInput] = jobInputParameters ++ detritusInputParameters.all + lazy val fileOutputParameters: List[GcpBatchOutput] = detritusOutputParameters.all ++ jobOutputParameters + } + + case class CreateBatchDockerKeyAndToken(key: String, encryptedToken: String) + + case class CreateBatchJobParameters(jobDescriptor: BackendJobDescriptor, + runtimeAttributes: GcpBatchRuntimeAttributes, + dockerImage: String, + cloudWorkflowRoot: Path, + cloudCallRoot: Path, + commandScriptContainerPath: Path, + logGcsPath: Path, + inputOutputParameters: InputOutputParameters, + projectId: String, + computeServiceAccount: String, + googleLabels: Seq[GcpLabel], + preemptible: Boolean, + batchTimeout: FiniteDuration, + jobShell: String, + privateDockerKeyAndEncryptedToken: Option[CreateBatchDockerKeyAndToken], + womOutputRuntimeExtractor: Option[WomOutputRuntimeExtractor], + adjustedSizeDisks: Seq[GcpBatchAttachedDisk], + virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, + retryWithMoreMemoryKeys: Option[List[String]], + fuseEnabled: Boolean, + referenceDisksForLocalizationOpt: Option[List[GcpBatchAttachedDisk]], + monitoringImage: MonitoringImage, + checkpointingConfiguration: CheckpointingConfiguration, + enableSshAccess: Boolean, + vpcNetworkAndSubnetworkProjectLabels: Option[VpcAndSubnetworkProjectLabelValues], + dockerhubCredentials: (String, String) + ) { + def literalInputs = inputOutputParameters.literalInputParameters + + def inputParameters = inputOutputParameters.fileInputParameters + + def outputParameters = inputOutputParameters.fileOutputParameters + + def allParameters = inputParameters ++ 
outputParameters + } + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala new file mode 100644 index 00000000000..435992b8cd8 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala @@ -0,0 +1,232 @@ +package cromwell.backend.google.batch.api + +import com.google.cloud.batch.v1.AllocationPolicy.Accelerator +import com.google.cloud.batch.v1.{DeleteJobRequest, GetJobRequest, JobName} +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.GcpBatchRequest +import cromwell.backend.google.batch.runnable._ +import cromwell.backend.google.batch.util.BatchUtilityConversions +import com.google.cloud.batch.v1.AllocationPolicy.{AttachedDisk, InstancePolicy, InstancePolicyOrTemplate, LocationPolicy, NetworkInterface, NetworkPolicy, ProvisioningModel} +import com.google.cloud.batch.v1.LogsPolicy.Destination +import com.google.cloud.batch.v1.{AllocationPolicy, ComputeResource, CreateJobRequest, Job, LogsPolicy, Runnable, ServiceAccount, TaskGroup, TaskSpec, Volume} +import com.google.protobuf.Duration +import cromwell.backend.google.batch.io.GcpBatchAttachedDisk +import cromwell.backend.google.batch.models.VpcAndSubnetworkProjectLabelValues + +import scala.jdk.CollectionConverters._ + +class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransferConfiguration) extends GcpBatchRequestFactory + with BatchUtilityConversions + with UserRunnable + with ContainerSetup + with Localization + with Delocalization + with MemoryRetryCheckRunnable + with MonitoringRunnable + with CheckpointingRunnable { + + override def queryRequest(jobName: JobName): GetJobRequest = GetJobRequest.newBuilder.setName(jobName.toString).build + + override def abortRequest(jobName: JobName): DeleteJobRequest = DeleteJobRequest.newBuilder.setName(jobName.toString).build() + + + def createNetworkWithVPC(vpcAndSubnetworkProjectLabelValues: VpcAndSubnetworkProjectLabelValues, data: GcpBatchRequest): NetworkInterface.Builder = { + + val network = NetworkInterface + .newBuilder + .setNoExternalIpAddress(data.gcpBatchParameters.runtimeAttributes.noAddress) + .setNetwork(vpcAndSubnetworkProjectLabelValues.networkName(data.gcpBatchParameters.projectId)) + + vpcAndSubnetworkProjectLabelValues + .subnetNameOption(data.gcpBatchParameters.projectId) + .foreach(network.setSubnetwork) + + network + + } + + def createNetwork(data: GcpBatchRequest): NetworkInterface.Builder = { + data.createParameters.vpcNetworkAndSubnetworkProjectLabels match { + case Some(vpcAndSubnetworkProjectLabelValues) => createNetworkWithVPC(vpcAndSubnetworkProjectLabelValues, data) + case _ => NetworkInterface.newBuilder().setNoExternalIpAddress(data.createParameters.runtimeAttributes.noAddress) + } + } + + private def createComputeResource(cpu: Long, memory: Long, bootDiskSizeMb: Long) = { + ComputeResource + .newBuilder + .setCpuMilli(cpu) + .setMemoryMib(memory) + .setBootDiskMib(bootDiskSizeMb) + .build + } + + private def createInstancePolicy(cpuPlatform: String, spotModel: ProvisioningModel, accelerators: Option[Accelerator.Builder], attachedDisks: List[AttachedDisk]) = { + + //set GPU count to 0 if not included in workflow + val gpuAccelerators = 
accelerators.getOrElse(Accelerator.newBuilder.setCount(0).setType("")) + val instancePolicy = InstancePolicy + .newBuilder + .setProvisioningModel(spotModel) + .addAllDisks(attachedDisks.asJava) + .setMinCpuPlatform(cpuPlatform) + .buildPartial() + + //add GPUs if GPU count is greater than 1 + if (gpuAccelerators.getCount >= 1) { + val instancePolicyGpu = instancePolicy.toBuilder + instancePolicyGpu.addAccelerators(gpuAccelerators).build + instancePolicyGpu + } else { + instancePolicy.toBuilder + } + + } + + + private def createNetworkPolicy(networkInterface: NetworkInterface): NetworkPolicy = { + NetworkPolicy + .newBuilder + .addNetworkInterfaces(0, networkInterface) + .build + } + + private def createTaskSpec(runnables: List[Runnable], computeResource: ComputeResource, retryCount: Int, durationInSeconds: Long, volumes: List[Volume]) = { + TaskSpec + .newBuilder + .addAllRunnables(runnables.asJava) + .setComputeResource(computeResource) + .addAllVolumes(volumes.asJava) + .setMaxRetryCount(retryCount) + .setMaxRunDuration(Duration + .newBuilder + .setSeconds(durationInSeconds) + .build) + } + + private def createTaskGroup(taskCount: Long, task: TaskSpec.Builder): TaskGroup = { + TaskGroup + .newBuilder + .setTaskCount(taskCount) + .setTaskSpec(task) + .build + + } + + private def createAllocationPolicy(data: GcpBatchRequest, locationPolicy: LocationPolicy, instancePolicy: InstancePolicy, networkPolicy: NetworkPolicy, serviceAccount: ServiceAccount) = { + AllocationPolicy + .newBuilder + .setLocation(locationPolicy) + .setNetwork(networkPolicy) + .putLabels("cromwell-workflow-id", toLabel(data.workflowId.toString)) //label for workflow from WDL + .putLabels("goog-batch-worker", "true") + .putAllLabels((data.createParameters.googleLabels.map(label => label.key -> label.value).toMap.asJava)) + .setServiceAccount(serviceAccount) + .addInstances(InstancePolicyOrTemplate + .newBuilder + .setPolicy(instancePolicy) + .build) + .build + } + + override def submitRequest(data: GcpBatchRequest): CreateJobRequest = { + + val batchAttributes = data.gcpBatchParameters.batchAttributes + val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes + val createParameters = data.createParameters + val retryCount = data.gcpBatchParameters.runtimeAttributes.preemptible + val allDisksToBeMounted: Seq[GcpBatchAttachedDisk] = createParameters.adjustedSizeDisks ++ createParameters.referenceDisksForLocalizationOpt.getOrElse(List.empty) + val gcpBootDiskSizeMb = convertGbToMib(runtimeAttributes) + + // set parent for metadata storage of job information + lazy val parent = s"projects/${data.gcpBatchParameters.projectId}/locations/${data.gcpBatchParameters.region}" + val gcpSa = ServiceAccount.newBuilder.setEmail(batchAttributes.computeServiceAccount).build + + // make zones path + val zones = toZonesPath(runtimeAttributes.zones) + + // convert to millicores for Batch + val cpu = runtimeAttributes.cpu + val cpuCores = toCpuCores(cpu.toString.toLong) + + val cpuPlatform = runtimeAttributes.cpuPlatform.getOrElse("") + + // convert memory to MiB for Batch + val memory = toMemMib(runtimeAttributes.memory) + + // Determine max runtime for Batch + val durationInSeconds: Long = data.gcpBatchParameters.batchAttributes.batchTimeout.toSeconds + + // Batch defaults to 1 task + val taskCount: Long = 1 + + println(f"command script container path ${data.createParameters.commandScriptContainerPath}") + println(f"cloud workflow root ${data.createParameters.cloudWorkflowRoot}") + println(f"all parameters:\n 
${data.createParameters.allParameters.mkString("\n")}") + + // parse preemption value and set value for Spot. Spot is replacement for preemptible + val spotModel = toProvisioningModel(runtimeAttributes.preemptible) + + // Set GPU accelerators + val accelerators = runtimeAttributes.gpuResource.map(toAccelerator) + + val networkInterface = createNetwork(data = data) + val networkPolicy = createNetworkPolicy(networkInterface.build()) + val allDisks = toDisks(allDisksToBeMounted) + val allVolumes = toVolumes(allDisksToBeMounted) + + val containerSetup: List[Runnable] = containerSetupRunnables(allVolumes) + val localization: List[Runnable] = localizeRunnables(createParameters, allVolumes) + val userRunnable: List[Runnable] = userRunnables(data.createParameters, allVolumes) + val memoryRetryRunnable: List[Runnable] = checkForMemoryRetryRunnables(createParameters, allVolumes) + val deLocalization: List[Runnable] = deLocalizeRunnables(createParameters, allVolumes) + val monitoringSetup: List[Runnable] = monitoringSetupRunnables(createParameters, allVolumes) + val monitoringShutdown: List[Runnable] = monitoringShutdownRunnables(createParameters) + val checkpointingStart: List[Runnable] = checkpointingSetupRunnables(createParameters, allVolumes) + val checkpointingShutdown: List[Runnable] = checkpointingShutdownRunnables(createParameters, allVolumes) + val sshAccess: List[Runnable] = List.empty //sshAccessActions(createPipelineParameters, mounts) + + val sortedRunnables: List[Runnable] = RunnableUtils.sortRunnables( + containerSetup = containerSetup, + localization = localization, + userRunnable = userRunnable, + memoryRetryRunnable = memoryRetryRunnable, + deLocalization = deLocalization, + monitoringSetup = monitoringSetup, + monitoringShutdown = monitoringShutdown, + checkpointingStart = checkpointingStart, + checkpointingShutdown = checkpointingShutdown, + sshAccess = sshAccess, + isBackground = _.getBackground, + ) + + val computeResource = createComputeResource(cpuCores, memory, gcpBootDiskSizeMb) + val taskSpec = createTaskSpec(sortedRunnables, computeResource, retryCount, durationInSeconds, allVolumes) + val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec) + val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks) + val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build + val allocationPolicy = createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa) + val job = Job + .newBuilder + .addTaskGroups(taskGroup) + .setAllocationPolicy(allocationPolicy) + .putLabels("submitter", "cromwell") // label to signify job submitted by cromwell for larger tracking purposes within GCP batch + .putLabels("goog-batch-worker", "true") + .putAllLabels((data.createParameters.googleLabels.map(label => label.key -> label.value).toMap.asJava)) + .setLogsPolicy(LogsPolicy + .newBuilder + .setDestination(Destination.CLOUD_LOGGING) + .build) + + println(f"job shell ${data.createParameters.jobShell}") + println(f"script container path ${data.createParameters.commandScriptContainerPath}") + println(f"labels ${data.createParameters.googleLabels}") + + CreateJobRequest + .newBuilder + .setParent(parent) + .setJob(job) + .setJobId(data.jobName) + .build() + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchAuths.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchAuths.scala new file mode 100644 index 
00000000000..7dabe4ca46d --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchAuths.scala @@ -0,0 +1,5 @@ +package cromwell.backend.google.batch.authentication + +import cromwell.cloudsupport.gcp.auth.GoogleAuthMode + +case class GcpBatchAuths(genomics: GoogleAuthMode, gcs: GoogleAuthMode) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchVMAuthentication.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchVMAuthentication.scala new file mode 100644 index 00000000000..4f14d1b92b1 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/authentication/GcpBatchVMAuthentication.scala @@ -0,0 +1,60 @@ +package cromwell.backend.google.batch.authentication + +import cats.data.Validated.{Invalid, Valid} +import cats.syntax.validated._ +import common.validation.ErrorOr._ +import common.validation.Validation._ +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.core.DockerCredentials +import spray.json.{JsString, JsValue} + +/** + * Interface for Authentication information that can be included as a json object in the file uploaded to GCS + * upon workflow creation and used in the VM. + */ +sealed trait GcpBatchAuthObject { + def context: String + + def map: Map[String, JsValue] + + def toMap: Map[String, Map[String, JsValue]] = Map(context -> map) +} + +object GcpBatchDockerCredentials { + + def apply(dockerCredentials: DockerCredentials, googleConfig: GoogleConfiguration): GcpBatchDockerCredentials = { + // If there's an encryption key defined there must be a valid auth defined to encrypt it. + val authValidation = dockerCredentials.keyName match { + case None => ().validNel // fine + case _ => + for { + authName <- dockerCredentials.authName.toErrorOr("KMS Encryption key defined for private Docker but no auth specified") + _ <- googleConfig.auth(authName) + } yield () + } + + authValidation match { + case Invalid(errors) => + throw new RuntimeException(errors.toList.mkString(", ")) + case Valid(_) => + new GcpBatchDockerCredentials( + token = dockerCredentials.token, + keyName = dockerCredentials.keyName, + authName = dockerCredentials.authName) + } + } +} + +/** + * Authentication information to pull docker images as the user. 
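+ * + * For illustration only (the token value below is hypothetical, not part of this change), the JSON fragment + * produced by `toMap` for a base64-encoded "username:password" token looks like: + * {{{ + * { "docker": { "token": "dXNlcjpwYXNz" } } + * }}}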
+ */ +case class GcpBatchDockerCredentials(override val token: String, + override val keyName: Option[String], + override val authName: Option[String]) + extends DockerCredentials(token = token, keyName = keyName, authName = authName) with GcpBatchAuthObject { + + override val context = "docker" + override val map = Map( + "token" -> JsString(token) + ) +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActor.scala new file mode 100644 index 00000000000..a0fc2e96317 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActor.scala @@ -0,0 +1,88 @@ +package cromwell.backend.google.batch.callcaching + +import com.google.cloud.storage.contrib.nio.CloudStorageOptions +import common.util.TryUtil +import cromwell.backend.BackendInitializationData +import cromwell.backend.google.batch.models.GcpBackendInitializationData +import cromwell.backend.io.JobPaths +import cromwell.backend.standard.callcaching.{StandardCacheHitCopyingActor, StandardCacheHitCopyingActorParams} +import cromwell.core.CallOutputs +import cromwell.core.io.{IoCommand, IoTouchCommand} +import cromwell.core.path.Path +import cromwell.core.simpleton.{WomValueBuilder, WomValueSimpleton} +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder +import wom.values.WomFile + +import scala.language.postfixOps +import scala.util.Try + +class BatchBackendCacheHitCopyingActor(standardParams: StandardCacheHitCopyingActorParams) extends StandardCacheHitCopyingActor(standardParams){ + override protected val commandBuilder: GcsBatchCommandBuilder.type = GcsBatchCommandBuilder + private val cachingStrategy: BatchCacheHitDuplicationStrategy = BackendInitializationData + .as[GcpBackendInitializationData](standardParams.backendInitializationDataOption) + .gcpBatchConfiguration.batchAttributes.cacheHitDuplicationStrategy + + override def processSimpletons(womValueSimpletons: Seq[WomValueSimpleton], + sourceCallRootPath: Path, + ): Try[(CallOutputs, Set[IoCommand[_]])] = + cachingStrategy match { + case CopyCachedOutputs => super.processSimpletons(womValueSimpletons, sourceCallRootPath) + case UseOriginalCachedOutputs => + val touchCommands: Seq[Try[IoTouchCommand]] = womValueSimpletons collect { + case WomValueSimpleton(_, wdlFile: WomFile) => getPath(wdlFile.value) flatMap GcsBatchCommandBuilder.touchCommand + } + + TryUtil.sequence(touchCommands) map { + WomValueBuilder.toJobOutputs(jobDescriptor.taskCall.outputPorts, womValueSimpletons) -> _.toSet + } + } + + override def extractBlacklistPrefix(path: String): Option[String] = Option(path.stripPrefix("gs://").takeWhile(_ != '/')) + + override def processDetritus(sourceJobDetritusFiles: Map[String, String] + ): Try[(Map[String, Path], Set[IoCommand[_]])] = + cachingStrategy match { + case CopyCachedOutputs => super.processDetritus(sourceJobDetritusFiles) + case UseOriginalCachedOutputs => + // apply getPath on each detritus string file + val detritusAsPaths = detritusFileKeys(sourceJobDetritusFiles).toSeq map { key => + key -> getPath(sourceJobDetritusFiles(key)) + } toMap + + // Don't forget to re-add the CallRootPathKey that has been filtered out by detritusFileKeys + TryUtil.sequenceMap(detritusAsPaths, "Failed to make paths out of job detritus") flatMap { newDetritus => + Try 
{ + // PROD-444: Keep It Short and Simple: Throw on the first error and let the outer Try catch-and-re-wrap + (newDetritus + (JobPaths.CallRootPathKey -> destinationCallRootPath)) -> + newDetritus.values.map(GcsBatchCommandBuilder.touchCommand(_).get).toSet + } + } + } + + override protected def additionalIoCommands(sourceCallRootPath: Path, + originalSimpletons: Seq[WomValueSimpleton], + newOutputs: CallOutputs, + originalDetritus: Map[String, String], + newDetritus: Map[String, Path]): Try[List[Set[IoCommand[_]]]] = Try { + cachingStrategy match { + case UseOriginalCachedOutputs => + val content = + s""" + |This directory does not contain any output files because this job matched an identical job that was previously run, thus it was a cache-hit. + |Cromwell is configured to not copy outputs during call caching. To change this, edit the filesystems.gcs.caching.duplication-strategy field in your backend configuration. + |The original outputs can be found at this location: ${sourceCallRootPath.pathAsString} + """.stripMargin + + // PROD-444: Keep It Short and Simple: Throw on the first error and let the outer Try catch-and-re-wrap + List(Set( + GcsBatchCommandBuilder.writeCommand( + path = jobPaths.forCallCacheCopyAttempts.callExecutionRoot / "call_caching_placeholder.txt", + content = content, + options = Seq(CloudStorageOptions.withMimeType("text/plain")), + ).get + )) + case CopyCachedOutputs => List.empty + } + } + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendFileHashingActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendFileHashingActor.scala new file mode 100644 index 00000000000..a3f000e7085 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchBackendFileHashingActor.scala @@ -0,0 +1,8 @@ +package cromwell.backend.google.batch.callcaching + +import cromwell.backend.standard.callcaching.{StandardFileHashingActor, StandardFileHashingActorParams} +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder + +class BatchBackendFileHashingActor(standardParams: StandardFileHashingActorParams) extends StandardFileHashingActor(standardParams) { + override val ioCommandBuilder = GcsBatchCommandBuilder +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchCacheHitDuplicationStrategy.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchCacheHitDuplicationStrategy.scala new file mode 100644 index 00000000000..778a0c2fefe --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/callcaching/BatchCacheHitDuplicationStrategy.scala @@ -0,0 +1,7 @@ +package cromwell.backend.google.batch.callcaching + +sealed trait BatchCacheHitDuplicationStrategy + +case object CopyCachedOutputs extends BatchCacheHitDuplicationStrategy +case object UseOriginalCachedOutputs extends BatchCacheHitDuplicationStrategy + diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/errors/InvalidGcsPathsInManifestFileException.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/errors/InvalidGcsPathsInManifestFileException.scala new file mode 100644 index 00000000000..708906e8f17 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/errors/InvalidGcsPathsInManifestFileException.scala @@ -0,0 
+1,7 @@ +package cromwell.backend.google.batch.errors + +import scala.util.control.NoStackTrace + +class InvalidGcsPathsInManifestFileException(paths: List[String]) extends Exception with NoStackTrace { + override def getMessage: String = s"Some of the paths in manifest file are not valid GCS paths: \n${paths.mkString("\n")}" +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/DiskType.java b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/DiskType.java new file mode 100644 index 00000000000..1adfa3e53b9 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/DiskType.java @@ -0,0 +1,15 @@ +package cromwell.backend.google.batch.io; + +public enum DiskType { + LOCAL("LOCAL", "LOCAL_SSD"), + SSD("SSD", "PERSISTENT_SSD"), + HDD("HDD", "PERSISTENT_HDD"); + + public final String diskTypeName; + public final String googleTypeName; + + DiskType(final String diskTypeName, final String googleTypeName) { + this.diskTypeName = diskTypeName; + this.googleTypeName = googleTypeName; + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDisk.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDisk.scala new file mode 100644 index 00000000000..0e49985e24f --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDisk.scala @@ -0,0 +1,98 @@ +package cromwell.backend.google.batch.io + +import cats.data.Validated._ +import cats.syntax.apply._ +import cats.syntax.validated._ +import common.exception.MessageAggregation +import common.validation.ErrorOr._ +import cromwell.backend.DiskPatterns._ +import cromwell.core.path.{DefaultPathBuilder, Path} +import wdl4s.parser.MemoryUnit +import wom.format.MemorySize +import wom.values._ + +import scala.util.Try + +object GcpBatchAttachedDisk { + def parse(s: String): Try[GcpBatchAttachedDisk] = { + + def sizeGbValidation(sizeGbString: String): ErrorOr[Int] = validateLong(sizeGbString).map(_.toInt) + + def diskTypeValidation(diskTypeString: String): ErrorOr[DiskType] = validateDiskType(diskTypeString) + + val validation: ErrorOr[GcpBatchAttachedDisk] = s match { + case WorkingDiskPattern(sizeGb, diskType) => (validateDiskType(diskType), sizeGbValidation(sizeGb)) mapN { + GcpBatchWorkingDisk.apply + } + case MountedDiskPattern(mountPoint, sizeGb, diskType) => (sizeGbValidation(sizeGb), diskTypeValidation(diskType)) mapN { (s, dt) => PipelinesApiEmptyMountedDisk(dt, s, DefaultPathBuilder.get(mountPoint)) } + case _ => s"Disk strings should be of the format 'local-disk SIZE TYPE' or '/mount/point SIZE TYPE' but got: '$s'".invalidNel + } + + Try(validation match { + case Valid(localDisk) => localDisk + case Invalid(nels) => + throw new UnsupportedOperationException with MessageAggregation { + val exceptionContext = "" + val errorMessages: List[String] = nels.toList + } + }) + } + + private def validateDiskType(diskTypeName: String): ErrorOr[DiskType] = { + DiskType.values().find(_.diskTypeName == diskTypeName) match { + case Some(diskType) => diskType.validNel + case None => + val diskTypeNames = DiskType.values.map(_.diskTypeName).mkString(", ") + s"Disk TYPE $diskTypeName should be one of $diskTypeNames".invalidNel + } + } + + private def validateLong(value: String): ErrorOr[Long] = { + try { + value.toLong.validNel + } catch { + case _: IllegalArgumentException 
=> s"$value not convertible to a Long".invalidNel + } + } + + implicit class EnhancedDisks(val disks: Seq[GcpBatchAttachedDisk]) extends AnyVal { + def adjustWorkingDiskWithNewMin(minimum: MemorySize, onAdjustment: => Unit): Seq[GcpBatchAttachedDisk] = disks map { + case disk: GcpBatchWorkingDisk if disk == GcpBatchWorkingDisk.Default && disk.sizeGb < minimum.to(MemoryUnit.GB).amount.toInt => + onAdjustment + disk.copy(sizeGb = minimum.to(MemoryUnit.GB).amount.toInt) + case other => other + } + } +} + +trait GcpBatchAttachedDisk { + def name: String + def diskType: DiskType + def sizeGb: Int + def mountPoint: Path +} + +case class PipelinesApiEmptyMountedDisk(diskType: DiskType, sizeGb: Int, mountPoint: Path) extends GcpBatchAttachedDisk { + val name = s"d-${mountPoint.pathAsString.md5Sum}" + + override def toString: String = s"$mountPoint $sizeGb ${diskType.diskTypeName}" +} + +object GcpBatchWorkingDisk { + val MountPoint: Path = DefaultPathBuilder.get("/mnt/disks/cromwell_root") + val Name = "local-disk" + val Default = GcpBatchWorkingDisk(DiskType.SSD, 10) +} + +case class GcpBatchWorkingDisk(diskType: DiskType, sizeGb: Int) extends GcpBatchAttachedDisk { + val mountPoint: Path = GcpBatchWorkingDisk.MountPoint + val name: String = GcpBatchWorkingDisk.Name + + override def toString: String = s"$name $sizeGb ${diskType.diskTypeName}" +} + +case class GcpBatchReferenceFilesDisk(image: String, sizeGb: Int) extends GcpBatchAttachedDisk { + val mountPoint: Path = DefaultPathBuilder.get(s"/mnt/${image.md5Sum}") + val name: String = s"d-${mountPoint.pathAsString.md5Sum}" + val diskType: DiskType = DiskType.HDD +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/package.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/package.scala new file mode 100644 index 00000000000..25489ff6869 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/io/package.scala @@ -0,0 +1,31 @@ + +package cromwell.backend.google.batch + +import com.google.api.client.http.HttpResponseException +import com.google.cloud.storage.contrib.nio.CloudStorageOptions +import cromwell.core.path.Path + +package object io { + implicit class PathEnhanced(val path: Path) extends AnyVal { + + def writeAsJson(content: String): Path = { + path.writeBytes(content.getBytes.iterator)(Seq(CloudStorageOptions.withMimeType("application/json"))) + } + + def writeAsText(content: String): Path = { + path.writeBytes(content.getBytes.iterator)(Seq(CloudStorageOptions.withMimeType("text/plain"))) + } + } + + private [batch] def isFatalJesException(t: Throwable): Boolean = t match { + case e: HttpResponseException if e.getStatusCode == 403 => true + case e: HttpResponseException if e.getStatusCode == 400 && e.getContent.contains("INVALID_ARGUMENT") => true + case _ => false + } + + private [batch] def isTransientJesException(t: Throwable): Boolean = t match { + // Quota exceeded + case e: HttpResponseException if e.getStatusCode == 429 => true + case _ => false + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/CreateGcpBatchParameters.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/CreateGcpBatchParameters.scala new file mode 100644 index 00000000000..d6258169fa8 --- /dev/null +++ 
b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/CreateGcpBatchParameters.scala @@ -0,0 +1,10 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.BackendJobDescriptor + +case class CreateGcpBatchParameters(jobDescriptor: BackendJobDescriptor, + runtimeAttributes: GcpBatchRuntimeAttributes, + batchAttributes: GcpBatchConfigurationAttributes, + projectId: String, + region: String + ) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBackendInitializationData.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBackendInitializationData.scala new file mode 100644 index 00000000000..47169b30f0a --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBackendInitializationData.scala @@ -0,0 +1,16 @@ +package cromwell.backend.google.batch.models + +import com.google.auth.Credentials +import cromwell.backend.google.batch.util.BatchExpressionFunctions +import cromwell.backend.standard.{StandardInitializationData, StandardValidatedRuntimeAttributesBuilder} + +case class GcpBackendInitializationData( + override val workflowPaths: GcpBatchWorkflowPaths, + override val runtimeAttributesBuilder: StandardValidatedRuntimeAttributesBuilder, + gcpBatchConfiguration: GcpBatchConfiguration, + gcsCredentials: Credentials, + privateDockerEncryptionKeyName: Option[String], + privateDockerEncryptedToken: Option[String], + vpcNetworkAndSubnetworkProjectLabels: Option[VpcAndSubnetworkProjectLabelValues] + + ) extends StandardInitializationData(workflowPaths, runtimeAttributesBuilder, classOf[BatchExpressionFunctions] ) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfiguration.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfiguration.scala new file mode 100644 index 00000000000..dc397c5755e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfiguration.scala @@ -0,0 +1,38 @@ +package cromwell.backend.google.batch.models + +import com.typesafe.config.Config +import cromwell.backend.BackendConfigurationDescriptor +import cromwell.backend.google.batch.authentication.{GcpBatchAuths, GcpBatchDockerCredentials} +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.core.BackendDockerConfiguration +import net.ceedubs.ficus.Ficus._ +import spray.json._ + +import scala.concurrent.duration.FiniteDuration + + +class GcpBatchConfiguration(val configurationDescriptor: BackendConfigurationDescriptor, + val googleConfig: GoogleConfiguration, + val batchAttributes: GcpBatchConfigurationAttributes + ) extends DefaultJsonProtocol { + + val batchAuths: GcpBatchAuths = batchAttributes.auths + val root: String = configurationDescriptor.backendConfig.getString("root") + val batchTimeout: FiniteDuration = batchAttributes.batchTimeout + val runtimeConfig: Option[Config] = configurationDescriptor.backendRuntimeAttributesConfig + + + val dockerCredentials: Option[GcpBatchDockerCredentials] = { + BackendDockerConfiguration.build(configurationDescriptor.backendConfig).dockerCredentials map { creds => + GcpBatchDockerCredentials.apply(creds, googleConfig) + } + } + + val dockerEncryptionKeyName: Option[String] = dockerCredentials flatMap { _.keyName } + val dockerEncryptionAuthName: Option[String] = dockerCredentials flatMap { _.authName } + val dockerToken: 
Option[String] = dockerCredentials map { _.token } + + val jobShell: String = configurationDescriptor.backendConfig.as[Option[String]]("job-shell").getOrElse( + configurationDescriptor.globalConfig.getOrElse("system.job-shell", "/bin/bash")) + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala new file mode 100644 index 00000000000..0a57b0a2a5e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala @@ -0,0 +1,465 @@ +package cromwell.backend.google.batch.models + +import cats.data.Validated._ +import cats.data.{NonEmptyList, Validated} +import cats.implicits._ +import com.typesafe.config.{Config, ConfigValue} +import com.typesafe.scalalogging.StrictLogging +import common.exception.MessageAggregation +import common.validation.ErrorOr._ +import common.validation.Validation._ +import cromwell.backend.CommonBackendConfigurationAttributes +import cromwell.backend.google.batch._ +import cromwell.backend.google.batch.authentication.GcpBatchAuths +import cromwell.backend.google.batch.callcaching.{BatchCacheHitDuplicationStrategy, CopyCachedOutputs, UseOriginalCachedOutputs} +import cromwell.backend.google.batch.io.GcpBatchReferenceFilesDisk +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.{BatchRequestTimeoutConfiguration, GcsTransferConfiguration, VirtualPrivateCloudConfiguration} +import cromwell.backend.google.batch.util.{DockerImageCacheEntry, GcpBatchDockerCacheMappingOperations, GcpBatchReferenceFilesMappingOperations} +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.cloudsupport.gcp.auth.GoogleAuthMode +import cromwell.filesystems.gcs.GcsPathBuilder +import cromwell.filesystems.gcs.GcsPathBuilder.ValidFullGcsPath +import eu.timepit.refined.api.Refined +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.{refineMV, refineV} +import net.ceedubs.ficus.Ficus._ +import org.slf4j.{Logger, LoggerFactory} + +import scala.concurrent.duration._ +import scala.jdk.CollectionConverters._ +import scala.util.matching.Regex +import scala.util.{Failure, Success, Try} + +case class GcpBatchConfigurationAttributes(project: String, + computeServiceAccount: String, + auths: GcpBatchAuths, + restrictMetadataAccess: Boolean, + dockerhubToken: String, + enableFuse: Boolean, + executionBucket: String, + location: String, + maxPollingInterval: Int, + qps: Int Refined Positive, + cacheHitDuplicationStrategy: BatchCacheHitDuplicationStrategy, + requestWorkers: Int Refined Positive, + batchTimeout: FiniteDuration, + logFlushPeriod: Option[FiniteDuration], + gcsTransferConfiguration: GcsTransferConfiguration, + virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, + batchRequestTimeoutConfiguration: BatchRequestTimeoutConfiguration, + referenceFileToDiskImageMappingOpt: Option[Map[String, GcpBatchReferenceFilesDisk]], + dockerImageToCacheDiskImageMappingOpt: Option[Map[String, DockerImageCacheEntry]], + checkpointingInterval: FiniteDuration + ) + +object GcpBatchConfigurationAttributes extends GcpBatchDockerCacheMappingOperations with GcpBatchReferenceFilesMappingOperations with StrictLogging { + + /** + * param transferAttempts This is the number of attempts, not retries, hence it is positive. 
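+ * + * For example, a value of 3 means a transfer is tried at most 3 times in total, i.e. the original attempt plus two retries.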
+ */ + case class GcsTransferConfiguration(transferAttempts: Int Refined Positive, parallelCompositeUploadThreshold: String) + + final case class VirtualPrivateCloudLabels(network: String, subnetwork: Option[String], auth: GoogleAuthMode) + + final case class VirtualPrivateCloudLiterals(network: String, subnetwork: Option[String]) + + final case class VirtualPrivateCloudConfiguration(labelsOption: Option[VirtualPrivateCloudLabels], + literalsOption: Option[VirtualPrivateCloudLiterals], + ) + + final case class BatchRequestTimeoutConfiguration(readTimeoutMillis: Option[Int Refined Positive], connectTimeoutMillis: Option[Int Refined Positive]) + + + lazy val Logger: Logger = LoggerFactory.getLogger("BatchConfiguration") + + val BatchApiDefaultQps = 1000 + val DefaultGcsTransferAttempts: Refined[Int, Positive] = refineMV[Positive](3) + + val checkpointingIntervalKey = "checkpointing-interval" + + private val batchKeys = CommonBackendConfigurationAttributes.commonValidConfigurationAttributeKeys ++ Set( + "project", + "root", + "maximum-polling-interval", + "genomics", + "genomics.location", + "genomics.compute-service-account", + "genomics.auth", + "genomics.restrict-metadata-access", + "genomics.enable-fuse", + "genomics-api-queries-per-100-seconds", + "genomics.localization-attempts", + "genomics.parallel-composite-upload-threshold", + "filesystems", + "filesystems.drs.auth", + "filesystems.gcs.auth", + "filesystems.gcs.project", + "filesystems.gcs.caching.duplication-strategy", + "concurrent-job-limit", + "request-workers", + "batch-timeout", + "batch-requests.timeouts.read", + "batch-requests.timeouts.connect", + "default-runtime-attributes.bootDiskSizeGb", + "default-runtime-attributes.noAddress", + "default-runtime-attributes.preemptible", + "default-runtime-attributes.zones", + "virtual-private-cloud", + "virtual-private-cloud.network-name", + "virtual-private-cloud.subnetwork-name", + "virtual-private-cloud.network-label-key", + "virtual-private-cloud.subnetwork-label-key", + "virtual-private-cloud.auth", + "reference-disk-localization-manifests", + "docker-image-cache-manifest-file", + checkpointingIntervalKey + ) + + private val deprecatedJesKeys: Map[String, String] = Map( + "genomics.default-zones" -> "default-runtime-attributes.zones" + ) + + def apply(googleConfig: GoogleConfiguration, backendConfig: Config, backendName: String): GcpBatchConfigurationAttributes = { + + + def vpcErrorMessage(missingKeys: List[String]) = s"Virtual Private Cloud configuration is invalid. Missing keys: `${missingKeys.mkString(",")}`.".invalidNel + + def validateVPCLabelsConfig(networkOption: Option[String], + subnetworkOption: Option[String], + authOption: Option[String], + ): ErrorOr[Option[VirtualPrivateCloudLabels]] = { + (networkOption, subnetworkOption, authOption) match { + case (Some(network), _, Some(auth)) => googleConfig.auth(auth) match { + case Valid(validAuth) => + Option(VirtualPrivateCloudLabels(network, subnetworkOption, validAuth)).validNel + case Invalid(error) => s"Auth $auth is not valid for Virtual Private Cloud configuration. 
Reason: $error".invalidNel + } + case (Some(_), _, None) => vpcErrorMessage(List("auth")) + case (None, _, Some(_)) => vpcErrorMessage(List("network-label-key")) + case (None, Some(_), None) => vpcErrorMessage(List("network-label-key", "auth")) + case (None, None, None) => None.validNel + } + } + + + def validateVPCLiteralsConfig(networkNameOption: Option[String], + subnetworkNameOption: Option[String], + ): ErrorOr[Option[VirtualPrivateCloudLiterals]] = { + (networkNameOption, subnetworkNameOption) match { + case (None, Some(_)) => vpcErrorMessage(List("network-name")) + case (Some(networkName), _) => Option(VirtualPrivateCloudLiterals(networkName, subnetworkNameOption)).valid + case (None, None) => None.valid + } + } + + def validateVPCConfig(networkNameOption: Option[String], + subnetworkNameOption: Option[String], + networkLabelOption: Option[String], + subnetworkLabelOption: Option[String], + authOption: Option[String], + ): ErrorOr[VirtualPrivateCloudConfiguration] = { + val vpcLabelsValidation = + validateVPCLabelsConfig(networkLabelOption, subnetworkLabelOption, authOption) + val vpcLiteralsValidation = + validateVPCLiteralsConfig(networkNameOption, subnetworkNameOption) + (vpcLabelsValidation, vpcLiteralsValidation) mapN VirtualPrivateCloudConfiguration + } + + val configKeys = backendConfig.entrySet().asScala.toSet map { entry: java.util.Map.Entry[String, ConfigValue] => entry.getKey } + warnNotRecognized(configKeys, batchKeys, backendName, Logger) + + + def warnDeprecated(keys: Set[String], deprecated: Map[String, String], logger: Logger): Unit = { + val deprecatedKeys = keys.intersect(deprecated.keySet) + deprecatedKeys foreach { key => logger.warn(s"Found deprecated configuration key $key, replaced with ${deprecated.get(key)}") } + } + + warnDeprecated(configKeys, deprecatedJesKeys, Logger) + + + val project: ErrorOr[String] = validate { + backendConfig.as[String]("project") + } + val executionBucket: ErrorOr[String] = validate { + backendConfig.as[String]("root") + } + val location: ErrorOr[String] = validate { + backendConfig.as[String]("genomics.location") + } + val maxPollingInterval: Int = backendConfig.as[Option[Int]]("maximum-polling-interval").getOrElse(600) + val computeServiceAccount: String = backendConfig.as[Option[String]]("genomics.compute-service-account").getOrElse("default") + val genomicsAuthName: ErrorOr[String] = validate { + backendConfig.as[String]("genomics.auth") + } + val genomicsRestrictMetadataAccess: ErrorOr[Boolean] = validate { + backendConfig.as[Option[Boolean]]("genomics.restrict-metadata-access").getOrElse(false) + } + val genomicsEnableFuse: ErrorOr[Boolean] = validate { + backendConfig.as[Option[Boolean]]("genomics.enable-fuse").getOrElse(false) + } + + val dockerhubToken: ErrorOr[String] = validate { + backendConfig.as[Option[String]]("dockerhub.token").getOrElse("") + } + + val gcsFilesystemAuthName: ErrorOr[String] = validate { + backendConfig.as[String]("filesystems.gcs.auth") + } + val qpsValidation = validateQps(backendConfig) + val duplicationStrategy = validate { + backendConfig.as[Option[String]]("filesystems.gcs.caching.duplication-strategy").getOrElse("copy") match { + case "copy" => CopyCachedOutputs + case "reference" => UseOriginalCachedOutputs + case other => throw new IllegalArgumentException(s"Unrecognized caching duplication strategy: $other. Supported strategies are copy and reference. 
See reference.conf for more details.") + } + } + val requestWorkers: ErrorOr[Int Refined Positive] = validatePositiveInt(backendConfig.as[Option[Int]]("request-workers").getOrElse(3), "request-workers") + + val batchTimeout: FiniteDuration = backendConfig.getOrElse("batch-timeout", 7.days) + + val logFlushPeriod: Option[FiniteDuration] = backendConfig.as[Option[FiniteDuration]]("log-flush-period") match { + case Some(duration) if duration.isFinite => Option(duration) + // "Inf" disables upload + case Some(_) => None + // Defaults to 1 minute + case None => Option(1.minute) + } + + val parallelCompositeUploadThreshold = validateGsutilMemorySpecification(backendConfig, "genomics.parallel-composite-upload-threshold") + + val localizationAttempts: ErrorOr[Int Refined Positive] = backendConfig.as[Option[Int]]("genomics.localization-attempts") + .map(attempts => validatePositiveInt(attempts, "genomics.localization-attempts")) + .getOrElse(DefaultGcsTransferAttempts.validNel) + + val gcsTransferConfiguration: ErrorOr[GcsTransferConfiguration] = + (localizationAttempts, parallelCompositeUploadThreshold) mapN GcsTransferConfiguration.apply + + val vpcNetworkName: ErrorOr[Option[String]] = validate { + backendConfig.getAs[String]("virtual-private-cloud.network-name") + } + val vpcSubnetworkName: ErrorOr[Option[String]] = validate { + backendConfig.getAs[String]("virtual-private-cloud.subnetwork-name") + } + val vpcNetworkLabel: ErrorOr[Option[String]] = validate { + backendConfig.getAs[String]("virtual-private-cloud.network-label-key") + } + val vpcSubnetworkLabel: ErrorOr[Option[String]] = validate { + backendConfig.getAs[String]("virtual-private-cloud.subnetwork-label-key") + } + val vpcAuth: ErrorOr[Option[String]] = validate { + backendConfig.getAs[String]("virtual-private-cloud.auth") + } + + val virtualPrivateCloudConfiguration: ErrorOr[VirtualPrivateCloudConfiguration] = { + (vpcNetworkName, vpcSubnetworkName, vpcNetworkLabel, vpcSubnetworkLabel, vpcAuth) flatMapN validateVPCConfig + } + + val batchRequestsReadTimeout = readOptionalPositiveMillisecondsIntFromDuration(backendConfig, "batch-requests.timeouts.read") + val batchRequestsConnectTimeout = readOptionalPositiveMillisecondsIntFromDuration(backendConfig, "batch-requests.timeouts.connect") + + val batchRequestTimeoutConfigurationValidation = (batchRequestsReadTimeout, batchRequestsConnectTimeout) mapN { (read, connect) => + BatchRequestTimeoutConfiguration(readTimeoutMillis = read, connectTimeoutMillis = connect) + } + + val referenceDiskLocalizationManifestFiles: ErrorOr[Option[List[ManifestFile]]] = validateReferenceDiskManifestConfigs(backendConfig, backendName) + + val dockerImageCacheManifestFile: ErrorOr[Option[ValidFullGcsPath]] = validateGcsPathToDockerImageCacheManifestFile(backendConfig) + + val checkpointingInterval: FiniteDuration = backendConfig.getOrElse(checkpointingIntervalKey, 10.minutes) + + def authGoogleConfigForBatchConfigurationAttributes(project: String, + bucket: String, + genomicsName: String, + location: String, + restrictMetadata: Boolean, + dockerhubToken: String, + enableFuse: Boolean, + gcsName: String, + qps: Int Refined Positive, + cacheHitDuplicationStrategy: BatchCacheHitDuplicationStrategy, + requestWorkers: Int Refined Positive, + gcsTransferConfiguration: GcsTransferConfiguration, + virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, + batchRequestTimeoutConfiguration: BatchRequestTimeoutConfiguration, + referenceDiskLocalizationManifestFilesOpt: Option[List[ManifestFile]], + 
dockerImageCacheManifestFileOpt: Option[ValidFullGcsPath]): ErrorOr[GcpBatchConfigurationAttributes] = + (googleConfig.auth(genomicsName), googleConfig.auth(gcsName)) mapN { + (genomicsAuth, gcsAuth) => + val generatedReferenceFilesMappingOpt = referenceDiskLocalizationManifestFilesOpt map { + generateReferenceFilesMapping(genomicsAuth, _) + } + val dockerImageToCacheDiskImageMappingOpt = dockerImageCacheManifestFileOpt map { + generateDockerImageToDiskImageMapping(genomicsAuth, _) + } + models.GcpBatchConfigurationAttributes( + project = project, + computeServiceAccount = computeServiceAccount, + auths = GcpBatchAuths(genomicsAuth, gcsAuth), + restrictMetadataAccess = restrictMetadata, + dockerhubToken = dockerhubToken, + enableFuse = enableFuse, + executionBucket = bucket, + location = location, + maxPollingInterval = maxPollingInterval, + qps = qps, + cacheHitDuplicationStrategy = cacheHitDuplicationStrategy, + requestWorkers = requestWorkers, + batchTimeout = batchTimeout, + logFlushPeriod = logFlushPeriod, + gcsTransferConfiguration = gcsTransferConfiguration, + virtualPrivateCloudConfiguration = virtualPrivateCloudConfiguration, + batchRequestTimeoutConfiguration = batchRequestTimeoutConfiguration, + referenceFileToDiskImageMappingOpt = generatedReferenceFilesMappingOpt, + dockerImageToCacheDiskImageMappingOpt = dockerImageToCacheDiskImageMappingOpt, + checkpointingInterval = checkpointingInterval + ) + } + + + (project, + executionBucket, + genomicsAuthName, + location, + genomicsRestrictMetadataAccess, + dockerhubToken, + genomicsEnableFuse, + gcsFilesystemAuthName, + qpsValidation, + duplicationStrategy, + requestWorkers, + gcsTransferConfiguration, + virtualPrivateCloudConfiguration, + batchRequestTimeoutConfigurationValidation, + referenceDiskLocalizationManifestFiles, + dockerImageCacheManifestFile + ) flatMapN authGoogleConfigForBatchConfigurationAttributes match { + case Valid(r) => r + case Invalid(f) => + throw new IllegalArgumentException with MessageAggregation { + override val exceptionContext = "Google Cloud Batch configuration is not valid: Errors" + override val errorMessages: List[String] = f.toList + } + } + } + + private def validateSingleGcsPath(gcsPath: String): ErrorOr[ValidFullGcsPath] = { + GcsPathBuilder.validateGcsPath(gcsPath) match { + case validPath: ValidFullGcsPath => validPath.validNel + case invalidPath => s"Invalid GCS path: $invalidPath".invalidNel + } + } + + private[batch] def validateGcsPathToDockerImageCacheManifestFile(backendConfig: Config): ErrorOr[Option[ValidFullGcsPath]] = { + backendConfig.getAs[String]("docker-image-cache-manifest-file") match { + case Some(gcsPath) => validateSingleGcsPath(gcsPath).map(Option.apply) + case None => None.validNel + } + } + + /** + * Validate that the entries corresponding to "reference-disk-localization-manifests" in the specified + * backend are parseable as `ManifestFile`s. 
+ */ + private[batch] def validateReferenceDiskManifestConfigs(backendConfig: Config, backendName: String): ErrorOr[Option[List[ManifestFile]]] = { + Try(backendConfig.getAs[List[Config]]("reference-disk-localization-manifests")) match { + case Failure(e) => + ("Error attempting to parse value for 'reference-disk-localization-manifests' as List[Config]: " + + e.getMessage).invalidNel + case Success(s) => + s match { + case Some(configs) => + import _root_.io.circe.config.parser + import _root_.io.circe.generic.auto._ + + // Unfortunately the `as` method of `config` collides with the Ficus method of the same name, so invoke its + // equivalent using clunkier syntax: + configs traverse parser.decode[ManifestFile] match { + case Right(manifests) => + logger.info(s"Reference disks feature for $backendName backend is configured with the following reference images: ${manifests.map(_.imageIdentifier).mkString(", ")}.") + Option(manifests).validNel + case Left(err) => + val message = s"Reference disks misconfigured for backend $backendName, could not parse as List[ManifestFile]" + logger.error(message, err.getCause) + s"$message: ${err.getMessage}".invalidNel + } + case None => + logger.info(s"Reference disks feature for $backendName backend is not configured.") + None.validNel + } + } + } + + def validateQps(config: Config): ErrorOr[Int Refined Positive] = { + import eu.timepit.refined._ + + val qp100s = config.as[Option[Int]]("genomics-api-queries-per-100-seconds").getOrElse(BatchApiDefaultQps) + val qpsCandidate = qp100s / 100 + + refineV[Positive](qpsCandidate) match { + case Left(_) => s"Calculated QPS for Google Genomics API ($qpsCandidate/s) was not a positive integer (supplied value was $qp100s per 100s)".invalidNel + case Right(refined) => refined.validNel + } + } + + def validateGsutilMemorySpecification(config: Config, configPath: String): ErrorOr[String] = { + val entry = config.as[Option[String]](configPath) + entry match { + case None => "0".validNel + case Some(v@GsutilHumanBytes(_, _)) => v.validNel + case Some(bad) => s"Invalid gsutil memory specification in Cromwell configuration at path '$configPath': '$bad'".invalidNel + } + } + + def validatePositiveInt(n: Int, configPath: String): Validated[NonEmptyList[String], Refined[Int, Positive]] = { + refineV[Positive](n) match { + case Left(_) => s"Value $n for $configPath is not strictly positive".invalidNel + case Right(refined) => refined.validNel + } + } + + def readOptionalPositiveMillisecondsIntFromDuration(backendConfig: Config, configPath: String): ErrorOr[Option[Int Refined Positive]] = { + + def validate(n: FiniteDuration) = { + val result: ErrorOr[Int Refined Positive] = Try(n.toMillis.toInt).toErrorOr flatMap { millisInt => + refineV[Positive](millisInt) match { + case Left(_) => s"Value $n for $configPath is not strictly positive".invalidNel + case Right(refined) => refined.validNel + } + } + + result.contextualizeErrors(s"Parse '$configPath' value $n as a positive Int (in milliseconds)") + } + + backendConfig.as[Option[FiniteDuration]](configPath) match { + case Some(value) => validate(value).map(Option.apply) + case None => None.validNel + } + } + + // Copy/port of gsutil's "_GenerateSuffixRegex" + private[batch] lazy val GsutilHumanBytes: Regex = { + val _EXP_STRINGS = List( + List("B", "bit"), + List("KiB", "Kibit", "K"), + List("MiB", "Mibit", "M"), + List("GiB", "Gibit", "G"), + List("TiB", "Tibit", "T"), + List("PiB", "Pibit", "P"), + List("EiB", "Eibit", "E"), + ) + + val suffixes = for { + unit <- _EXP_STRINGS + 
name <- unit + } yield name + + // Differs from the Python original in a couple of ways: + //* The Python original uses named groups which are not supported in Scala regexes. + // (?P\d*\.\d+|\d+)\s*(?P%s)? + // + // * The Python original lowercases both the units and the human string before running the matcher. + // This Scala version turns on the (?i) case insensitive matching regex option instead. + val orSuffixes = suffixes.mkString("|") + "(?i)(\\d*\\.\\d+|\\d+)\\s*(%s)?".format(orSuffixes).r + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala new file mode 100644 index 00000000000..84366f4e969 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala @@ -0,0 +1,184 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.google.batch.models.CustomMachineType._ +import eu.timepit.refined.api.Refined +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.refineV +import mouse.all._ +import org.slf4j.Logger +import wdl4s.parser.MemoryUnit +import wom.format.MemorySize + +import scala.math.{log, pow} + +/** + * Adjusts memory and cpu for custom machine types. + * + * For more info see: + * - https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type + * - https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#specifications + * - https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#gcloud + * - https://cloud.google.com/sdk/gcloud/reference/compute/instances/create#--custom-vm-type + */ +trait CustomMachineType { + /** + * The vm prefix to create this custom machine type. + */ + def vmTypePrefix: String + + /** + * The minimum memory per cpu. + */ + def minMemoryPerCpu: MemorySize + + /** + * The maximum memory per cpu. + */ + def maxMemoryPerCpu: MemorySize + + /** + * The total memory for a custom machine type must be a multiple of this value. + */ + def memoryFactor: MemorySize + + /** + * Increase the cpu to the next valid amount for this machine type. + */ + def validateCpu(cpu: Int Refined Positive): Int + + /** + * Increase the memory to the next valid amount for this machine type. + */ + def validateMemory(memory: MemorySize): MemorySize + + /** + * Generates a custom machine type based on the requested memory and cpu + */ + def machineType(requestedMemory: MemorySize, + requestedCpu: Int Refined Positive, + jobLogger: Logger, + ): String = { + val memory = requestedMemory |> validateMemory + val cpu = requestedCpu |> validateCpu + + val memoryPerCpuRatio = memory.bytes / cpu.toDouble + + lazy val adjustedMemory = MemorySize(minMemoryPerCpu.amount * cpu.toDouble, minMemoryPerCpu.unit) |> validateMemory + + lazy val adjustedCpu = refineV[Positive]((memory.bytes / maxMemoryPerCpu.bytes).ceil.toInt) match { + // If for some reason the above yields 0, keep the cpu value unchanged + case Left(_) => cpu + case Right(adjusted) => adjusted |> validateCpu + } + + val (validCpu, validMemory) = + if (memoryPerCpuRatio < minMemoryPerCpu.bytes) { + // If we're under the ratio, top up the memory. 
Because validMemory will only increase memory (if needed), + // there's no risk that the call to validMemory will make the ratio invalid + (cpu, adjustedMemory) + } else if (memoryPerCpuRatio > maxMemoryPerCpu.bytes) { + // If we're over the ratio, top up the CPU. Because validCpu will only increase CPU (if needed), there's no risk + // that the call to validCpu will make the ratio invalid + (adjustedCpu, memory) + } else { + // + (cpu, memory) + } + logAdjustment(requestedCpu.value, validCpu, requestedMemory, validMemory, jobLogger) + s"${vmTypePrefix}custom-$validCpu-${validMemory.to(MemoryUnit.MB).amount.intValue()}" + } + + private def logAdjustment(originalCpu: Int, + adjustedCpu: Int, + originalMemory: MemorySize, + adjustedMemory: MemorySize, + logger: Logger, + ): Unit = { + def memoryAdjustmentLog = s"memory was adjusted from ${originalMemory.toMBString} to ${adjustedMemory.toMBString}" + + def cpuAdjustmentLog = s"cpu was adjusted from $originalCpu to $adjustedCpu" + + val messageOption = + ( + originalCpu == adjustedCpu, + originalMemory.to(MemoryUnit.MB).amount == adjustedMemory.to(MemoryUnit.MB).amount, + ) match { + case (true, false) => Option(memoryAdjustmentLog) + case (false, true) => Option(cpuAdjustmentLog) + case (false, false) => Option(memoryAdjustmentLog + " and " + cpuAdjustmentLog) + case _ => None + } + + messageOption foreach { message => logger.info("To comply with GCE custom machine requirements, " + message) } + } +} + +object CustomMachineType { + implicit class EnhancedInformation(val information: MemorySize) extends AnyVal { + def asMultipleOf(factor: MemorySize): MemorySize = + MemorySize(factor.amount * (information.bytes / factor.bytes).ceil, factor.unit) + + def toMBString: String = information.to(MemoryUnit.MB).toString + } +} + +case object N1CustomMachineType extends CustomMachineType { + // For now using the legacy empty prefix that implies "n1-" + override val vmTypePrefix: String = "" + override val minMemoryPerCpu: MemorySize = MemorySize(0.9, MemoryUnit.GB) + override val maxMemoryPerCpu: MemorySize = MemorySize(6.5, MemoryUnit.GB) + override val memoryFactor: MemorySize = MemorySize(256, MemoryUnit.MB) + + override def validateCpu(cpu: Refined[Int, Positive]): Int = { + // Either one cpu, or an even number of cpus + cpu.value match { + case 1 => 1 + case cpu => cpu + (cpu % 2) + } + } + + override def validateMemory(memory: MemorySize): MemorySize = { + memory.asMultipleOf(memoryFactor) + } +} + +case object N2CustomMachineType extends CustomMachineType { + override val vmTypePrefix: String = "n2-" + override val minMemoryPerCpu: MemorySize = MemorySize(1.0, MemoryUnit.GB) + override val maxMemoryPerCpu: MemorySize = MemorySize(8.0, MemoryUnit.GB) + override val memoryFactor: MemorySize = MemorySize(256, MemoryUnit.MB) + + override def validateCpu(cpu: Refined[Int, Positive]): Int = { + // cpus must be divisible by 2 up to 32, and higher numbers must be divisible by 4 + cpu.value match { + case cpu if cpu <= 32 => cpu + (cpu % 2) + case cpu if cpu % 4 == 0 => cpu + case cpu => cpu + (4 - (cpu % 4)) + } + } + + override def validateMemory(memory: MemorySize): MemorySize = { + memory.asMultipleOf(memoryFactor) + } +} + +case object N2DCustomMachineType extends CustomMachineType { + override val vmTypePrefix: String = "n2d-" + override val minMemoryPerCpu: MemorySize = MemorySize(0.5, MemoryUnit.GB) + override val maxMemoryPerCpu: MemorySize = MemorySize(8.0, MemoryUnit.GB) + override val memoryFactor: MemorySize = MemorySize(256, MemoryUnit.MB) 
+ + override def validateCpu(cpu: Refined[Int, Positive]): Int = { + cpu.value match { + case cpu if cpu <= 16 => 2 max pow(2, (log(cpu.toDouble)/log(2)).ceil).toInt + case cpu if cpu > 16 && cpu <= 96 && cpu % 16 == 0 => cpu + case cpu if cpu > 16 && cpu <= 96 => cpu + 16 - (cpu % 16) + case cpu if cpu > 96 => 96 + } + } + + override def validateMemory(memory: MemorySize): MemorySize = { + memory.asMultipleOf(memoryFactor) + } +} + diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala new file mode 100644 index 00000000000..923e5dc16f5 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala @@ -0,0 +1,43 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.BackendJobDescriptorKey +import cromwell.backend.io.JobPaths +import cromwell.core.path.Path + +object GcpBatchJobPaths { + + val BatchLogPathKey = "jesLog" + val BatchMonitoringKey = "monitoring" + val BatchMonitoringImageKey = "monitoringImage" + val BatchExecParamName = "exec" + val GcsTransferLibraryName = "gcs_transfer.sh" + val GcsLocalizationScriptName = "gcs_localization.sh" + val GcsDelocalizationScriptName = "gcs_delocalization.sh" + val DrsLocalizationManifestName = "drs_manifest" + +} +case class GcpBatchJobPaths(override val workflowPaths: GcpBatchWorkflowPaths, jobKey: BackendJobDescriptorKey, override val isCallCacheCopyAttempt: Boolean = false) extends JobPaths { + + def batchLogBasename = { + val index = jobKey + .index + .map(s => s"-$s") + .getOrElse("") + s"${ + jobKey + .node + .localName + }$index" + } + + val batchLogFilename: String = s"$batchLogBasename.log" + lazy val batchLogPath: Path = callExecutionRoot.resolve(batchLogFilename) + + val batchMonitoringLogFilename: String = s"${GcpBatchJobPaths.BatchMonitoringKey}.log" + lazy val batchMonitoringLogPath: Path = callExecutionRoot.resolve(batchMonitoringLogFilename) + + val batchMonitoringScriptFilename: String = s"${GcpBatchJobPaths.BatchMonitoringKey}.sh" + val batchMonitoringImageScriptFilename: String = s"${GcpBatchJobPaths.BatchMonitoringImageKey}.sh" + + override def forCallCacheCopyAttempts: JobPaths = this.copy(isCallCacheCopyAttempt = true) +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchParameters.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchParameters.scala new file mode 100644 index 00000000000..c2c75ce5667 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchParameters.scala @@ -0,0 +1,83 @@ +package cromwell.backend.google.batch.models + +import akka.http.scaladsl.model.ContentType +import cromwell.backend.google.batch.io.GcpBatchAttachedDisk +import cromwell.core.path.Path + +import scala.concurrent.duration.FiniteDuration + +sealed trait BatchParameter { + def name: String + + def mount: GcpBatchAttachedDisk + + /** + * The Path where the input file resides. The backend-specific localization + * code handles creating actions for each specific filesystem + * implementation. + * + * e.g: gs://root_bucket/input_data/my_input.bam + */ + def cloudPath: Path + + /** + * Relative path on the host machine where the file should be localized to / delocalized from. 
+ * Note that the actual localization / delocalization happens in a docker container, therefore + * [[containerPath]] should be used as the actual source / destination path when localizing / delocalizing + * + * e.g: root_bucket/input_data/my_input.bam + */ + def relativeHostPath: Path + + /** + * Path in the docker container. It must be mounted on the docker from / to its hostPath + * + * e.g: /cromwell_root/root_bucket/input_data/my_input.bam + */ + def containerPath: Path = mount.mountPoint.resolve(relativeHostPath) + + /** + * True if this parameter represents a file; false if it represents a directory. + */ + def isFileParameter: Boolean = this match { + case _: GcpBatchFileInput => true + case _: GcpBatchFileOutput => true + case _: GcpBatchDirectoryInput => false + case _: GcpBatchDirectoryOutput => false + } +} + +sealed trait GcpBatchInput extends BatchParameter +sealed trait GcpBatchOutput extends BatchParameter { + def contentType: Option[ContentType] = None +} + +final case class GcpBatchFileInput(name: String, + cloudPath: Path, + relativeHostPath: Path, + mount: GcpBatchAttachedDisk) extends GcpBatchInput + +final case class GcpBatchDirectoryInput(name: String, + cloudPath: Path, + relativeHostPath: Path, + mount: GcpBatchAttachedDisk) extends GcpBatchInput + +final case class GcpBatchFileOutput(name: String, + cloudPath: Path, + relativeHostPath: Path, + mount: GcpBatchAttachedDisk, + optional: Boolean, + secondary: Boolean, + uploadPeriod: Option[FiniteDuration] = None, + override val contentType: Option[ContentType] = None) extends GcpBatchOutput + +final case class GcpBatchDirectoryOutput(name: String, + cloudPath: Path, + relativeHostPath: Path, + mount: GcpBatchAttachedDisk, + optional: Boolean, + secondary: Boolean, + override val contentType: Option[ContentType] = None) extends GcpBatchOutput + +// TODO: Remove when support for V1 is stopped, this is only used to pass the extra_param auth file +final case class GcpBatchLiteralInput(name: String, value: String) \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRequest.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRequest.scala new file mode 100644 index 00000000000..3c616c0ba4e --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRequest.scala @@ -0,0 +1,9 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters +import cromwell.core.WorkflowId + +case class GcpBatchRequest(workflowId: WorkflowId, + createParameters: CreateBatchJobParameters, + jobName: String, + gcpBatchParameters: CreateGcpBatchParameters) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala new file mode 100644 index 00000000000..9fe3327f35d --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala @@ -0,0 +1,294 @@ +package cromwell.backend.google.batch.models + +import cats.implicits.{catsSyntaxValidatedId, toTraverseOps} +import com.typesafe.config.Config +import common.validation.ErrorOr.ErrorOr +import cromwell.backend.google.batch.io.{GcpBatchAttachedDisk, GcpBatchWorkingDisk} +import 
cromwell.backend.google.batch.models.GpuResource.GpuType +import cromwell.backend.google.batch.util.{GpuTypeValidation, GpuValidation} +import cromwell.backend.standard.StandardValidatedRuntimeAttributesBuilder +import cromwell.backend.validation._ +import eu.timepit.refined._ +import eu.timepit.refined.api.Refined +import eu.timepit.refined.numeric.Positive +import wdl4s.parser.MemoryUnit +import wom.RuntimeAttributesKeys +import wom.format.MemorySize +import wom.types.{WomArrayType, WomStringType, WomType} +import wom.values.{WomArray, WomBoolean, WomInteger, WomString, WomValue} + +object GpuResource { + + final case class GpuType(name: String) { + override def toString: String = name + } + + object GpuType { + val NVIDIATeslaP100 = GpuType("nvidia-tesla-p100") + val NVIDIATeslaK80 = GpuType("nvidia-tesla-k80") + + val DefaultGpuType: GpuType = NVIDIATeslaK80 + val DefaultGpuCount: Int Refined Positive = refineMV[Positive](1) + val MoreDetailsURL = "https://cloud.google.com/compute/docs/gpus/" + } +} + +final case class GpuResource(gpuType: GpuType, gpuCount: Int Refined Positive) + +final case class GcpBatchRuntimeAttributes( + cpu: Int Refined Positive, + cpuPlatform: Option[String], + gpuResource: Option[GpuResource], + zones: Vector[String], + preemptible: Int, + bootDiskSize: Int, + memory: MemorySize, + disks: Seq[GcpBatchAttachedDisk], + dockerImage: String, + failOnStderr: Boolean, + continueOnReturnCode: ContinueOnReturnCode, + noAddress: Boolean, + useDockerImageCache: Option[Boolean], + checkpointFilename: Option[String]) + +object GcpBatchRuntimeAttributes { + + val ZonesKey = "zones" + private val ZonesDefaultValue = WomString("us-central1-b") + + val PreemptibleKey = "preemptible" + private val preemptibleValidationInstance = new IntRuntimeAttributesValidation(PreemptibleKey) + private val PreemptibleDefaultValue = WomInteger(0) + + val BootDiskSizeKey = "bootDiskSizeGb" + private val bootDiskValidationInstance = new IntRuntimeAttributesValidation(BootDiskSizeKey) + private val BootDiskDefaultValue = WomInteger(10) + + val NoAddressKey = "noAddress" + private val noAddressValidationInstance = new BooleanRuntimeAttributesValidation(NoAddressKey) + private val NoAddressDefaultValue = WomBoolean(false) + + val DisksKey = "disks" + private val DisksDefaultValue = WomString(s"${GcpBatchWorkingDisk.Name} 10 SSD") + + val CpuPlatformKey = "cpuPlatform" + private val cpuPlatformValidationInstance = new StringRuntimeAttributesValidation(CpuPlatformKey) + .optional + // via `gcloud compute zones describe us-central1-a` + val CpuPlatformIntelCascadeLakeValue = "Intel Cascade Lake" + val CpuPlatformAMDRomeValue = "AMD Rome" + + private def cpuMinValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Int Refined Positive] = CpuValidation + .instanceMin + .withDefault(CpuValidation.configDefaultWomValue(runtimeConfig) getOrElse CpuValidation.defaultMin) + + val UseDockerImageCacheKey = "useDockerImageCache" + private val useDockerImageCacheValidationInstance = new BooleanRuntimeAttributesValidation(UseDockerImageCacheKey) + .optional + + val CheckpointFileKey = "checkpointFile" + private val checkpointFileValidationInstance = new StringRuntimeAttributesValidation(CheckpointFileKey).optional + + private val MemoryDefaultValue = "2048 MB" + + private def cpuValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Int Refined Positive] = CpuValidation + .instance + .withDefault(CpuValidation + .configDefaultWomValue(runtimeConfig) getOrElse CpuValidation + 
.defaultMin) + private def cpuPlatformValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] = cpuPlatformValidationInstance + private def gpuTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[GpuType] = GpuTypeValidation.optional + + private def gpuCountValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optional + private def gpuMinValidation(runtimeConfig: Option[Config]):OptionalRuntimeAttributesValidation[Int Refined Positive] = GpuValidation.optionalMin + + private val dockerValidation: RuntimeAttributesValidation[String] = DockerValidation.instance + + private def failOnStderrValidation(runtimeConfig: Option[Config]) = FailOnStderrValidation.default(runtimeConfig) + + private def continueOnReturnCodeValidation(runtimeConfig: Option[Config]) = ContinueOnReturnCodeValidation.default(runtimeConfig) + + private def disksValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Seq[GcpBatchAttachedDisk]] = DisksValidation + .withDefault(DisksValidation.configDefaultWomValue(runtimeConfig) getOrElse DisksDefaultValue) + + private def zonesValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Vector[String]] = ZonesValidation + .withDefault(ZonesValidation + .configDefaultWomValue(runtimeConfig) getOrElse ZonesDefaultValue) + + private def preemptibleValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Int] = preemptibleValidationInstance + .withDefault(preemptibleValidationInstance.configDefaultWomValue(runtimeConfig) getOrElse PreemptibleDefaultValue) + + private def memoryValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[MemorySize] = { + MemoryValidation.withDefaultMemory( + RuntimeAttributesKeys.MemoryKey, + MemoryValidation.configDefaultString(RuntimeAttributesKeys.MemoryKey, runtimeConfig) getOrElse MemoryDefaultValue) + } + + private def memoryMinValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[MemorySize] = { + MemoryValidation.withDefaultMemory( + RuntimeAttributesKeys.MemoryMinKey, + MemoryValidation + .configDefaultString(RuntimeAttributesKeys.MemoryMinKey, runtimeConfig) getOrElse MemoryDefaultValue) + } + + + private def bootDiskSizeValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Int] = bootDiskValidationInstance + .withDefault(bootDiskValidationInstance.configDefaultWomValue(runtimeConfig) getOrElse BootDiskDefaultValue) + + private def noAddressValidation(runtimeConfig: Option[Config]): RuntimeAttributesValidation[Boolean] = noAddressValidationInstance + .withDefault(noAddressValidationInstance + .configDefaultWomValue(runtimeConfig) getOrElse NoAddressDefaultValue) + + private def useDockerImageCacheValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[Boolean] = + useDockerImageCacheValidationInstance + + private val outDirMinValidation: OptionalRuntimeAttributesValidation[MemorySize] = { + InformationValidation.optional(RuntimeAttributesKeys.OutDirMinKey, MemoryUnit.MB, allowZero = true) + } + + private val tmpDirMinValidation: OptionalRuntimeAttributesValidation[MemorySize] = { + InformationValidation.optional(RuntimeAttributesKeys.TmpDirMinKey, MemoryUnit.MB, allowZero = true) + } + + private val inputDirMinValidation: OptionalRuntimeAttributesValidation[MemorySize] = { + InformationValidation.optional(RuntimeAttributesKeys.DnaNexusInputDirMinKey, MemoryUnit.MB, allowZero = true) + } + + def 
runtimeAttributesBuilder(batchConfiguration: GcpBatchConfiguration): StandardValidatedRuntimeAttributesBuilder = { + val runtimeConfig = batchConfiguration.runtimeConfig + StandardValidatedRuntimeAttributesBuilder.default(runtimeConfig).withValidation( + gpuCountValidation(runtimeConfig), + gpuTypeValidation(runtimeConfig), + cpuValidation(runtimeConfig), + cpuPlatformValidation(runtimeConfig), + cpuMinValidation(runtimeConfig), + gpuMinValidation(runtimeConfig), + disksValidation(runtimeConfig), + noAddressValidation(runtimeConfig), + zonesValidation(runtimeConfig), + preemptibleValidation(runtimeConfig), + memoryValidation(runtimeConfig), + memoryMinValidation(runtimeConfig), + bootDiskSizeValidation(runtimeConfig), + useDockerImageCacheValidation(runtimeConfig), + checkpointFileValidationInstance, + dockerValidation, + outDirMinValidation, + tmpDirMinValidation, + inputDirMinValidation + ) + } + + def apply(validatedRuntimeAttributes: ValidatedRuntimeAttributes, runtimeAttrsConfig: Option[Config]): GcpBatchRuntimeAttributes = { + val cpu: Int Refined Positive = RuntimeAttributesValidation.extract(cpuValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val cpuPlatform: Option[String] = RuntimeAttributesValidation.extractOption(cpuPlatformValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) + val checkpointFileName: Option[String] = RuntimeAttributesValidation.extractOption(checkpointFileValidationInstance.key, validatedRuntimeAttributes) + + //GPU + lazy val gpuType: Option[GpuType] = RuntimeAttributesValidation + .extractOption(gpuTypeValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) + lazy val gpuCount: Option[Int Refined Positive] = RuntimeAttributesValidation + .extractOption(gpuCountValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) + + val gpuResource: Option[GpuResource] = if (gpuType.isDefined || gpuCount.isDefined) { + Option(GpuResource(gpuType.getOrElse(GpuType.DefaultGpuType), gpuCount + .getOrElse(GpuType.DefaultGpuCount))) + } else { + None + } + + + val docker: String = RuntimeAttributesValidation.extract(dockerValidation, validatedRuntimeAttributes) + val failOnStderr: Boolean = RuntimeAttributesValidation.extract(failOnStderrValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val continueOnReturnCode: ContinueOnReturnCode = RuntimeAttributesValidation.extract(continueOnReturnCodeValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val noAddress: Boolean = RuntimeAttributesValidation.extract(noAddressValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val zones: Vector[String] = RuntimeAttributesValidation.extract(ZonesValidation, validatedRuntimeAttributes) + val preemptible: Int = RuntimeAttributesValidation.extract(preemptibleValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val bootDiskSize: Int = RuntimeAttributesValidation.extract(bootDiskSizeValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val memory: MemorySize = RuntimeAttributesValidation.extract(memoryValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val disks: Seq[GcpBatchAttachedDisk] = RuntimeAttributesValidation.extract(disksValidation(runtimeAttrsConfig), validatedRuntimeAttributes) + val useDockerImageCache: Option[Boolean] = RuntimeAttributesValidation.extractOption(useDockerImageCacheValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes) + + val outDirMin: Option[MemorySize] = RuntimeAttributesValidation + .extractOption(outDirMinValidation.key, validatedRuntimeAttributes) + val 
tmpDirMin: Option[MemorySize] = RuntimeAttributesValidation + .extractOption(tmpDirMinValidation.key, validatedRuntimeAttributes) + val inputDirMin: Option[MemorySize] = RuntimeAttributesValidation + .extractOption(inputDirMinValidation.key, validatedRuntimeAttributes) + + + val totalExecutionDiskSizeBytes = List(inputDirMin.map(_.bytes), outDirMin.map(_.bytes), tmpDirMin.map(_.bytes)) + .flatten.fold(MemorySize(0, MemoryUnit.Bytes).bytes)(_ + _) + val totalExecutionDiskSize = MemorySize(totalExecutionDiskSizeBytes, MemoryUnit.Bytes) + + val adjustedDisks = disks.adjustWorkingDiskWithNewMin(totalExecutionDiskSize, ()) + + new GcpBatchRuntimeAttributes( + cpu, + cpuPlatform, + gpuResource, + zones, + preemptible, + bootDiskSize, + memory, + adjustedDisks, + docker, + failOnStderr, + continueOnReturnCode, + noAddress, + useDockerImageCache, + checkpointFileName) + } + + +} + +object ZonesValidation extends RuntimeAttributesValidation[Vector[String]] { + override def key: String = GcpBatchRuntimeAttributes.ZonesKey + + override def coercion: Iterable[WomType] = Set(WomStringType, WomArrayType(WomStringType)) + + override protected def validateValue: PartialFunction[WomValue, ErrorOr[Vector[String]]] = { + case WomString(s) => s.split("\\s+").toVector.validNel + case WomArray(womType, value) if womType.memberType == WomStringType => + value.map(_.valueString).toVector.validNel + } + + override protected def missingValueMessage: String = + s"Expecting $key runtime attribute to be either a whitespace separated String or an Array[String]" +} + +object DisksValidation extends RuntimeAttributesValidation[Seq[GcpBatchAttachedDisk]] { + override def key: String = GcpBatchRuntimeAttributes.DisksKey + + override def coercion: Iterable[WomType] = Set(WomStringType, WomArrayType(WomStringType)) + + override protected def validateValue: PartialFunction[WomValue, ErrorOr[Seq[GcpBatchAttachedDisk]]] = { + case WomString(value) => validateLocalDisks(value.split(",\\s*").toSeq) + case WomArray(womType, values) if womType.memberType == WomStringType => + validateLocalDisks(values.map(_.valueString)) + } + + private def validateLocalDisks(disks: Seq[String]): ErrorOr[Seq[GcpBatchAttachedDisk]] = { + val diskNels: ErrorOr[Seq[GcpBatchAttachedDisk]] = disks.toList.traverse[ErrorOr, GcpBatchAttachedDisk](validateLocalDisk) + val defaulted: ErrorOr[Seq[GcpBatchAttachedDisk]] = addDefault(diskNels) + defaulted + } + + private def validateLocalDisk(disk: String): ErrorOr[GcpBatchAttachedDisk] = { + GcpBatchAttachedDisk.parse(disk) match { + case scala.util.Success(attachedDisk) => attachedDisk.validNel + case scala.util.Failure(ex) => ex.getMessage.invalidNel + } + } + + private def addDefault(disksNel: ErrorOr[Seq[GcpBatchAttachedDisk]]): ErrorOr[Seq[GcpBatchAttachedDisk]] = { + disksNel map { + case disks if disks.exists(_.name == GcpBatchWorkingDisk.Name) => disks + case disks => disks :+ GcpBatchWorkingDisk.Default + } + } + + override protected def missingValueMessage: String = + s"Expecting $key runtime attribute to be a comma separated String or Array[String]" +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchWorkflowPaths.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchWorkflowPaths.scala new file mode 100644 index 00000000000..fcb80241b21 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchWorkflowPaths.scala @@ -0,0 +1,86 @@ +package 
cromwell.backend.google.batch.models + +import com.google.api.gax.retrying.RetrySettings +import com.google.auth.Credentials +import com.typesafe.config.Config +import cromwell.backend.google.batch.models.GcpBatchWorkflowPaths.callCachePathPrefixFromExecutionRoot +import cromwell.backend.google.batch.runnable.WorkflowOptionKeys +import cromwell.backend.io.WorkflowPaths +import cromwell.backend.{BackendJobDescriptorKey, BackendWorkflowDescriptor} +import cromwell.cloudsupport.gcp.gcs.GcsStorage +import cromwell.core.WorkflowOptions +import cromwell.core.path.Path +import cromwell.core.path.PathFactory.PathBuilders +import cromwell.filesystems.gcs.GcsPathBuilder + +import scala.concurrent.ExecutionContext +import scala.language.postfixOps + +object GcpBatchWorkflowPaths { + val GcsRootOptionKey = "gcp_batch_gcs_root" + private val AuthFilePathOptionKey = "auth_bucket" + private val GcsPrefix = "gs://" + + private def callCachePathPrefixFromExecutionRoot(executionRoot: String): String = { + // If the root looks like gs://bucket/stuff-under-bucket this should return gs://bucket + GcsPrefix + executionRoot.substring(GcsPrefix.length).takeWhile(_ != '/') + } +} +case class GcpBatchWorkflowPaths(workflowDescriptor: BackendWorkflowDescriptor, + gcsCredentials: Credentials, + genomicsCredentials: Credentials, + gcpBatchConfiguration: GcpBatchConfiguration, + override val pathBuilders: PathBuilders, + // This allows for the adjustment of the standard stream file names in PAPI v1 to match the + // combined controller + job standard output and error files. PAPI v1 controls the periodic + // delocalization of these files so the metadata Cromwell publishes for these files needs + // to match the PAPI v1 names. + standardStreamNameToFileNameMetadataMapper: (GcpBatchJobPaths, String) => String)(implicit ec: ExecutionContext) extends WorkflowPaths { + + override lazy val executionRootString: String = workflowDescriptor.workflowOptions.getOrElse(GcpBatchWorkflowPaths.GcsRootOptionKey, gcpBatchConfiguration.root) + override lazy val callCacheRootPrefix: Option[String] = Option(callCachePathPrefixFromExecutionRoot(executionRootString)) + + private val workflowOptions: WorkflowOptions = workflowDescriptor.workflowOptions + + val gcsAuthFilePath: Path = { + // The default auth file bucket is always at the root of the root workflow + val defaultBucket = executionRoot.resolve(workflowDescriptor.rootWorkflow.name) + .resolve(workflowDescriptor.rootWorkflowId.toString) + val bucket = workflowDescriptor.workflowOptions + .get(GcpBatchWorkflowPaths.AuthFilePathOptionKey) getOrElse defaultBucket + .pathAsString + + /* + * This is an "exception". 
The filesystem used here is built from genomicsAuth + * unlike everywhere else where the filesystem used is built from gcsFileSystemAuth + */ + val pathBuilderWithGenomicsAuth = GcsPathBuilder.fromCredentials( + genomicsCredentials, + gcpBatchConfiguration.googleConfig.applicationName, + RetrySettings.newBuilder().build(), + GcsStorage.DefaultCloudStorageConfiguration, + workflowOptions, + Option(gcpBatchConfiguration.batchAttributes.project) + ) + + val authBucket = pathBuilderWithGenomicsAuth.build(bucket) recover { + case ex => throw new Exception(s"Invalid gcs auth_bucket path $bucket", ex) + } get + + authBucket.resolve(s"${workflowDescriptor.rootWorkflowId}_auth.json") + } + + val monitoringScriptPath: Option[Path] = workflowOptions.get(WorkflowOptionKeys.MonitoringScript) + .toOption map { path => + // Fail here if the path exists but can't be built + getPath(path).get + } + override def toJobPaths(workflowPaths: WorkflowPaths, jobKey: BackendJobDescriptorKey): GcpBatchJobPaths = { + new GcpBatchJobPaths(workflowPaths.asInstanceOf[GcpBatchWorkflowPaths], jobKey) + } + override protected def withDescriptor(workflowDescriptor: BackendWorkflowDescriptor): WorkflowPaths = this.copy(workflowDescriptor = workflowDescriptor) + override def config: Config = gcpBatchConfiguration.configurationDescriptor.backendConfig +} + + + diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpLabel.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpLabel.scala new file mode 100644 index 00000000000..49e2941bf59 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpLabel.scala @@ -0,0 +1,102 @@ +package cromwell.backend.google.batch.models + +import cats.data.Validated.{Invalid, Valid} +import cats.instances.list._ +import cats.syntax.apply._ +import cats.syntax.traverse._ +import cats.syntax.validated._ +import common.exception.AggregatedMessageException +import common.validation.ErrorOr.ErrorOr +import cromwell.core.{CromwellFatalExceptionMarker, WorkflowOptions} +import spray.json.{JsObject, JsString} + +import scala.util.control.NoStackTrace +import scala.util.{Failure, Success, Try} + +final case class GcpLabel(key: String, value: String) + +object GcpLabel { + + val MaxLabelLength = 63 + val GoogleLabelRegexPattern = "[a-z]([-a-z0-9]*[a-z0-9])?" + val GoogleLabelRegex = GoogleLabelRegexPattern.r + + // This function is used to coerce a string into one that meets the requirements for a label submission to Google Pipelines API. 
+ // See 'labels' in https://cloud.google.com/genomics/reference/rpc/google.genomics.v1alpha2#google.genomics.v1alpha2.RunPipelineArgs + def safeGoogleName(mainText: String, emptyAllowed: Boolean = false): String = { + + validateLabelRegex(mainText) match { + case Valid(labelText) => labelText + case invalid @ _ if mainText.equals("") && emptyAllowed => mainText + case invalid @ _ => + def appendSafe(current: String, nextChar: Char): String = { + nextChar match { + case c if c.isLetterOrDigit || c == '-' => current + c.toLower + case _ => current + '-' + } + } + + val foldResult = mainText.toCharArray.foldLeft("")(appendSafe) + + val startsValid = foldResult.headOption.exists(_.isLetter) + val endsValid = foldResult.lastOption.exists(_.isLetterOrDigit) + + val validStart = if (startsValid) foldResult else "x--" + foldResult + val validStartAndEnd = if (endsValid) validStart else validStart + "--x" + + val length = validStartAndEnd.length + val tooLong = length > MaxLabelLength + + if (tooLong) { + val middleSeparator = "---" + val subSectionLength = (MaxLabelLength - middleSeparator.length) / 2 + validStartAndEnd.substring(0, subSectionLength) + middleSeparator + validStartAndEnd.substring(length - subSectionLength, length) + } else { + validStartAndEnd + } + } + } + + def validateLabelRegex(s: String): ErrorOr[String] = { + (GoogleLabelRegex.pattern.matcher(s).matches, s.length <= MaxLabelLength) match { + case (true, true) => s.validNel + case (false, false) => s"Invalid label field: `$s` did not match regex '$GoogleLabelRegexPattern' and it is ${s.length} characters. The maximum is $MaxLabelLength.".invalidNel + case (false, _) => s"Invalid label field: `$s` did not match the regex '$GoogleLabelRegexPattern'".invalidNel + case (_, false) => s"Invalid label field: `$s` is ${s.length} characters. The maximum is $MaxLabelLength.".invalidNel + } + } + + + def safeLabels(values: (String, String)*): Seq[GcpLabel] = { + def safeGoogleLabel(kvp: (String, String)): GcpLabel = { + GcpLabel(safeGoogleName(kvp._1), safeGoogleName(kvp._2, emptyAllowed = true)) + } + values.map(safeGoogleLabel) + } + + def validateLabel(key: String, value: String): ErrorOr[GcpLabel] = { + (validateLabelRegex(key), validateLabelRegex(value)).mapN { (validKey, validValue) => GcpLabel(validKey, validValue) } + } + + def fromWorkflowOptions(workflowOptions: WorkflowOptions): Try[Seq[GcpLabel]] = { + + def extractGoogleLabelsFromJsObject(jsObject: JsObject): Try[Seq[GcpLabel]] = { + val asErrorOr = jsObject.fields.toList.traverse { + case (key: String, value: JsString) => GcpLabel.validateLabel(key, value.value) + case (key, other) => s"Bad label value type for '$key'. Expected simple string but got $other".invalidNel : ErrorOr[GcpLabel] + } + + asErrorOr match { + case Valid(value) => Success(value) + case Invalid(errors) => Failure(new AggregatedMessageException("Invalid 'google_labels' in workflow options", errors.toList) with CromwellFatalExceptionMarker with NoStackTrace) + } + } + + workflowOptions.toMap.get("google_labels") match { + case Some(obj: JsObject) => extractGoogleLabelsFromJsObject(obj) + case Some(other) => Failure(new Exception(s"Invalid 'google_labels' in workflow options. Must be a simple JSON object mapping string keys to string values. 
Got $other") with NoStackTrace with CromwellFatalExceptionMarker) + case None => Success(Seq.empty) + } + } + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ManifestFile.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ManifestFile.scala new file mode 100644 index 00000000000..9b8d1fcafc8 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ManifestFile.scala @@ -0,0 +1,3 @@ +package cromwell.backend.google.batch.models + +case class ManifestFile(imageIdentifier: String, diskSizeGb: Int, files: List[ReferenceFile]) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/PreviousRetryReasons.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/PreviousRetryReasons.scala new file mode 100644 index 00000000000..521dec4ee57 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/PreviousRetryReasons.scala @@ -0,0 +1,44 @@ +package cromwell.backend.google.batch.models + +import cats.syntax.apply._ +import cats.syntax.validated._ +import common.validation.ErrorOr.ErrorOr +import cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory.{preemptionCountKey, unexpectedRetryCountKey} +import cromwell.services.keyvalue.KeyValueServiceActor._ + +import scala.util.{Failure, Success, Try} + +case class PreviousRetryReasons(preempted: Int, unexpectedRetry: Int) + +object PreviousRetryReasons { + + def tryApply(prefetchedKvEntries: Map[String, KvResponse], attemptNumber: Int): ErrorOr[PreviousRetryReasons] = { + val validatedPreemptionCount = validatedKvResponse(prefetchedKvEntries.get(preemptionCountKey), preemptionCountKey) + val validatedUnexpectedRetryCount = validatedKvResponse(prefetchedKvEntries.get(unexpectedRetryCountKey), unexpectedRetryCountKey) + + (validatedPreemptionCount, validatedUnexpectedRetryCount) mapN { PreviousRetryReasons.apply } + } + + def apply(knownPreemptedCount: Int, knownUnexpectedRetryCount: Int, attempt: Int): PreviousRetryReasons = { + // If we have anything unaccounted for, we can top up the unexpected retry count. + // NB: 'attempt' is 1-indexed, so, magic number: + // NB2: for sanity's sake, I won't let this unaccounted for drop below 0, just in case... 
+ val unaccountedFor = Math.max(attempt - 1 - knownPreemptedCount - knownUnexpectedRetryCount, 0) + PreviousRetryReasons(knownPreemptedCount, knownUnexpectedRetryCount + unaccountedFor) + } + + private def validatedKvResponse(r: Option[KvResponse], fromKey: String): ErrorOr[Int] = r match { + case Some(KvPair(_, v)) => validatedInt(v, fromKey) + case Some(_: KvKeyLookupFailed) => 0.validNel + case Some(KvFailure(_, failure)) => s"Failed to get key $fromKey: ${failure.getMessage}".invalidNel + case Some(_: KvPutSuccess) => s"Programmer Error: Got a KvPutSuccess from a Get request...".invalidNel + case None => s"Programmer Error: Engine made no effort to prefetch $fromKey".invalidNel + } + + private def validatedInt(s: String, fromKey: String): ErrorOr[Int] = { + Try(s.toInt) match { + case Success(i) => i.validNel + case Failure(_) => s"Unexpected value found in the KV store: $fromKey='$s'".invalidNel + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ProjectLabels.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ProjectLabels.scala new file mode 100644 index 00000000000..88c8e1aab57 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ProjectLabels.scala @@ -0,0 +1,3 @@ +package cromwell.backend.google.batch.models + +final case class ProjectLabels(labels: Map[String, String]) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ReferenceFile.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ReferenceFile.scala new file mode 100644 index 00000000000..21e280d318f --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/ReferenceFile.scala @@ -0,0 +1,3 @@ +package cromwell.backend.google.batch.models + +case class ReferenceFile(path: String, crc32c: Long) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/Run.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/Run.scala new file mode 100644 index 00000000000..418d8ee2a35 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/Run.scala @@ -0,0 +1,5 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.standard.StandardAsyncJob + +case class Run(job: StandardAsyncJob) \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/RunStatus.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/RunStatus.scala new file mode 100644 index 00000000000..f5631ef6afa --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/RunStatus.scala @@ -0,0 +1,62 @@ +package cromwell.backend.google.batch.models + +import com.google.cloud.batch.v1.JobStatus +import cromwell.core.ExecutionEvent +import org.slf4j.{Logger, LoggerFactory} + +sealed trait RunStatus + +object RunStatus { + + private val log: Logger = LoggerFactory.getLogger(RunStatus.toString) + + def fromJobStatus(status: JobStatus.State): RunStatus = status match { + case JobStatus.State.QUEUED => + log.info("job queued") + Running + case JobStatus.State.SCHEDULED => + log.info("job scheduled") + Running + case JobStatus.State.RUNNING => + log.info("job running") + Running + case JobStatus.State.SUCCEEDED => + log.info("job succeeded")
+ Succeeded(List(ExecutionEvent("complete in GCP Batch"))) //update to more specific + case JobStatus.State.FAILED => + log.info("job failed") + Failed(List.empty) + case JobStatus.State.DELETION_IN_PROGRESS => + log.info("deletion in progress") + DeletionInProgress + case JobStatus.State.STATE_UNSPECIFIED => + log.info("state unspecified") + StateUnspecified + case JobStatus.State.UNRECOGNIZED => + log.info("state unrecognized") + Unrecognized + case _ => + log.info(s"job status not matched: $status") + Running + } + + + sealed trait TerminalRunStatus extends RunStatus { + def eventList: Seq[ExecutionEvent] + } + + sealed trait UnsuccessfulRunStatus extends TerminalRunStatus + + case object Running extends RunStatus + case object DeletionInProgress extends RunStatus + case object StateUnspecified extends RunStatus + case object Unrecognized extends RunStatus + + case class Succeeded(override val eventList: Seq[ExecutionEvent]) extends TerminalRunStatus { + override def toString = "Succeeded" + } + + final case class Failed(override val eventList: Seq[ExecutionEvent]) extends UnsuccessfulRunStatus { + override def toString = "Failed" + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/VpcAndSubnetworkProjectLabelValues.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/VpcAndSubnetworkProjectLabelValues.scala new file mode 100644 index 00000000000..a77bcd96d87 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/VpcAndSubnetworkProjectLabelValues.scala @@ -0,0 +1,30 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.google.batch.models.VpcAndSubnetworkProjectLabelValues._ + +final case class VpcAndSubnetworkProjectLabelValues(vpcName: String, subnetNameOpt: Option[String]) { + /** + * Returns a qualified network name replacing the string `\${projectId}` in the network name if found. + */ + def networkName(projectId: String): String = { + val networkNameTemplate = + if (vpcName.contains("/")) { + vpcName + } else { + s"projects/$ProjectIdToken/global/networks/$vpcName/" + } + + networkNameTemplate.replace(ProjectIdToken, projectId) + } + + /** + * Replaces the string `\${projectId}` in the subnet name if found. 
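+ * For example, a configured subnet template such as "projects/\${projectId}/regions/us-central1/subnetworks/my-subnet" (illustrative value) has the token substituted with the project id passed in at runtime.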
+ */ + def subnetNameOption(projectId: String): Option[String] = { + subnetNameOpt map { _.replace(ProjectIdToken, projectId) } + } +} + +object VpcAndSubnetworkProjectLabelValues { + private val ProjectIdToken = s"$${projectId}" +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/BatchInstrumentation.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/BatchInstrumentation.scala new file mode 100644 index 00000000000..a09cd9d5fed --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/BatchInstrumentation.scala @@ -0,0 +1,32 @@ +package cromwell.backend.google.batch.monitoring + +import cats.data.NonEmptyList +import cromwell.core.instrumentation.InstrumentationKeys._ +import cromwell.core.instrumentation.InstrumentationPrefixes._ +import cromwell.services.instrumentation.CromwellInstrumentation +import cromwell.services.instrumentation.CromwellInstrumentation._ + +object BatchInstrumentation { + private val BatchKey = NonEmptyList.of("batch") + private val BatchPollKey = BatchKey.concatNel("poll") + private val BatchRunKey = BatchKey.concatNel("run") + private val BatchAbortKey = BatchKey.concatNel("abort") + + private val BatchPollFailedKey = BatchPollKey.concatNel(FailureKey) + private val BatchRunFailedKey = BatchRunKey.concatNel(FailureKey) + private val BatchAbortFailedKey = BatchAbortKey.concatNel(FailureKey) +} + +trait BatchInstrumentation extends CromwellInstrumentation { + import BatchInstrumentation._ + + def pollSuccess() = increment(BatchPollKey.concatNel(SuccessKey), BackendPrefix) + def pollFailed() = increment(BatchPollFailedKey, BackendPrefix) + + def runSuccess() = increment(BatchRunKey.concatNel(SuccessKey), BackendPrefix) + def runFailed() = increment(BatchRunFailedKey, BackendPrefix) + + def abortSuccess() = increment(BatchAbortKey.concatNel(SuccessKey), BackendPrefix) + def abortFailed() = increment(BatchAbortFailedKey, BackendPrefix) + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/CheckpointingConfiguration.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/CheckpointingConfiguration.scala new file mode 100644 index 00000000000..ab04a4a71a9 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/CheckpointingConfiguration.scala @@ -0,0 +1,75 @@ +package cromwell.backend.google.batch.monitoring + +import cromwell.backend.BackendJobDescriptor +import cromwell.backend.io.WorkflowPaths +import cromwell.core.path.Path + +import scala.concurrent.duration.FiniteDuration + +final class CheckpointingConfiguration(jobDescriptor: BackendJobDescriptor, + workflowPaths: WorkflowPaths, + commandDirectory: Path, + checkpointInterval: FiniteDuration + ) { + def checkpointFileCloud(checkpointFileName: String): String = { + // The checkpoint file for ANY attempt always goes in the "attempt 1" directory. That way we guarantee that + // every attempt is able to recover from the single source of checkpointing truth.
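+ // For example, attempt 3 of a call still reads and writes "<call execution root>/__checkpointing/<checkpointFileName>", the same location attempt 1 used.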
+ workflowPaths.toJobPaths(jobDescriptor.key.copy(attempt = 1), jobDescriptor.workflowDescriptor) + .callExecutionRoot.resolve("__checkpointing").resolve(checkpointFileName).toAbsolutePath.pathAsString + } + def tmpCheckpointFileCloud(checkpointFileName: String): String = checkpointFileCloud(checkpointFileName) + "-tmp" + + def checkpointFileLocal(checkpointFileName: String): String = { + commandDirectory.resolve(checkpointFileName).toAbsolutePath.pathAsString + } + def tmpCheckpointFileLocal(checkpointFileName: String): String = checkpointFileLocal(checkpointFileName) + "-tmp" + + def localizePreviousCheckpointCommand(checkpointFileName: String): String = { + val local = checkpointFileLocal(checkpointFileName) + val cloud = checkpointFileCloud(checkpointFileName) + + s"gsutil cp $cloud $local || touch $local" + } + + def checkpointingCommand(checkpointFilename: String, multilineActionSquasher: String => String): List[String] = { + val local = checkpointFileLocal(checkpointFilename) + val localTmp = tmpCheckpointFileLocal(checkpointFilename) + val cloud = checkpointFileCloud(checkpointFilename) + val cloudTmp = tmpCheckpointFileCloud(checkpointFilename) + + val checkpointUploadScript = + s"""touch $local + |while true + |do + | # Attempt to make a local copy of the checkpoint file + | echo "CHECKPOINTING: Making local copy of $local" + | COPY_SUCCESS="false" + | while [ "$$COPY_SUCCESS" != "true" ] + | do + | PRE_COPY_TIMESTAMP="$$(stat -c'%Z' $local)" + | cp $local $localTmp + | if [ "$$PRE_COPY_TIMESTAMP" == "$$(stat -c'%Z' $local)" ] + | then + | COPY_SUCCESS="true" + | else + | echo "CHECKPOINTING: $local was modified while trying to make a local copy. Will retry in 10s..." + | sleep 10 + | fi + | done + | + | # Perform the upload: + | echo "CHECKPOINTING: Uploading new checkpoint content" + | gsutil -m mv $localTmp $cloudTmp + | echo "CHECKPOINTING: Replacing cloud checkpoint file with new content" + | gsutil -m mv $cloudTmp $cloud + | echo "CHECKPOINTING: Sleeping for ${checkpointInterval.toString} before next checkpoint" + | sleep ${checkpointInterval.toSeconds} + |done""".stripMargin + + List( + "/bin/bash", + "-c", + multilineActionSquasher(checkpointUploadScript) + ) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/Env.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/Env.scala new file mode 100644 index 00000000000..df0cb3b070c --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/Env.scala @@ -0,0 +1,24 @@ +package cromwell.backend.google.batch.monitoring + +import cromwell.backend.BackendJobDescriptor + +object Env { + /** + * Name of an environmental variable + */ + val WorkflowId = "WORKFLOW_ID" + val TaskCallName = "TASK_CALL_NAME" + val TaskCallIndex = "TASK_CALL_INDEX" + val TaskCallAttempt = "TASK_CALL_ATTEMPT" + val DiskMounts = "DISK_MOUNTS" + + def monitoringImageEnvironment(jobDescriptor: BackendJobDescriptor) + (mountPaths: List[String]): Map[String, String] = + Map( + Env.WorkflowId -> jobDescriptor.workflowDescriptor.id.toString, + Env.TaskCallName -> jobDescriptor.taskCall.localName, + Env.TaskCallIndex -> jobDescriptor.key.index.map(_.toString).getOrElse("NA"), + Env.TaskCallAttempt -> jobDescriptor.key.attempt.toString, + Env.DiskMounts -> mountPaths.mkString(" "), + ) +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/MonitoringImage.scala 
b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/MonitoringImage.scala new file mode 100644 index 00000000000..578081dc46f --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/monitoring/MonitoringImage.scala @@ -0,0 +1,47 @@ +package cromwell.backend.google.batch.monitoring + +import cromwell.backend.BackendJobDescriptor +import cromwell.backend.google.batch.io.GcpBatchAttachedDisk +import cromwell.backend.google.batch.runnable.WorkflowOptionKeys +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.MountsToEnv +import cromwell.backend.io.WorkflowPaths +import cromwell.core.WorkflowOptions +import cromwell.core.path.{Path, PathFactory} + +final class MonitoringImage(jobDescriptor: BackendJobDescriptor, + workflowOptions: WorkflowOptions, + workflowPaths: WorkflowPaths, + commandDirectory: Path, + workingDisk: GcpBatchAttachedDisk, + localMonitoringImageScriptPath: Path, + ) { + + val monitoringImageOption: Option[String] = workflowOptions.get(WorkflowOptionKeys.MonitoringImage).toOption + + val monitoringImageScriptContainerPath: Path = workingDisk.mountPoint.resolve(localMonitoringImageScriptPath) + + val monitoringImageScriptOption: Option[Path] = + for { + _ <- monitoringImageOption // Only use the monitoring_image_script when monitoring_image provided + monitoringImageScript <- workflowOptions.get(WorkflowOptionKeys.MonitoringImageScript).toOption + } yield { + PathFactory.buildPath( + monitoringImageScript, + workflowPaths.pathBuilders, + ) + } + + val monitoringImageCommand: List[String] = + monitoringImageScriptOption match { + case Some(_) => List( + "/bin/sh", + "-c", + s"cd '${commandDirectory.pathAsString}' && " + + s"chmod +x '${monitoringImageScriptContainerPath.pathAsString}' && " + + s"'${monitoringImageScriptContainerPath.pathAsString}'" + ) + case None => Nil + } + + val monitoringImageEnvironment: MountsToEnv = Env.monitoringImageEnvironment(jobDescriptor) +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/CheckpointingRunnable.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/CheckpointingRunnable.scala new file mode 100644 index 00000000000..5ad8d2b21f9 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/CheckpointingRunnable.scala @@ -0,0 +1,49 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters + +trait CheckpointingRunnable { + def checkpointingSetupRunnables(createParameters: CreateBatchJobParameters, + volumes: List[Volume] + ): List[Runnable] = { + val result = createParameters.runtimeAttributes.checkpointFilename.map { checkpointFilename => + val checkpointingImage = RunnableUtils.CloudSdkImage + val checkpointingCommand = createParameters.checkpointingConfiguration.checkpointingCommand(checkpointFilename, RunnableCommands.multiLineBinBashCommand) + val checkpointingEnvironment = Map.empty[String, String] + + // Initial sync from cloud: + val initialCheckpointSyncRunnable = RunnableBuilder.cloudSdkShellRunnable( + createParameters.checkpointingConfiguration.localizePreviousCheckpointCommand(checkpointFilename) + )(volumes = volumes, flags = List.empty, labels = Map.empty) + val describeInitialCheckpointingSyncRunnable = RunnableBuilder.describeDocker("initial checkpointing sync", 
initialCheckpointSyncRunnable) + + // Background upload runnable: + val backgroundCheckpointingRunnable = RunnableBuilder.backgroundRunnable( + image = checkpointingImage, + command = checkpointingCommand, + environment = checkpointingEnvironment, + volumes = volumes + ) + val describeBackgroundCheckpointingRunnable = RunnableBuilder.describeDocker("begin checkpointing background runnable", backgroundCheckpointingRunnable) + + List(describeInitialCheckpointingSyncRunnable, initialCheckpointSyncRunnable, describeBackgroundCheckpointingRunnable, backgroundCheckpointingRunnable) + }.getOrElse(Nil) + + result.map(_.build) + } + + def checkpointingShutdownRunnables(createParameters: CreateBatchJobParameters, volumes: List[Volume]): List[Runnable] = { + val result = createParameters.runtimeAttributes.checkpointFilename.map { checkpointFilename => + val terminationRunnable = RunnableBuilder.terminateBackgroundRunnablesRunnable() + val describeTerminationRunnable = RunnableBuilder.describeDocker("terminate checkpointing runnable", terminationRunnable) + + val deleteCheckpointRunnable = RunnableBuilder.gcsFileDeletionRunnable(createParameters.checkpointingConfiguration.checkpointFileCloud(checkpointFilename), volumes) + val deleteTmpCheckpointRunnable = RunnableBuilder.gcsFileDeletionRunnable(createParameters.checkpointingConfiguration.tmpCheckpointFileCloud(checkpointFilename), volumes) + + List(describeTerminationRunnable, terminationRunnable, deleteCheckpointRunnable, deleteTmpCheckpointRunnable) + }.getOrElse(Nil) + + result.map(_.build) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/ContainerSetup.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/ContainerSetup.scala new file mode 100644 index 00000000000..8c7badc37ba --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/ContainerSetup.scala @@ -0,0 +1,25 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import cromwell.backend.google.batch.io.GcpBatchWorkingDisk + +trait ContainerSetup { + import RunnableLabels._ + + def containerSetupRunnables(volumes: List[Volume]): List[Runnable] = { + val containerRoot = GcpBatchWorkingDisk.MountPoint.pathAsString + + // As opposed to V1, the container root does not have a 777 umask, which can cause issues for docker running as non root + // Run a first action to create the root and give it the right permissions + val containerRootSetup = RunnableBuilder + .cloudSdkShellRunnable(s"mkdir -p $containerRoot && chmod -R a+rwx $containerRoot")( + volumes = volumes, + labels = Map(Key.Tag -> Value.ContainerSetup), + flags = List.empty + ) + + RunnableBuilder + .annotateTimestampedRunnable("container setup", Value.ContainerSetup)(volumes, List(containerRootSetup)) + .map(_.build()) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Delocalization.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Delocalization.scala new file mode 100644 index 00000000000..f1b28598a28 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Delocalization.scala @@ -0,0 +1,116 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import common.util.StringUtil._ +import 
cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.GcpBatchJobPaths.GcsDelocalizationScriptName +import cromwell.backend.google.batch.util.GcpBatchParameterConversions._ +import cromwell.backend.google.batch.util.RuntimeOutputMapping +import cromwell.backend.google.batch.util.ToParameter.ops._ +import cromwell.core.path.Path +import wom.runtime.WomOutputRuntimeExtractor + +import java.util.UUID + +trait Delocalization { + + import RunnableBuilder._ + import RunnableCommands._ + import RunnableLabels._ + import RunnableUtils._ + + private def runtimeOutputExtractorRunnable(containerCallRoot: String, + outputFile: String, + womOutputRuntimeExtractor: WomOutputRuntimeExtractor): Runnable.Builder = { + val commands = List( + "-c", + // Create the directory where the fofn will be written + s"mkdir -p $$(dirname $outputFile) && " + + s"cd $containerCallRoot && " + + """echo "Runtime output files to be delocalized:" && """ + + s"${womOutputRuntimeExtractor.command} | tee $outputFile" + ) + + RunnableBuilder + .withImage(womOutputRuntimeExtractor.dockerImage.getOrElse(CloudSdkImage)) + .withCommand(commands: _*) + .withEntrypointCommand("/bin/bash") + .withLabels(Map(Key.Tag -> Value.Delocalization)) + } + + private def delocalizeRuntimeOutputsScript(fofnPath: String, workflowRoot: Path, cloudCallRoot: Path)(implicit gcsTransferConfiguration: GcsTransferConfiguration) = { + val gsutilCommand: String => String = { flag => + s"""rm -f $$HOME/.config/gcloud/gce && gsutil -m $flag cp -r $$line "${cloudCallRoot.pathAsString.ensureSlashed}$$gcs_path"""" + } + + def sedStripPrefix(prefix: String) = s"""sed -e "s/^${prefix.ensureSedEscaped}//"""" + + // See RuntimeOutputMapping.prefixFilters for more info on why this is needed + val prefixFilters = RuntimeOutputMapping + .prefixFilters(workflowRoot) + .map(sedStripPrefix) + .mkString(" | ") + + /* + * Delocalize all the files returned by the runtime output extractor + */ + s"""|#!/bin/bash + | + |set -x + | + |if [ -f $fofnPath ]; then + | while IFS= read line + | do + | gcs_path=$$(echo $$line | $prefixFilters) + | ( + | ${retry(recoverRequesterPaysError(cloudCallRoot)(gsutilCommand))} + | ) + | done <$fofnPath + |fi""".stripMargin + } + + private def delocalizeRuntimeOutputsRunnable(cloudCallRoot: Path, inputFile: String, workflowRoot: Path, volumes: List[Volume])(implicit gcsTransferConfiguration: GcsTransferConfiguration): Runnable.Builder = { + val command = multiLineCommand(delocalizeRuntimeOutputsScript(inputFile, workflowRoot, cloudCallRoot)) + RunnableBuilder.cloudSdkShellRunnable(command)(volumes = volumes, labels = Map(Key.Tag -> Value.Delocalization), flags = List.empty) + } + + def deLocalizeRunnables(createParameters: CreateBatchJobParameters, + volumes: List[Volume])(implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable] = { + val cloudCallRoot = createParameters.cloudCallRoot + val callExecutionContainerRoot = createParameters.commandScriptContainerPath.parent + + /* + * Ideally temporaryFofnForRuntimeOutputFiles should live somewhere other than the execution directory (we could mount another directory). + * However, because it runs after everything else there's no risk of polluting the task's results, and the random ID ensures we don't overwrite anything. + */ + val temporaryFofnDirectoryForRuntimeOutputFiles = 
callExecutionContainerRoot.pathAsString.ensureSlashed + UUID.randomUUID().toString.split("-")(0) + val temporaryFofnForRuntimeOutputFiles = temporaryFofnDirectoryForRuntimeOutputFiles + "/runtime_output_files.txt" + + val runtimeExtractionRunnables = createParameters.womOutputRuntimeExtractor.toList flatMap { extractor => + List ( + runtimeOutputExtractorRunnable(callExecutionContainerRoot.pathAsString, temporaryFofnForRuntimeOutputFiles, extractor), + delocalizeRuntimeOutputsRunnable(cloudCallRoot, temporaryFofnForRuntimeOutputFiles, createParameters.cloudWorkflowRoot, volumes) + ) + } + + val gcsDelocalizationContainerPath = createParameters.commandScriptContainerPath.sibling(GcsDelocalizationScriptName) + + val delocalizationLabel = Map(Key.Tag -> Value.Delocalization) + val runGcsDelocalizationScript = cloudSdkShellRunnable( + s"/bin/bash $gcsDelocalizationContainerPath")(volumes = volumes, labels = delocalizationLabel, flags = List.empty) + + val annotatedRunnables: List[Runnable.Builder] = runGcsDelocalizationScript :: + createParameters.outputParameters.flatMap(_.toRunnables(volumes)) ++ + runtimeExtractionRunnables + + // NOTE: papiv2 delocalizes logs from /google but such logs are not available on batch + // See: https://cloud.google.com/life-sciences/docs/reference/rpc/google.cloud.lifesciences.v2beta + val all = RunnableBuilder.annotateTimestampedRunnable("delocalization", Value.Delocalization)( + volumes, + annotatedRunnables + ) + + all.map(_.build) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/GcpBatchMetadataKeys.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/GcpBatchMetadataKeys.scala new file mode 100644 index 00000000000..f7c0302a98f --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/GcpBatchMetadataKeys.scala @@ -0,0 +1,11 @@ +package cromwell.backend.google.batch.runnable + +object GcpBatchMetadataKeys { + val GoogleProject = "gcpBatch:googleProject" + val ExecutionBucket = "gcpBatch:executionBucket" + val MonitoringScript = "gcpBatch:monitoringScript" + val MachineType = "gcpBatch:machineType" + val Zone = "gcpBatch:zone" + val InstanceName = "gcpBatch:instanceName" + val MonitoringLog = "monitoringLog" +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Localization.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Localization.scala new file mode 100644 index 00000000000..07fe3e848c4 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/Localization.scala @@ -0,0 +1,101 @@ +package cromwell.backend.google.batch.runnable + +import cloud.nio.impl.drs.DrsConfig +import com.google.cloud.batch.v1.{Runnable, Volume} +import com.typesafe.config.ConfigFactory +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.{GcpBatchFileInput, GcpBatchJobPaths} +import cromwell.backend.google.batch.util.GcpBatchParameterConversions._ +import cromwell.backend.google.batch.util.ToParameter.ops._ +import cromwell.core.path.Path +import cromwell.filesystems.drs.DrsPath + +trait Localization { + import GcpBatchJobPaths._ + import RunnableBuilder._ + import RunnableCommands._ + import RunnableLabels._ + + def 
localizeRunnables(createParameters: CreateBatchJobParameters, volumes: List[Volume]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable] = { + val localizationLabel = Map(Key.Tag -> Value.Localization) + + val gcsTransferLibraryContainerPath = createParameters.commandScriptContainerPath.sibling(GcsTransferLibraryName) + val localizeGcsTransferLibrary = cloudSdkShellRunnable(localizeFile( + cloudPath = createParameters.cloudCallRoot / GcsTransferLibraryName, + containerPath = gcsTransferLibraryContainerPath))(volumes = volumes, labels = localizationLabel, flags = List.empty) + + val gcsLocalizationContainerPath = createParameters.commandScriptContainerPath.sibling(GcsLocalizationScriptName) + val localizeGcsLocalizationScript = cloudSdkShellRunnable(localizeFile( + cloudPath = createParameters.cloudCallRoot / GcsLocalizationScriptName, + containerPath = gcsLocalizationContainerPath))(volumes = volumes, labels = localizationLabel, flags = List.empty) + + val gcsDelocalizationContainerPath = createParameters.commandScriptContainerPath.sibling(GcsDelocalizationScriptName) + val localizeGcsDelocalizationScript = cloudSdkShellRunnable(localizeFile( + cloudPath = createParameters.cloudCallRoot / GcsDelocalizationScriptName, + containerPath = gcsDelocalizationContainerPath))(volumes = volumes, labels = localizationLabel, flags = List.empty) + + val runGcsLocalizationScript = cloudSdkShellRunnable( + s"/bin/bash $gcsLocalizationContainerPath")(volumes = volumes, labels = localizationLabel, flags = List.empty) + + val drsInputs: List[DrsPath] = createParameters.inputOutputParameters.fileInputParameters.collect { + case GcpBatchFileInput(_, drsPath: DrsPath, _, _) => drsPath + } + + val drsLocalizationRunnable = if (drsInputs.nonEmpty) { + val drsLocalizationManifestContainerPath = createParameters.commandScriptContainerPath.sibling(DrsLocalizationManifestName) + val localizeDrsLocalizationManifest = cloudSdkShellRunnable(localizeFile( + cloudPath = createParameters.cloudCallRoot / DrsLocalizationManifestName, + containerPath = drsLocalizationManifestContainerPath))(volumes = volumes, labels = localizationLabel, flags = List.empty) + + // Requester pays project id is stored on each DrsPath, but will be the same for all DRS inputs to a + // particular workflow because it's determined by the Google project set in workflow options. + val requesterPaysProjectId: Option[String] = drsInputs.flatMap(_.requesterPaysProjectIdOption).headOption + val runDrsLocalization = Localization.drsRunnable(drsLocalizationManifestContainerPath, localizationLabel, requesterPaysProjectId) + List(localizeDrsLocalizationManifest, runDrsLocalization) + } else List[Runnable.Builder]() + + // Any "classic" PAPI v2 one-at-a-time localizations for non-GCS inputs. 
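+ // For example, an HTTP input gets its own curl runnable, while GCS and DRS inputs contribute nothing here because they are handled by the bulk localization script and DRS manifest above (see the ToParameter instances in GcpBatchParameterConversions).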
+ val singletonLocalizations = createParameters.inputOutputParameters.fileInputParameters.flatMap(_.toRunnables(volumes)) + + val localizations = + localizeGcsTransferLibrary :: + localizeGcsLocalizationScript :: runGcsLocalizationScript :: + drsLocalizationRunnable ::: + localizeGcsDelocalizationScript :: + singletonLocalizations + + RunnableBuilder + .annotateTimestampedRunnable("localization", Value.Localization)(volumes, localizations) + .map(_.build) + } +} + +object Localization { + + private lazy val config = ConfigFactory.load + + def drsRunnable(manifestPath: Path, + labels: Map[String, String], + requesterPaysProjectId: Option[String] + ): Runnable.Builder = { + import RunnableBuilder.EnhancedRunnableBuilder + + val drsResolverConfig = config.getConfig("filesystems.drs.global.config.resolver") + val drsConfig = DrsConfig.fromConfig(drsResolverConfig) + val drsDockerImage = config.getString("drs.localization.docker-image") + + val manifestArg = List("-m", manifestPath.pathAsString) + val requesterPaysArg = requesterPaysProjectId.map(r => List("-r", r)).getOrElse(List.empty) + val drsCommand = manifestArg ++ requesterPaysArg + + val drsResolverEnv = DrsConfig.toEnv(drsConfig) + + RunnableBuilder + .withImage(drsDockerImage) + .withCommand(drsCommand: _*) + .withEnvironment(drsResolverEnv) + .withLabels(labels) + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MemoryRetryCheckRunnable.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MemoryRetryCheckRunnable.scala new file mode 100644 index 00000000000..3c728b21775 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MemoryRetryCheckRunnable.scala @@ -0,0 +1,14 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters + +trait MemoryRetryCheckRunnable { + + def checkForMemoryRetryRunnables(createParameters: CreateBatchJobParameters, volumes: List[Volume]): List[Runnable] = { + createParameters.retryWithMoreMemoryKeys match { + case Some(keys) => List(RunnableBuilder.checkForMemoryRetryRunnable(keys, volumes)).map(_.build) + case None => List.empty[Runnable] + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MonitoringRunnable.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MonitoringRunnable.scala new file mode 100644 index 00000000000..b48918610b2 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/MonitoringRunnable.scala @@ -0,0 +1,66 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration + +trait MonitoringRunnable { + def monitoringSetupRunnables(createParameters: CreateBatchJobParameters, + volumes: List[Volume] + )(implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable] = { + + val monitoringImageScriptRunnables = + createParameters.monitoringImage.monitoringImageScriptOption match { + case Some(script) => + val localizeScriptRunnable = + RunnableBuilder.monitoringImageScriptRunnable( + script, + 
createParameters.monitoringImage.monitoringImageScriptContainerPath, + volumes + ) + val describeLocalizeScriptRunnable = + RunnableBuilder.describeDocker( + "localizing monitoring image script runnable", + localizeScriptRunnable, + ) + List(describeLocalizeScriptRunnable, localizeScriptRunnable) + case None => Nil + } + + val monitoringImageRunnables = + createParameters.monitoringImage.monitoringImageOption match { + case Some(image) => + + val monitoringImage = image + val monitoringImageCommand = createParameters.monitoringImage.monitoringImageCommand + val monitoringImageEnvironment = createParameters.monitoringImage.monitoringImageEnvironment + + val monitoringRunnable = RunnableBuilder.backgroundRunnable( + monitoringImage, + monitoringImageCommand, + monitoringImageEnvironment(volumes.map(_.getMountPath)), + volumes + ) + + val describeMonitoringRunnable = RunnableBuilder.describeDocker("monitoring runnable", monitoringRunnable) + + List(describeMonitoringRunnable, monitoringRunnable) + + case None => Nil + } + + (monitoringImageScriptRunnables ++ monitoringImageRunnables).map(_.build) + } + + def monitoringShutdownRunnables(createParameters: CreateBatchJobParameters): List[Runnable] = { + createParameters.monitoringImage.monitoringImageOption match { + case Some(_) => + val terminationRunnable = RunnableBuilder.terminateBackgroundRunnablesRunnable() + + val describeTerminationRunnable = RunnableBuilder.describeDocker("terminate monitoring runnable", terminationRunnable) + + List(describeTerminationRunnable, terminationRunnable).map(_.build) + case None => Nil + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala new file mode 100644 index 00000000000..c68e3dbdd29 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableBuilder.scala @@ -0,0 +1,318 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.Runnable.Container +import com.google.cloud.batch.v1.{Environment,Runnable, Volume} +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models.{BatchParameter, GcpBatchInput, GcpBatchOutput} +//import cromwell.backend.google.batch.runnable.RunnableLabels._ +import cromwell.core.path.Path +import mouse.all.anySyntaxMouse + +import scala.concurrent.duration.{Duration, DurationInt, FiniteDuration} +import scala.jdk.CollectionConverters._ + +/** + * Utility singleton to create high level batch runnables. + */ +object RunnableBuilder { + + import RunnableLabels._ + import RunnableUtils._ + + implicit class EnhancedRunnableBuilder(val builder: Runnable.Builder) extends AnyVal { + /** + * Only for use with docker images KNOWN to not have entrypoints already set, + * or used with accompanying call to setEntrypoint("non-empty-string"). + * + * Otherwise use the withEntrypointCommand() workaround below since the google issue listed in BA-6406 is not being + * fixed. 
+ */ + def withCommand(command: String*): Runnable.Builder = { + val container = builder.getContainerBuilder.addAllCommands(command.toList.asJava) + builder.setContainer(container) + } + + def withEntrypointCommand(command: String*): Runnable.Builder = { + builder + .setContainer( + builder.getContainerBuilder + .setEntrypoint(command.headOption.getOrElse("")) //set to blank string instead of null because batch does not support null + .addAllCommands( + command.drop(1).asJava + ) + ) + } + + def withFlags(flags: List[RunnableFlag]): Runnable.Builder = { + flags.foldLeft(builder) { + case (acc, RunnableFlag.IgnoreExitStatus) => acc.setIgnoreExitStatus(true) + case (acc, RunnableFlag.RunInBackground) => acc.setBackground(true) + case (acc, RunnableFlag.AlwaysRun) => acc.setAlwaysRun(true) + } + } + + def withEnvironment(environment: Map[String, String]): Runnable.Builder = { + val env = Environment.newBuilder.putAllVariables(environment.asJava) + builder.setEnvironment(env) + } + + + def withVolumes(volumes: List[Volume]): Runnable.Builder = { + val formattedVolumes = volumes.map { volume => + val mountPath = volume.getMountPath + val mountOptions = Option(volume.getMountOptionsList).map(_.asScala.toList).getOrElse(List.empty) + s"$mountPath:$mountPath:${mountOptions.mkString(",")}" + } + + builder.setContainer( + builder.getContainerBuilder.addAllVolumes(formattedVolumes.asJava) + ) + } + + def withLabels(labels: Map[String, String]): Runnable.Builder = builder.putAllLabels(labels.asJava) + + def withTimeout(timeout: Duration): Runnable.Builder = timeout match { + case _: FiniteDuration => + builder.setTimeout( + com.google.protobuf.Duration.newBuilder().setSeconds(timeout.toSeconds).build() + ) + + case _ => builder + } + + def withAlwaysRun(alwaysRun: Boolean): Runnable.Builder = builder.setAlwaysRun(alwaysRun) + + def withRunInBackground(runInBackground: Boolean): Runnable.Builder = builder.setBackground(runInBackground) + + def scalaLabels: Map[String, String] = { + val list = for { + keyValueList <- Option(builder.getLabelsMap).toList + keyValue <- keyValueList.asScala + } yield keyValue + list.toMap + } + } + + def withImage(image: String): Runnable.Builder = { + Runnable.newBuilder() + .setContainer(Container.newBuilder.setImageUri(image)) + } + + private def cloudSdkContainerBuilder: Container.Builder = { + Container.newBuilder.setImageUri(CloudSdkImage) + } + + def monitoringImageScriptRunnable(cloudPath: Path, containerPath: Path, volumes: List[Volume]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): Runnable.Builder = { + val command = RunnableCommands.localizeFile(cloudPath, containerPath) + val labels = Map(Key.Tag -> Value.Localization) + cloudSdkShellRunnable(command)(volumes = volumes, flags = List.empty, labels = labels) + } + + def backgroundRunnable(image: String, + command: List[String], + environment: Map[String, String], + volumes: List[Volume] + ): Runnable.Builder = { + withImage(image) + .withEntrypointCommand(command: _*) + .withRunInBackground(true) + .withVolumes(volumes) + .withEnvironment(environment) + .withFlags(List(RunnableFlag.RunInBackground, RunnableFlag.IgnoreExitStatus)) + .withLabels(Map(Key.Tag -> Value.Monitoring)) + } + + + def terminateBackgroundRunnablesRunnable(): Runnable.Builder = { + cloudSdkShellRunnable(terminateAllBackgroundRunnablesCommand)( + volumes = List.empty, + flags = List(RunnableFlag.AlwaysRun), + labels = Map(Key.Tag -> Value.Monitoring) + ) + } + + def gcsFileDeletionRunnable(cloudPath: String, volumes: 
List[Volume]): Runnable.Builder = { + cloudSdkShellRunnable( + s"""gsutil rm '$cloudPath'""" + )( + volumes = volumes, + flags = List(RunnableFlag.IgnoreExitStatus), + labels = Map(Key.Tag -> Value.Monitoring) + ) + } + + def userRunnable(docker: String, + scriptContainerPath: String, + jobShell: String, + volumes: List[Volume], + dockerhubCredentials: (String, String)): Runnable.Builder = { + + val container = (dockerhubCredentials._1, dockerhubCredentials._2) match { + case (username, password) if username.nonEmpty && password.nonEmpty => + Container.newBuilder + .setImageUri(docker) + .setEntrypoint(jobShell) + .addCommands(scriptContainerPath) + .setUsername(username) + .setPassword(password) + case _ => + Container.newBuilder + .setImageUri(docker) + .setEntrypoint(jobShell) + .addCommands(scriptContainerPath) + } + Runnable.newBuilder() + .setContainer(container) + .withVolumes(volumes) + .putLabels(Key.Tag, Value.UserRunnable) + } + + def checkForMemoryRetryRunnable(retryLookupKeys: List[String], volumes: List[Volume]): Runnable.Builder = { + cloudSdkShellRunnable(RunnableCommands.checkIfStderrContainsRetryKeys(retryLookupKeys))( + volumes = volumes, + flags = List(RunnableFlag.AlwaysRun), + labels = Map(Key.Tag -> Value.RetryWithMoreMemory) + ).withAlwaysRun(true) + } + + // Creates a Runnable that logs the docker command for the passed in runnable. + def describeDocker(description: String, runnable: Runnable.Builder): Runnable.Builder = { + logTimestampedRunnable( + s"Running $description: ${toDockerRun(runnable)}", + List.empty, + List.empty, + runnable.scalaLabels + ) + } + + private def timestampedMessage(message: String): String = + s"""printf '%s %s\\n' "$$(date -u '+%Y/%m/%d %H:%M:%S')" ${shellEscaped(message)}""" + + private def logTimestampedRunnable(message: String, volumes: List[Volume], flags: List[RunnableFlag], runnableLabels: Map[String, String]): Runnable.Builder = { + // Uses the cloudSdk image as that image will be used for other operations as well. + cloudSdkShellRunnable( + timestampedMessage(message) + )(volumes, flags, labels = runnableLabels collect { + case (key, value) if key == Key.Tag => Key.Logging -> value + case (key, value) => key -> value + }).withTimeout(timeout = 300.seconds) + } + + def cloudSdkRunnable: Runnable.Builder = Runnable.newBuilder.setContainer(cloudSdkContainerBuilder) + + def cloudSdkShellRunnable(shellCommand: String)( + volumes: List[Volume], + flags: List[RunnableFlag], + labels: Map[String, String], + timeout: Duration = Duration.Inf): Runnable.Builder = { + + Runnable.newBuilder.setContainer(cloudSdkContainerBuilder) + .withVolumes(volumes) + .withLabels(labels) + .withEntrypointCommand( + "/bin/sh", + "-c", + if (shellCommand.contains("\n")) shellCommand |> RunnableCommands.multiLineCommand else shellCommand + ) + .withFlags(flags) + .withTimeout(timeout) + } + + def annotateTimestampedRunnable(description: String, + loggingLabelValue: String, + isAlwaysRun: Boolean = false)( + volumes: List[Volume], + runnables: List[Runnable.Builder]): List[Runnable.Builder] = { + + val flags = if (isAlwaysRun) List(RunnableFlag.AlwaysRun) else List() + val labels = Map(Key.Logging -> loggingLabelValue) + val starting = logTimestampedRunnable(s"Starting $description.", volumes, flags, labels) + val done = logTimestampedRunnable(s"Done $description.", volumes, flags, labels) + List(starting) ++ runnables ++ List(done) + } + + /** + * Returns a set of labels for a parameter. + * + * @param parameter Input or output parameter to label. 
+ * @return The labels. + */ + def parameterLabels(parameter: BatchParameter): Map[String, String] = { + parameter match { + case _: GcpBatchInput => + Map( + Key.Tag -> Value.Localization, + Key.InputName -> parameter.name + ) + case _: GcpBatchOutput => + Map( + Key.Tag -> Value.Delocalization, + Key.OutputName -> parameter.name + ) + } + } + + /** Creates a Runnable that describes the parameter localization or delocalization. */ + def describeParameter(parameter: BatchParameter, volumes: List[Volume], labels: Map[String, String]): Runnable.Builder = { + parameter match { + case _: GcpBatchInput => + val message = "Localizing input %s -> %s".format( + shellEscaped(parameter.cloudPath), + shellEscaped(parameter.containerPath), + ) + logTimestampedRunnable(message, volumes, List.empty, labels) + case _: GcpBatchOutput => + val message = "Delocalizing output %s -> %s".format( + shellEscaped(parameter.containerPath), + shellEscaped(parameter.cloudPath), + ) + logTimestampedRunnable(message, volumes, List(RunnableFlag.AlwaysRun), labels) + } + } + + // Converts an Runnable to a `docker run ...` command runnable in the shell. + private[runnable] def toDockerRun(runnable: Runnable.Builder): String = { + runnable.getContainer + .getCommandsList + .asScala + .toList + .map { cmd => shellEscaped(cmd) } + .mkString(" ") + + val commandArgs: String = Option(runnable.getContainerBuilder.getCommandsList) match { + case Some(commands) => + commands.asScala map { + case command if Option(command).isDefined => s" ${shellEscaped(command)}" + case _ => "" + } mkString "" + case None => "" + } + + val entrypointArg: String = Option(runnable.getContainerBuilder.getEntrypoint).filter(_.nonEmpty) match { + case Some(entrypoint) => s" --entrypoint=${shellEscaped(entrypoint)}" + case None => "" + } + + val imageArg: String = Option(runnable.getContainerBuilder.getImageUri) match { + case None => " " + case Some(imageUri) => s" ${shellEscaped(imageUri)}" + } + + val mountArgs: String = Option(runnable.getContainerBuilder.getVolumesList) match { + case None => "" + case Some(volumes) => + volumes.asScala map { + case volume if Option(volume).isEmpty => "" + case volume => s" -v ${shellEscaped(volume).replaceAll(":r[o|w]", "")}" + } mkString "" + } + + List("docker run", + mountArgs, + entrypointArg, + imageArg, + commandArgs, + ).mkString + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableCommands.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableCommands.scala new file mode 100644 index 00000000000..0bbd1c50d53 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableCommands.scala @@ -0,0 +1,185 @@ +package cromwell.backend.google.batch.runnable + +import akka.http.scaladsl.model.ContentType +import common.util.StringUtil._ +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.core.path.Path +import cromwell.filesystems.gcs.GcsPath +import cromwell.filesystems.gcs.RequesterPaysErrors._ +import mouse.all._ +import org.apache.commons.codec.binary.Base64 +import org.apache.commons.text.StringEscapeUtils + +import java.nio.charset.StandardCharsets +import java.util.UUID +import scala.concurrent.duration._ + +object RunnableCommands { + + import RunnableUtils._ + + implicit val waitBetweenRetries: FiniteDuration = 5.seconds + + implicit class EnhancedCromwellPath(val path: Path) extends 
AnyVal { + def projectId: String = path match { + case gcs: GcsPath => gcs.projectId + case _ => "" + } + } + + implicit class ShellPath(val path: Path) extends AnyVal { + // The command String runs in Bourne shell so shell metacharacters in filenames must be escaped + def escape: String = StringEscapeUtils.escapeXSI(path.pathAsString) + } + + private def makeContentTypeFlag(contentType: Option[ContentType]) = contentType.map(ct => s"""-h "Content-Type: $ct"""").getOrElse("") + + def makeContainerDirectory(containerPath: Path) = s"mkdir -p ${containerPath.escape}" + + def delocalizeDirectory(containerPath: Path, cloudPath: Path, contentType: Option[ContentType]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + retry { + recoverRequesterPaysError(cloudPath) { flag => + s"rm -f $$HOME/.config/gcloud/gce && " + + s"gsutil $flag ${contentType |> makeContentTypeFlag} -m rsync -r ${containerPath.escape} ${cloudPath.escape}" + } + } + } + + /** + * As per https://cloud.google.com/storage/docs/gsutil/addlhelp/HowSubdirectoriesWork, rule #2 + * If one attempts a + * gsutil cp /local/file.txt gs://bucket/subdir/file.txt + * AND + * there exists a folder gs://bucket/subdir/file.txt_thisCouldBeAnything + * then gs://bucket/subdir/file.txt will be treated as a directory, and /local/file.txt will be copied under gs://bucket/subdir/file.txt/file.txt + * and not gs://bucket/subdir/file.txt. + * + * By instead using the parent directory (and ensuring it ends with a slash), gsutil will treat that as a directory and put the file under it. + * So the final gsutil command will look something like gsutil cp /local/file.txt gs://bucket/subdir/ + */ + def delocalizeFile(containerPath: Path, cloudPath: Path, contentType: Option[ContentType]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + retry { + recoverRequesterPaysError(cloudPath) { flag => + s"rm -f $$HOME/.config/gcloud/gce && " + + s"gsutil $flag ${contentType |> makeContentTypeFlag} cp ${containerPath.escape} ${cloudPath.parent.escape.ensureSlashed}" + } + } + } + + /** + * delocalizeFile necessarily copies the file to the same name. Use this if you want to to specify a name different from the original + * Make sure that there's no object named "yourfinalname_something" (see above) in the same cloud directory. + */ + def delocalizeFileTo(containerPath: Path, cloudPath: Path, contentType: Option[ContentType]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + retry { + recoverRequesterPaysError(cloudPath) { flag => + s"rm -f $$HOME/.config/gcloud/gce && " + + s"gsutil $flag ${contentType |> makeContentTypeFlag} cp ${containerPath.escape} ${cloudPath.escape}" + } + } + } + + def ifExist(containerPath: Path)(f: => String) = s"if [ -e ${containerPath.escape} ]; then $f; fi" + + def every(duration: FiniteDuration)(f: => String): String = + s"""while true; do + | ( + | $f + | ) > /dev/null 2>&1 + | sleep ${duration.toSeconds} + |done""".stripMargin + + def retry(f: => String)(implicit gcsTransferConfiguration: GcsTransferConfiguration, wait: FiniteDuration): String = { + s"""for i in $$(seq ${gcsTransferConfiguration.transferAttempts}); do + | ( + | $f + | ) + | RC=$$? 
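+ | # A zero exit code means the transfer succeeded, so stop retrying; otherwise wait and try again up to the configured number of attempts.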
+ | if [ "$$RC" = "0" ]; then + | break + | fi + | if [ $$i -lt ${gcsTransferConfiguration.transferAttempts} ]; then + | ${s"""Waiting ${wait.toSeconds} seconds and retrying""" |> timestampedMessage} + | sleep ${wait.toSeconds} + | fi + |done + |exit "$$RC"""".stripMargin + } + + def delocalizeFileOrDirectory(containerPath: Path, cloudPath: Path, contentType: Option[ContentType]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + s"""if [ -d ${containerPath.escape} ]; then + | ${delocalizeDirectory(containerPath, cloudPath, contentType)} + |else + | ${delocalizeFile(containerPath, cloudPath, contentType)} + |fi""".stripMargin + } + + def localizeDirectory(cloudPath: Path, containerPath: Path) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + retry { + recoverRequesterPaysError(cloudPath) { flag => + s"${containerPath |> makeContainerDirectory} && " + + s"rm -f $$HOME/.config/gcloud/gce && gsutil $flag -m rsync -r ${cloudPath.escape} ${containerPath.escape}" + } + } + } + + def localizeFile(cloudPath: Path, containerPath: Path) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): String = { + retry { + recoverRequesterPaysError(cloudPath) { flag => + s"rm -f $$HOME/.config/gcloud/gce && gsutil $flag cp ${cloudPath.escape} ${containerPath.escape}" + } + } + } + + def recoverRequesterPaysError(path: Path)(f: String => String): String = { + val commandWithoutProject = f("") + val commandWithProject = f(s"-u ${path.projectId}") + + s"""$commandWithoutProject > gsutil_output.txt 2>&1 + |# Record the exit code of the gsutil command without project flag + |RC_GSUTIL=$$? + |if [ "$$RC_GSUTIL" != "0" ]; then + | ${s"$commandWithoutProject failed" |> timestampedMessage} + | # Print the reason of the failure + | cat gsutil_output.txt + | + | # Check if it matches the BucketIsRequesterPaysErrorMessage + | if grep -q "$BucketIsRequesterPaysErrorMessage" gsutil_output.txt; then + | ${"Retrying with user project" |> timestampedMessage} + | $commandWithProject + | else + | exit "$$RC_GSUTIL" + | fi + |else + | exit 0 + |fi""".stripMargin + } + + def checkIfStderrContainsRetryKeys(retryLookupKeys: List[String]): String = { + val lookupKeysAsString = retryLookupKeys.mkString("|") + s"grep -E -q '$lookupKeysAsString' /cromwell_root/stderr ; echo $$? 
> /cromwell_root/memory_retry_rc" + } + + def multiLineCommandTransformer(shell: String)(commandString: String): String = { + val randomUuid = UUID.randomUUID().toString + val withBashShebang = s"#!/bin/bash\n\n$commandString" + val base64EncodedScript = Base64.encodeBase64String(withBashShebang.getBytes(StandardCharsets.UTF_8)) + val scriptPath = s"/tmp/$randomUuid.sh" + + s"""python3 -c 'import base64; print(base64.b64decode("$base64EncodedScript").decode("utf-8"));' """ + + s"""> $scriptPath && """ + + s"chmod u+x $scriptPath && " + + s"$shell $scriptPath" + } + + def multiLineCommand(commandString: String): String = multiLineCommandTransformer("sh")(commandString) + + def multiLineBinBashCommand(commandString: String): String = multiLineCommandTransformer("/bin/bash")(commandString) + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableFlag.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableFlag.scala new file mode 100644 index 00000000000..f2007c65226 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableFlag.scala @@ -0,0 +1,9 @@ +package cromwell.backend.google.batch.runnable + +sealed trait RunnableFlag + +object RunnableFlag { + case object IgnoreExitStatus extends RunnableFlag + case object RunInBackground extends RunnableFlag + case object AlwaysRun extends RunnableFlag +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableLabels.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableLabels.scala new file mode 100644 index 00000000000..7d9978c42c1 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableLabels.scala @@ -0,0 +1,22 @@ +package cromwell.backend.google.batch.runnable + +object RunnableLabels { + object Key { + /** + * Very short description of the runnable + */ + val Tag = "tag" + val Logging = "logging" + val InputName = "inputName" + val OutputName = "outputName" + } + object Value { + val ContainerSetup = "ContainerSetup" + val UserRunnable = "UserRunnable" + val Localization = "Localization" + val Delocalization = "Delocalization" + val Monitoring = "Monitoring" + val Background = "Background" + val RetryWithMoreMemory = "CheckingForMemoryRetry" + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableUtils.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableUtils.scala new file mode 100644 index 00000000000..7df2a4bff79 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/RunnableUtils.scala @@ -0,0 +1,83 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.Runnable +import com.typesafe.config.ConfigFactory +import net.ceedubs.ficus.Ficus._ +import org.apache.commons.text.StringEscapeUtils + +object RunnableUtils { + /** Image to use for ssh access. */ + val sshImage = "gcr.io/cloud-genomics-pipelines/tools" + + /** Entry point on the ssh image. */ + val sshEntryPoint = "ssh-server" + + /** Port mappings for the ssh container. 
*/ + val sshPortMappings = Map("22" -> Int.box(22)) + + /* + * At the moment, cloud-sdk (924MB for 276.0.0-slim) and stedolan/jq (182MB) decompressed ~= 1.1 GB + */ + val cromwellImagesSizeRoundedUpInGB = 1 + + private val config = ConfigFactory.load().getConfig("google") + + /** + * An image with the Google Cloud SDK installed. + * http://gcr.io/google.com/cloudsdktool/cloud-sdk + * + * FYI additional older versions are available on DockerHub at: + * https://hub.docker.com/r/google/cloud-sdk + * + * When updating this value, also consider updating the CromwellImagesSizeRoundedUpInGB below. + */ + val CloudSdkImage: String = + //config.getOrElse("cloud-sdk-image-url", "gcr.io/google.com/cloudsdktool/cloud-sdk:354.0.0-alpine") + config.getOrElse("cloud-sdk-image-url", "gcr.io/google.com/cloudsdktool/cloud-sdk:434.0.0-alpine") + /* + * At the moment, cloud-sdk (584MB for 354.0.0-alpine) and stedolan/jq (182MB) decompressed ~= 0.8 GB + */ + val CromwellImagesSizeRoundedUpInGB: Int = + config.getOrElse("cloud-sdk-image-size-gb", 1) + + /** Quotes a string such that it's compatible as a string argument in the shell. */ + def shellEscaped(any: Any): String = { + val str = String.valueOf(any) + /* + NOTE: escapeXSI is more compact than wrapping in single quotes. Newlines are also stripped by the shell, as they + are by escapeXSI. If for some reason escapeXSI doesn't 100% work, say because it ends up stripping some required + newlines, then consider adding a check for newlines and then using: + + "'" + str.replace("'", "'\"'\"'") + "'" + */ + StringEscapeUtils.escapeXSI(str) + } + + private val backgroundRunnableTerminationGraceTime = 10 + + val terminateAllBackgroundRunnablesCommand: String = s"kill -TERM -1 && sleep $backgroundRunnableTerminationGraceTime || true" + + def timestampedMessage(message: String): String = + s"""printf '%s %s\\n' "$$(date -u '+%Y/%m/%d %H:%M:%S')" ${shellEscaped(message)}""" + + /** Start background runnables first, leave the rest as is */ + def sortRunnables(containerSetup: List[Runnable], + localization: List[Runnable], + userRunnable: List[Runnable], + memoryRetryRunnable: List[Runnable], + deLocalization: List[Runnable], + monitoringSetup: List[Runnable], + monitoringShutdown: List[Runnable], + checkpointingStart: List[Runnable], + checkpointingShutdown: List[Runnable], + sshAccess: List[Runnable], + isBackground: Runnable => Boolean, + ): List[Runnable] = { + val toBeSortedRunnables = localization ++ userRunnable ++ memoryRetryRunnable ++ deLocalization + val sortedRunnables = toBeSortedRunnables.sortWith { + case (runnable, _) => isBackground(runnable) + } + + sshAccess ++ containerSetup ++ monitoringSetup ++ checkpointingStart ++ sortedRunnables ++ checkpointingShutdown ++ monitoringShutdown + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala new file mode 100644 index 00000000000..b7f5a026476 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/UserRunnable.scala @@ -0,0 +1,25 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import cromwell.backend.google.batch.api.GcpBatchRequestFactory.CreateBatchJobParameters + + +trait UserRunnable { + + def userRunnables(createParameters: CreateBatchJobParameters, volumes: List[Volume]): List[Runnable] = { + + println(f"job shell 
${createParameters.jobShell}") + println(f"script container path ${createParameters.commandScriptContainerPath}") + + val userRunnable = RunnableBuilder.userRunnable( + docker = createParameters.dockerImage, + scriptContainerPath = createParameters.commandScriptContainerPath.pathAsString, + jobShell = "/bin/bash", + volumes = volumes, + dockerhubCredentials = createParameters.dockerhubCredentials + ) + + val describeRunnable = RunnableBuilder.describeDocker("user runnable", userRunnable) + List(describeRunnable, userRunnable).map(_.build) + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/WorkflowOptionKeys.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/WorkflowOptionKeys.scala new file mode 100644 index 00000000000..34f22acbd8b --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/runnable/WorkflowOptionKeys.scala @@ -0,0 +1,13 @@ +package cromwell.backend.google.batch.runnable + +object WorkflowOptionKeys { + val MonitoringScript = "monitoring_script" + val MonitoringImage = "monitoring_image" + val MonitoringImageScript = "monitoring_image_script" + val EnableSSHAccess = "enable_ssh_access" + val GoogleProject = "google_project" + val GoogleComputeServiceAccount = "google_compute_service_account" + val EnableFuse = "enable_fuse" + val GoogleLegacyMachineSelection = "google_legacy_machine_selection" + val UseDockerImageCache = "use_docker_image_cache" +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchExpressionFunctions.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchExpressionFunctions.scala new file mode 100644 index 00000000000..4ad6cc45b61 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchExpressionFunctions.scala @@ -0,0 +1,38 @@ +package cromwell.backend.google.batch.util + +import com.google.cloud.storage.contrib.nio.CloudStorageOptions +import cromwell.backend.standard.{StandardExpressionFunctions, StandardExpressionFunctionsParams} +import cromwell.core.CallContext +import cromwell.core.io.{CallCorePathFunctionSet, IoCommandBuilder} +import cromwell.core.path.Path +import cromwell.core.path.PathFactory.PathBuilders +import cromwell.filesystems.gcs.GcsPathBuilder +import cromwell.filesystems.gcs.GcsPathBuilder.{InvalidGcsPath, PossiblyValidRelativeGcsPath, ValidFullGcsPath} +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder + +class BatchPathFunctions(pathBuilders: PathBuilders, callContext: CallContext) extends CallCorePathFunctionSet(pathBuilders, callContext) { + override def relativeToHostCallRoot(path: String) = { + GcsPathBuilder.validateGcsPath(path) match { + case _: ValidFullGcsPath => path + case _ => callContext.root.resolve(path.stripPrefix("file://").stripPrefix("/")).pathAsString + } + } +} + +class BatchExpressionFunctions(standardParams: StandardExpressionFunctionsParams) + extends StandardExpressionFunctions(standardParams) { + override lazy val ioCommandBuilder: IoCommandBuilder = GcsBatchCommandBuilder + + override def preMapping(str: String) = { + GcsPathBuilder.validateGcsPath(str) match { + case _: ValidFullGcsPath => str + case PossiblyValidRelativeGcsPath => callContext.root.resolve(str.stripPrefix("/")).pathAsString + case _: InvalidGcsPath => str + } + } + + override lazy val pathFunctions = new BatchPathFunctions(pathBuilders, callContext) + + override 
protected def writeAsync(file: Path, content: String) = + asyncIo.writeAsync(file, content, Seq(CloudStorageOptions.withMimeType("text/plain"))) +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchParameterConversions.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchParameterConversions.scala new file mode 100644 index 00000000000..ad7c942b4f9 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchParameterConversions.scala @@ -0,0 +1,149 @@ +package cromwell.backend.google.batch.util + +import com.google.cloud.batch.v1.{Runnable, Volume} +//import com.typesafe.config.ConfigFactory +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.models._ +import cromwell.backend.google.batch.runnable._ +import cromwell.filesystems.drs.DrsPath +import cromwell.filesystems.gcs.GcsPath +import cromwell.filesystems.http.HttpPath +//import cromwell.filesystems.sra.SraPath +import simulacrum.typeclass + +@typeclass trait ToParameter[A <: BatchParameter] { + def toRunnables(p: A, volumes: List[Volume])(implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable.Builder] +} + +trait GcpBatchParameterConversions { + import RunnableBuilder._ + import RunnableCommands._ + import RunnableLabels._ + + + implicit val fileInputToParameter: ToParameter[GcpBatchFileInput] = new ToParameter[GcpBatchFileInput] { + override def toRunnables(fileInput: GcpBatchFileInput, volumes: List[Volume]) + (implicit retryPolicy: GcsTransferConfiguration): List[Runnable.Builder] = { + + val labels = RunnableBuilder.parameterLabels(fileInput) + fileInput.cloudPath match { + case _: HttpPath => + val command = s"curl --silent --create-dirs --output ${fileInput.containerPath} ${fileInput.cloudPath}" + val localizationRunnables = RunnableBuilder.cloudSdkShellRunnable(command)(volumes = volumes, labels = labels, flags = List.empty) + List(RunnableBuilder.describeParameter(fileInput, volumes, labels), localizationRunnables) + + case _: GcsPath => + // GCS paths will be localized with a separate localization script. + Nil + case _: DrsPath => + // DRS paths will be localized with a single call to cromwell-drs-localizer with a manifest + Nil + } + } + } + + implicit val directoryInputToParameter: ToParameter[GcpBatchDirectoryInput] = + new ToParameter[GcpBatchDirectoryInput] { + override def toRunnables(directoryInput: GcpBatchDirectoryInput, volumes: List[Volume]) + (implicit retryPolicy: GcsTransferConfiguration): List[Runnable.Builder] = { + directoryInput.cloudPath match { + case _: GcsPath => Nil // GCS paths will be localized with a separate localization script. 
+ case _ => + val labels = RunnableBuilder.parameterLabels(directoryInput) + val describeRunnables = RunnableBuilder.describeParameter(directoryInput, volumes, labels) + val localizationRunnables = RunnableBuilder.cloudSdkShellRunnable( + RunnableCommands.localizeDirectory(directoryInput.cloudPath, directoryInput.containerPath) + )(volumes = volumes, labels = labels, flags = List.empty) + List(describeRunnables, localizationRunnables) + } + } + } + + implicit val fileOutputToParameter: ToParameter[GcpBatchFileOutput] = new ToParameter[GcpBatchFileOutput] { + override def toRunnables(fileOutput: GcpBatchFileOutput, volumes: List[Volume]) + (implicit retryPolicy: GcsTransferConfiguration): List[Runnable.Builder] = { + + // If the output is a "secondary file", it actually could be a directory but we won't know before runtime. + // The fileOrDirectory method will generate a command that can cover both cases + lazy val copy = if (fileOutput.secondary) + RunnableCommands.delocalizeFileOrDirectory(fileOutput.containerPath, fileOutput.cloudPath, fileOutput.contentType) + else + RunnableCommands.delocalizeFile(fileOutput.containerPath, fileOutput.cloudPath, fileOutput.contentType) + + lazy val copyOnlyIfExists = RunnableCommands.ifExist(fileOutput.containerPath) { + copy + } + + lazy val copyCommand = if (fileOutput.optional || fileOutput.secondary) copyOnlyIfExists else copy + lazy val labels = RunnableBuilder.parameterLabels(fileOutput) + + // The delocalization runnables to take once the user command has terminated (i.e., the non-periodic uploads). + val finalDelocalizationRunnables = fileOutput.cloudPath match { + case _: GcsPath => Nil // GCS files are delocalized with a separate delocalization script. + case _ => + val describeRunnable = RunnableBuilder.describeParameter(fileOutput, volumes, labels) + val delocalizationRunnable = RunnableBuilder + .cloudSdkShellRunnable(copyCommand)(volumes = volumes, labels = labels, flags = List.empty) + .withAlwaysRun(true) + + List(describeRunnable, delocalizationRunnable) + } + + fileOutput.uploadPeriod match { + // If the file should be uploaded periodically, create a background upload runnable in addition to any normal ones + // that run at the end to make sure we get the most up to date version of the file. + case Some(period) => + val periodicLabels = labels collect { + case (key, _) if key == Key.Tag => key -> Value.Background + case (key, value) => key -> value + } + val periodic = RunnableBuilder.cloudSdkShellRunnable( + every(period) { + copyCommand + } + )(volumes = volumes, labels = periodicLabels, flags = List.empty).withRunInBackground(true) + + finalDelocalizationRunnables :+ periodic + + case None => finalDelocalizationRunnables + } + } + } + + implicit val directoryOutputToParameter: ToParameter[GcpBatchDirectoryOutput] = + new ToParameter[GcpBatchDirectoryOutput] { + override def toRunnables(directoryOutput: GcpBatchDirectoryOutput, volumes: List[Volume]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable.Builder] = { + directoryOutput.cloudPath match { + case _: GcsPath => Nil // GCS paths will be delocalized with a separate delocalization script. 
+ case _ => + val labels = RunnableBuilder.parameterLabels(directoryOutput) + val describeRunnable = RunnableBuilder.describeParameter(directoryOutput, volumes, labels) + val delocalizationRunnable = RunnableBuilder.cloudSdkShellRunnable( + delocalizeDirectory(directoryOutput.containerPath, directoryOutput.cloudPath, None) + )(volumes = volumes, labels = labels, flags = List.empty) + .withAlwaysRun(true) + + List(describeRunnable, delocalizationRunnable) + } + } + } + + implicit val inputToParameter: ToParameter[GcpBatchInput] = new ToParameter[GcpBatchInput] { + override def toRunnables(p: GcpBatchInput, volumes: List[Volume]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable.Builder] = p match { + case fileInput: GcpBatchFileInput => fileInputToParameter.toRunnables(fileInput, volumes) + case directoryInput: GcpBatchDirectoryInput => directoryInputToParameter.toRunnables(directoryInput, volumes) + } + } + + implicit val outputToParameter: ToParameter[GcpBatchOutput] = new ToParameter[GcpBatchOutput] { + override def toRunnables(p: GcpBatchOutput, volumes: List[Volume]) + (implicit gcsTransferConfiguration: GcsTransferConfiguration): List[Runnable.Builder] = p match { + case fileOutput: GcpBatchFileOutput => fileOutputToParameter.toRunnables(fileOutput, volumes) + case directoryOutput: GcpBatchDirectoryOutput => directoryOutputToParameter.toRunnables(directoryOutput, volumes) + } + } +} + +object GcpBatchParameterConversions extends GcpBatchParameterConversions diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchUtilityConversions.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchUtilityConversions.scala new file mode 100644 index 00000000000..87b4762f5a5 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/BatchUtilityConversions.scala @@ -0,0 +1,102 @@ +package cromwell.backend.google.batch.util + +import com.google.cloud.batch.v1.AllocationPolicy.{Accelerator, AttachedDisk, Disk, ProvisioningModel} +import com.google.cloud.batch.v1.Volume +import cromwell.backend.google.batch.io.{DiskType, GcpBatchAttachedDisk, GcpBatchReferenceFilesDisk} +import cromwell.backend.google.batch.models.{GcpBatchRuntimeAttributes, GpuResource} +import wom.format.MemorySize + +trait BatchUtilityConversions { + + + // construct zones string + def toZonesPath(zones: Vector[String]): String = { + "zones/" + zones.mkString(",") + } + + // lowercase text to match gcp label requirements + def toLabel(text: String): String = { + text.toLowerCase + } + + // Creates the Batch run location (region) from the zones specified in the runtime attributes. This is needed when the user has not defined a network, so the job can be placed on the correct regional network.
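+ // Worked example (illustrative, assuming zone names of the form <region>-<zone suffix>): toBatchRunLocation(Vector("us-central1-a", "us-central1-b")) returns "us-central1".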
+ def toBatchRunLocation(zones: Vector[String]): String = { + val parts = zones.mkString(",").split("-") + parts(0) + "-" + parts(1) + } + + // convert cpu cores to millicores that Batch expects + def toCpuCores(cpu: Long): Long = { + cpu * 1000 + } + + // convert memory to MiB that Batch expects + def toMemMib(memory: MemorySize): Long = { + (memory.amount * 1024).toLong + } + + // set Standard or Spot instances + def toProvisioningModel(preemption: Int): ProvisioningModel = preemption compare 0 match { + case 0 => ProvisioningModel.STANDARD + case 1 => ProvisioningModel.SPOT + } + + def toDisks(disks: Seq[GcpBatchAttachedDisk]): List[AttachedDisk] = disks.map(toDisk).toList + + def toVolumes(disks: Seq[GcpBatchAttachedDisk]): List[Volume] = disks.map(toVolume).toList + + def toVolume(disk: GcpBatchAttachedDisk): Volume = { + val volume = Volume + .newBuilder + .setDeviceName(disk.name) + .setMountPath(disk.mountPoint.pathAsString) + + + disk match { + case _: GcpBatchReferenceFilesDisk => + volume + .addMountOptions("async, rw") + .build + case _ => + volume + .build + } + } + + private def toDisk(disk: GcpBatchAttachedDisk): AttachedDisk = { + val googleDisk = Disk + .newBuilder + .setSizeGb(disk.sizeGb.toLong) + .setType(toBatchDiskType(disk.diskType)) + + disk match { + case refDisk: GcpBatchReferenceFilesDisk => + googleDisk.setImage(refDisk.image) + .build + case _ => + googleDisk.build + } + + val googleAttachedDisk = AttachedDisk + .newBuilder + .setDeviceName(disk.name) + .setNewDisk(googleDisk) + .build + googleAttachedDisk + + } + + private def toBatchDiskType(diskType: DiskType) = diskType match { + case DiskType.HDD => "pd-standard" + case DiskType.SSD => "pd-ssd" + case DiskType.LOCAL => "local-ssd" + } + + def convertGbToMib(runtimeAttributes: GcpBatchRuntimeAttributes): Long = { + (runtimeAttributes.bootDiskSize * 953.7).toLong + } + + // Create accelerators for GPUs + def toAccelerator(gpuResource: GpuResource): Accelerator.Builder = Accelerator.newBuilder.setCount(gpuResource.gpuCount.value.toLong).setType(gpuResource.gpuType.toString) + +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchDockerCacheMappingOperations.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchDockerCacheMappingOperations.scala new file mode 100644 index 00000000000..e7ca7a0e6a8 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchDockerCacheMappingOperations.scala @@ -0,0 +1,81 @@ +package cromwell.backend.google.batch.util + +import _root_.io.circe.generic.auto._ +import _root_.io.circe.parser._ +import cats.effect.IO +import com.google.api.services.storage.StorageScopes +import com.google.cloud.storage.{BlobId, Storage, StorageOptions} +import cromwell.cloudsupport.gcp.auth.GoogleAuthMode +import cromwell.core.logging.JobLogger +import cromwell.filesystems.gcs.GcsPathBuilder.ValidFullGcsPath + +import scala.util.control.NoStackTrace + +case class DockerImageCacheEntry(dockerImageDigest: String, diskImageName: String) +case class DockerImageCacheManifest(manifestFormatVersion: Int, dockerImageCacheMap: Map[String, DockerImageCacheEntry]) + +trait GcpBatchDockerCacheMappingOperations { + + private val CURRENT_SUPPORTED_MANIFEST_FORMAT_VERSION = 2 + private class DockerImageManifestVersionError(message: String) extends RuntimeException(message) with NoStackTrace + + def generateDockerImageToDiskImageMapping(auth: GoogleAuthMode, + 
dockerImageCacheManifestFile: ValidFullGcsPath): Map[String, DockerImageCacheEntry] = { + + val gcsClient = StorageOptions + .newBuilder() + .setCredentials(auth.credentials(Set(StorageScopes.DEVSTORAGE_READ_ONLY))) + .build + .getService + val mappingsFromManifestIO = readDockerImageCacheManifestFileFromGCS(gcsClient, dockerImageCacheManifestFile) + mappingsFromManifestIO.map(_.dockerImageCacheMap).unsafeRunSync() + } + + def getDockerCacheDiskImageForAJob(dockerImageToCacheDiskImageMappingOpt: Option[Map[String, DockerImageCacheEntry]], + dockerImageAsSpecifiedByUser: String, + dockerImageWithDigest: String, + jobLogger: JobLogger): Option[String] = { + dockerImageToCacheDiskImageMappingOpt + .flatMap(_.get(dockerImageAsSpecifiedByUser)) + .filter { cachedDockerImageDigestAndDiskName => + val hashStartingPositionInActualDockerImage = dockerImageWithDigest.indexOf('@') + if (hashStartingPositionInActualDockerImage != -1) { + val actualDigestOfDesiredDockerImage = dockerImageWithDigest.substring(hashStartingPositionInActualDockerImage + 1) + if (cachedDockerImageDigestAndDiskName.dockerImageDigest == actualDigestOfDesiredDockerImage) { + true + } else { + jobLogger.info(s"Cached Docker image digest mismatch. Requested docker image $dockerImageAsSpecifiedByUser has a different digest than the " + + s"corresponding cached image located at the ${cachedDockerImageDigestAndDiskName.diskImageName} disk image. " + + s"Digest of requested image is $actualDigestOfDesiredDockerImage, but digest of cached image is ${cachedDockerImageDigestAndDiskName.dockerImageDigest}. " + + s"Docker image cache feature will not be used for this task.") + + false + } + } else { + jobLogger.error(s"Programmer error! Expected a docker image name containing a digest, but got: $dockerImageWithDigest") + false + } + } + .map(_.diskImageName) + } + + private[batch] def readDockerImageCacheManifestFileFromGCS(gcsClient: Storage, gcsPath: ValidFullGcsPath): IO[DockerImageCacheManifest] = { + val manifestFileBlobIo = IO { gcsClient.get(BlobId.of(gcsPath.bucket, gcsPath.path.substring(1))) } + manifestFileBlobIo flatMap { manifestFileBlob => + val jsonStringIo = IO { manifestFileBlob.getContent().map(_.toChar).mkString } + jsonStringIo.flatMap { jsonStr => + decode[DockerImageCacheManifest](jsonStr) match { + case Left(error) => IO.raiseError(error) + case Right(parsedManifest) => + if (parsedManifest.manifestFormatVersion == CURRENT_SUPPORTED_MANIFEST_FORMAT_VERSION) { + IO.pure(parsedManifest) + } else { + IO.raiseError(new DockerImageManifestVersionError(s"Current supported docker image cache manifest format version " + + s"is $CURRENT_SUPPORTED_MANIFEST_FORMAT_VERSION, but got ${parsedManifest.manifestFormatVersion}")) + } + } + } + } + } + +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchExpressionFunctions.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchExpressionFunctions.scala new file mode 100644 index 00000000000..2cb2b86b34c --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchExpressionFunctions.scala @@ -0,0 +1,38 @@ +package cromwell.backend.google.batch.util + +import com.google.cloud.storage.contrib.nio.CloudStorageOptions +import cromwell.backend.standard.{StandardExpressionFunctions, StandardExpressionFunctionsParams} +import cromwell.core.CallContext +import cromwell.core.io.{CallCorePathFunctionSet, IoCommandBuilder} 
+import cromwell.core.path.Path +import cromwell.core.path.PathFactory.PathBuilders +import cromwell.filesystems.gcs.GcsPathBuilder +import cromwell.filesystems.gcs.GcsPathBuilder.{InvalidGcsPath, PossiblyValidRelativeGcsPath, ValidFullGcsPath} +import cromwell.filesystems.gcs.batch.GcsBatchCommandBuilder + +class GcpBatchPathFunctions(pathBuilders: PathBuilders, callContext: CallContext) extends CallCorePathFunctionSet(pathBuilders, callContext) { + override def relativeToHostCallRoot(path: String) = { + GcsPathBuilder.validateGcsPath(path) match { + case _: ValidFullGcsPath => path + case _ => callContext.root.resolve(path.stripPrefix("file://").stripPrefix("/")).pathAsString + } + } +} + +class GcpBatchExpressionFunctions(standardParams: StandardExpressionFunctionsParams) + extends StandardExpressionFunctions(standardParams) { + override lazy val ioCommandBuilder: IoCommandBuilder = GcsBatchCommandBuilder + + override def preMapping(str: String) = { + GcsPathBuilder.validateGcsPath(str) match { + case _: ValidFullGcsPath => str + case PossiblyValidRelativeGcsPath => callContext.root.resolve(str.stripPrefix("/")).pathAsString + case _: InvalidGcsPath => str + } + } + + override lazy val pathFunctions = new GcpBatchPathFunctions(pathBuilders, callContext) + + override protected def writeAsync(file: Path, content: String) = + asyncIo.writeAsync(file, content, Seq(CloudStorageOptions.withMimeType("text/plain"))) +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala new file mode 100644 index 00000000000..83b4796e039 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala @@ -0,0 +1,31 @@ +package cromwell.backend.google.batch.util + +import cromwell.backend.google.batch.models.{GcpBatchRuntimeAttributes, N1CustomMachineType, N2CustomMachineType, N2DCustomMachineType} +import eu.timepit.refined.api.Refined +import eu.timepit.refined.numeric.Positive +import org.slf4j.Logger +import wdl4s.parser.MemoryUnit +import wom.format.MemorySize + +object GcpBatchMachineConstraints { + def machineType(memory: MemorySize, + cpu: Int Refined Positive, + cpuPlatformOption: Option[String], + googleLegacyMachineSelection: Boolean, + jobLogger: Logger, + ): String = { + if (googleLegacyMachineSelection) { + s"predefined-$cpu-${memory.to(MemoryUnit.MB).amount.intValue()}" + } else { + // If someone requests Intel Cascade Lake as their CPU platform then switch the machine type to n2. + // Similarly, CPU platform of AMD Rome corresponds to the machine type n2d. 
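+ // For example (illustrative only): cpuPlatform Some("Intel Cascade Lake") selects N2CustomMachineType, Some("AMD Rome") selects N2DCustomMachineType, and any other value falls back to N1CustomMachineType; the final machine type string is then built by the selected type's machineType(memory, cpu, jobLogger).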
+ val customMachineType = + cpuPlatformOption match { + case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) => N2CustomMachineType + case Some(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) => N2DCustomMachineType + case _ => N1CustomMachineType + } + customMachineType.machineType(memory, cpu, jobLogger) + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchReferenceFilesMappingOperations.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchReferenceFilesMappingOperations.scala new file mode 100644 index 00000000000..f472263f51c --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchReferenceFilesMappingOperations.scala @@ -0,0 +1,119 @@ +package cromwell.backend.google.batch.util + +import cats.effect.IO +import cats.implicits._ +import com.google.api.services.storage.StorageScopes +import com.google.cloud.storage.Storage.{BlobField, BlobGetOption} +import com.google.cloud.storage.{BlobId, Storage, StorageOptions} +import com.google.common.io.BaseEncoding +import com.google.common.primitives.Longs +import cromwell.backend.google.batch.errors.InvalidGcsPathsInManifestFileException +import cromwell.backend.google.batch.io.GcpBatchReferenceFilesDisk +import cromwell.backend.google.batch.models.{GcpBatchInput, ManifestFile, ReferenceFile} +import cromwell.cloudsupport.gcp.auth.GoogleAuthMode +import cromwell.filesystems.gcs.GcsPathBuilder.{InvalidFullGcsPath, ValidFullGcsPath} +import cromwell.filesystems.gcs.{GcsPath, GcsPathBuilder} +import org.slf4j.{Logger, LoggerFactory} + +import java.util + +trait GcpBatchReferenceFilesMappingOperations { + private val logger: Logger = LoggerFactory.getLogger(getClass) + + /** + * This method validates reference files' CRC32Cs. Depending on the number of manifests and their sizes this + * may take a significant amount of time. 
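+ * Checksums are compared via a batched GCS metadata lookup of each blob's CRC32C field (see bulkValidateCrc32cs below) rather than by downloading the reference files themselves.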
+ */ + def generateReferenceFilesMapping(auth: GoogleAuthMode, + referenceDiskLocalizationManifests: List[ManifestFile]): Map[String, GcpBatchReferenceFilesDisk] = { + val gcsClient = StorageOptions + .newBuilder() + .setCredentials(auth.credentials(Set(StorageScopes.DEVSTORAGE_READ_ONLY))) + .build + .getService + + val validReferenceFilesMapIO = referenceDiskLocalizationManifests + .traverse(manifestFile => getMapOfValidReferenceFilePathsToDisks(gcsClient, manifestFile)) + .map(_.flatten.toMap) + + validReferenceFilesMapIO.unsafeRunSync() + } + + def getReferenceInputsToMountedPathMappings(referenceFileToDiskImageMapping: Map[String, GcpBatchReferenceFilesDisk], + inputFiles: List[GcpBatchInput]): Map[GcpBatchInput, String] = { + val gcsPathsToInputs = inputFiles.collect { case i if i.cloudPath.isInstanceOf[GcsPath] => (i.cloudPath.asInstanceOf[GcsPath].pathAsString, i) }.toMap + referenceFileToDiskImageMapping.collect { + case (path, disk) if gcsPathsToInputs.keySet.contains(s"gs://$path") => + (gcsPathsToInputs(s"gs://$path"), s"${disk.mountPoint.pathAsString}/$path") + } + } + + def getReferenceDisksToMount(referenceFileToDiskImageMapping: Map[String, GcpBatchReferenceFilesDisk], + inputFilePaths: Set[String]): List[GcpBatchReferenceFilesDisk] = { + referenceFileToDiskImageMapping.view.filterKeys(key => inputFilePaths.contains(s"gs://$key")).values.toList.distinct + } + + private def getReferenceFileToValidatedGcsPathMap(referenceFiles: Set[ReferenceFile]): IO[Map[ReferenceFile, ValidFullGcsPath]] = { + val filesAndValidatedPaths = referenceFiles.map { + referenceFile => (referenceFile, GcsPathBuilder.validateGcsPath(s"gs://${referenceFile.path}")) + }.toMap + + val filesWithValidPaths = filesAndValidatedPaths.collect { + case (referenceFile, validPath: ValidFullGcsPath) => (referenceFile, validPath) + } + val filesWithInvalidPaths = filesAndValidatedPaths.collect { + case (referenceFile, invalidPath: InvalidFullGcsPath) => (referenceFile, invalidPath) + } + + if (filesWithInvalidPaths.nonEmpty) { + IO.raiseError(new InvalidGcsPathsInManifestFileException(filesWithInvalidPaths.keySet.map(_.path).toList)) + } else { + IO.pure(filesWithValidPaths) + } + } + + protected def bulkValidateCrc32cs(gcsClient: Storage, + filesWithValidPaths: Map[ReferenceFile, ValidFullGcsPath]): IO[Map[ReferenceFile, Boolean]] = { + IO { + val gcsBatch = gcsClient.batch() + val filesAndBlobResults = filesWithValidPaths map { + case (referenceFile, ValidFullGcsPath(bucket, path)) => + val blobGetResult = gcsBatch.get(BlobId.of(bucket, path.substring(1)), BlobGetOption.fields(BlobField.CRC32C)) + (referenceFile, blobGetResult) + } + gcsBatch.submit() + + filesAndBlobResults map { + case (referenceFile, blobGetResult) => + val crc32cFromManifest = BaseEncoding.base64.encode( + // drop 4 leading bytes from Long crc32c value + // https://stackoverflow.com/a/25111119/1794750 + util.Arrays.copyOfRange(Longs.toByteArray(referenceFile.crc32c), 4, 8) + ) + + (referenceFile, crc32cFromManifest === blobGetResult.get().getCrc32c) + } + } + } + + private def getMapOfValidReferenceFilePathsToDisks(gcsClient: Storage, manifestFile: ManifestFile): IO[Map[String, GcpBatchReferenceFilesDisk]] = { + val refDisk = GcpBatchReferenceFilesDisk(manifestFile.imageIdentifier, manifestFile.diskSizeGb) + val allReferenceFilesFromManifestMap = manifestFile.files.map(refFile => (refFile, refDisk)).toMap + + val validReferenceFilesFromManifestMapIo = + for { + referenceFilesWithValidPaths <- 
getReferenceFileToValidatedGcsPathMap(allReferenceFilesFromManifestMap.keySet) + filesWithValidatedCrc32cs <- bulkValidateCrc32cs(gcsClient, referenceFilesWithValidPaths) + } yield allReferenceFilesFromManifestMap.view.filterKeys(key => filesWithValidatedCrc32cs.getOrElse(key, false)) + + validReferenceFilesFromManifestMapIo map { validReferenceFilesFromManifestMap => + val invalidReferenceFiles = allReferenceFilesFromManifestMap.keySet -- validReferenceFilesFromManifestMap.keySet + if (invalidReferenceFiles.nonEmpty) { + logger.warn(s"The following files listed in references manifest have checksum mismatch with actual files in GCS: ${invalidReferenceFiles.mkString(",")}") + } + validReferenceFilesFromManifestMap.map { + case (refFile, disk) => (refFile.path, disk) + }.toMap + } + } +} diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuTypeValidation.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuTypeValidation.scala new file mode 100644 index 00000000000..e7ade1bc46a --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuTypeValidation.scala @@ -0,0 +1,25 @@ +package cromwell.backend.google.batch.util + +import cats.syntax.validated._ +import common.validation.ErrorOr.ErrorOr +import cromwell.backend.google.batch.models.GpuResource.GpuType +import cromwell.backend.validation.{OptionalRuntimeAttributesValidation, RuntimeAttributesValidation} +import wom.RuntimeAttributesKeys +import wom.types.{WomStringType, WomType} +import wom.values.{WomString, WomValue} + + +object GpuTypeValidation { + lazy val instance: RuntimeAttributesValidation[GpuType] = new GpuTypeValidation + lazy val optional: OptionalRuntimeAttributesValidation[GpuType] = instance.optional +} + +class GpuTypeValidation extends RuntimeAttributesValidation[GpuType] { + override def key = RuntimeAttributesKeys.GpuTypeKey + + override def coercion: Iterable[WomType] = Set(WomStringType) + override def validateValue: PartialFunction[WomValue, ErrorOr[GpuType]] = { + case WomString(s) => GpuType(s).validNel + case other => s"Invalid '$key': String value required but got ${other.womType.friendlyName}. 
See ${GpuType.MoreDetailsURL} for a list of options".invalidNel + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuValidation.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuValidation.scala new file mode 100644 index 00000000000..4b24c9aeacf --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GpuValidation.scala @@ -0,0 +1,40 @@ +package cromwell.backend.google.batch.util + +import cats.data.NonEmptyList +import cats.syntax.either._ +import cats.syntax.validated._ +import com.typesafe.config.Config +import common.validation.ErrorOr.ErrorOr +import cromwell.backend.validation.{OptionalRuntimeAttributesValidation, PositiveIntRuntimeAttributesValidation, RuntimeAttributesValidation} +import eu.timepit.refined.api.Refined +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.refineV +import wom.RuntimeAttributesKeys.{GpuKey, GpuMaxKey, GpuMinKey} +import wom.types.WomIntegerType +import wom.values.{WomInteger, WomValue} + +object GpuValidation { + lazy val instance: RuntimeAttributesValidation[Int Refined Positive] = new GpuValidation(GpuKey) + lazy val optional: OptionalRuntimeAttributesValidation[Int Refined Positive] = instance.optional + lazy val instanceMin: RuntimeAttributesValidation[Int Refined Positive] = new GpuValidation(GpuMinKey) + lazy val optionalMin: OptionalRuntimeAttributesValidation[Int Refined Positive] = instanceMin.optional + lazy val instanceMax: RuntimeAttributesValidation[Int Refined Positive] = new GpuValidation(GpuMaxKey) + lazy val optionalMax: OptionalRuntimeAttributesValidation[Int Refined Positive] = instanceMax.optional + + lazy val defaultMin: WomValue = WomInteger(0) + def configDefaultWomValue(config: Option[Config]): Option[WomValue] = instance.configDefaultWomValue(config) +} + +class GpuValidation(attributeName: String) extends PositiveIntRuntimeAttributesValidation(attributeName) { + override protected def validateValue: PartialFunction[WomValue, ErrorOr[Int Refined Positive]] = { + case womValue if WomIntegerType.coerceRawValue(womValue).isSuccess => + WomIntegerType.coerceRawValue(womValue).get match { + case WomInteger(value) => + refineV[Positive](value.toInt) + .leftMap(_ => NonEmptyList.one(s"Expecting $key runtime attribute value greater than 0")) + .toValidated + } + case other => + s"Invalid gpu count. Expected positive Int but got ${other.womType.friendlyName} ${other.toWomString}".invalidNel + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/RuntimeOutputMapping.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/RuntimeOutputMapping.scala new file mode 100644 index 00000000000..4177c09dc6a --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/RuntimeOutputMapping.scala @@ -0,0 +1,33 @@ +package cromwell.backend.google.batch.util + +import common.util.StringUtil._ +import cromwell.backend.google.batch.io.GcpBatchWorkingDisk +import cromwell.core.path.Path + +object RuntimeOutputMapping { + + /** + * List of prefixes to be stripped away from runtime output paths before + * appending them to the cloud call root to generate the delocalization path. 
+ * The goal is to reduce unnecessary long paths which keep repeating the workflow root + * as the workflow progresses + * + * For instance: + * + * file:///cromwell_root/bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-A/file.txt + * -> + * call-A/file.txt + * + * Which will be delocalized to + * gs://bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-B/call-A/file.txt + * instead of + * gs://bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-B/cromwell_root/bucket/workflow_name/6d777414-5ee7-4c60-8b9e-a02ec44c398e/call-A/file.txt + */ + def prefixFilters(workflowRoot: Path): List[String] = List( + "file://", + "/", + GcpBatchWorkingDisk.MountPoint.pathAsString.relativeDirectory, + workflowRoot.pathWithoutScheme.relativeDirectory + ) + +} diff --git a/supportedBackends/google/batch/src/test/resources/reference.conf b/supportedBackends/google/batch/src/test/resources/reference.conf new file mode 100644 index 00000000000..3685305454d --- /dev/null +++ b/supportedBackends/google/batch/src/test/resources/reference.conf @@ -0,0 +1 @@ +drs.localization.docker-image = "somerepo/drs-downloader:tagged" \ No newline at end of file diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/GcpBatchIoSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/GcpBatchIoSpec.scala new file mode 100644 index 00000000000..186152c0768 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/GcpBatchIoSpec.scala @@ -0,0 +1,34 @@ +package cromwell.backend.google.batch + +import com.google.api.client.http.HttpResponseException +import com.google.api.client.testing.http.{HttpTesting, MockHttpTransport, MockLowLevelHttpRequest, MockLowLevelHttpResponse} +import common.assertion.CromwellTimeoutSpec +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class GcpBatchIoSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers { + + behavior of "io" + + it should "consider 403 as a fatal exception" in { + val transport = mockTransport(403) + val request = transport.createRequestFactory().buildGetRequest(HttpTesting.SIMPLE_GENERIC_URL) + val mockedResponse = intercept[HttpResponseException](request.execute()) + io.isFatalJesException(mockedResponse) should be(true) + } + + it should "consider 429 as a transient exception" in { + val transport = mockTransport(429) + val request = transport.createRequestFactory().buildGetRequest(HttpTesting.SIMPLE_GENERIC_URL) + val mockedResponse = intercept[HttpResponseException](request.execute()) + io.isTransientJesException(mockedResponse) should be(true) + } + + private def mockTransport(statusCode: Int) = new MockHttpTransport() { + override def buildRequest(method: String, url: String) = { + new MockLowLevelHttpRequest() { + override def execute() = new MockLowLevelHttpResponse().setStatusCode(statusCode) + } + } + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala new file mode 100644 index 00000000000..711f59d6845 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala @@ -0,0 +1,800 @@ +package cromwell.backend.google.batch +package actors + +import _root_.io.grpc.Status 
+import _root_.wdl.draft2.model._ +import akka.actor.{ActorRef, Props} +import akka.testkit.{ImplicitSender, TestActorRef, TestDuration, TestProbe} +import cats.data.NonEmptyList +import com.google.cloud.NoCredentials +import com.google.cloud.batch.v1.{Job, JobName} +import common.collections.EnhancedCollections._ +import cromwell.backend.BackendJobExecutionActor.BackendJobExecutionResponse +import cromwell.backend._ +import cromwell.backend.async.AsyncBackendJobExecutionActor.{Execute, ExecutionMode} +import cromwell.backend.async.{ExecutionHandle, FailedNonRetryableExecutionHandle} +import cromwell.backend.google.batch.actors.GcpBatchAsyncBackendJobExecutionActor.GcpBatchPendingExecutionHandle +import cromwell.backend.google.batch.io.{DiskType, GcpBatchWorkingDisk} +import cromwell.backend.google.batch.models._ +import cromwell.backend.google.batch.util.BatchExpressionFunctions +import cromwell.backend.io.JobPathsSpecHelper._ +import cromwell.backend.standard.{DefaultStandardAsyncExecutionActorParams, StandardAsyncExecutionActorParams, StandardAsyncJob, StandardExpressionFunctionsParams} +import cromwell.core._ +import cromwell.core.callcaching.NoDocker +import cromwell.core.labels.Labels +import cromwell.core.logging.JobLogger +import cromwell.core.path.{DefaultPathBuilder, PathBuilder} +import cromwell.filesystems.gcs.{GcsPath, GcsPathBuilder, MockGcsPathBuilder} +import cromwell.services.instrumentation.InstrumentationService.InstrumentationServiceMessage +import cromwell.services.instrumentation.{CromwellBucket, CromwellIncrement} +import cromwell.services.keyvalue.InMemoryKvServiceActor +import cromwell.services.keyvalue.KeyValueServiceActor.{KvJobKey, KvPair, ScopedKey} +import cromwell.util.JsonFormatting.WomValueJsonFormatter._ +import cromwell.util.SampleWdl +import org.scalatest._ +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import org.slf4j.Logger +import spray.json._ +import wdl.transforms.draft2.wdlom2wom.WdlDraft2WomExecutableMakers._ +import wdl.transforms.draft2.wdlom2wom._ +import wom.WomFileMapper +import wom.expression.NoIoFunctionSet +import wom.graph.CommandCallNode +import wom.transforms.WomExecutableMaker.ops._ +import wom.transforms.WomWorkflowDefinitionMaker.ops._ +import wom.values._ + +import java.nio.file.Paths +import java.util.UUID +import scala.concurrent.duration._ +import scala.concurrent.{Await, ExecutionContext, Future, Promise} +import scala.language.postfixOps +import common.mock.MockSugar +import org.mockito.Mockito._ + +class GcpBatchAsyncBackendJobExecutionActorSpec extends TestKitSuite + with AnyFlatSpecLike + with Matchers + with ImplicitSender + with BackendSpec + with BeforeAndAfter + with MockSugar + with DefaultJsonProtocol { + + val mockPathBuilder: GcsPathBuilder = MockGcsPathBuilder.instance + import MockGcsPathBuilder._ + var kvService: ActorRef = system.actorOf(Props(new InMemoryKvServiceActor), "kvService") + + private def gcsPath(str: String) = mockPathBuilder.build(str).getOrElse(fail(s"Invalid gcs path: $str")) + + //import GcpBatchTestConfig._ + import cromwell.backend.google.batch.models.GcpBatchTestConfig._ + + implicit val Timeout: FiniteDuration = 25.seconds.dilated + + val YoSup: String = + s""" + |task sup { + | String addressee + | command { + | echo "yo sup $${addressee}!" 
+ | } + | output { + | String salutation = read_string(stdout()) + | } + | runtime { + | docker: "ubuntu:latest" + | [PREEMPTIBLE] + | } + |} + | + |workflow wf_sup { + | call sup + |} + """.stripMargin + + val Inputs: Map[FullyQualifiedName, WomValue] = Map("wf_sup.sup.addressee" -> WomString("dog")) + + private val NoOptions = WorkflowOptions(JsObject(Map.empty[String, JsValue])) + + private lazy val TestableCallContext = CallContext(mockPathBuilder.build("gs://root").get, DummyStandardPaths, isDocker = false) + + private lazy val TestableStandardExpressionFunctionsParams: StandardExpressionFunctionsParams + = new StandardExpressionFunctionsParams { + override lazy val pathBuilders: List[PathBuilder] = List(mockPathBuilder) + override lazy val callContext: CallContext = TestableCallContext + override val ioActorProxy: ActorRef = simpleIoActor + override val executionContext: ExecutionContext = system.dispatcher + } + + lazy val TestableGcpBatchExpressionFunctions: BatchExpressionFunctions = { + new BatchExpressionFunctions(TestableStandardExpressionFunctionsParams) + } + + private def buildInitializationData(jobDescriptor: BackendJobDescriptor, configuration: GcpBatchConfiguration) = { + val pathBuilders = Await.result(configuration.configurationDescriptor.pathBuilders(WorkflowOptions.empty), 5.seconds) + val workflowPaths = GcpBatchWorkflowPaths( + jobDescriptor.workflowDescriptor, NoCredentials.getInstance(), NoCredentials.getInstance(), configuration, pathBuilders, GcpBatchInitializationActor.defaultStandardStreamNameToFileNameMetadataMapper) + val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(configuration) + + GcpBackendInitializationData(workflowPaths, runtimeAttributesBuilder, configuration, null, None, None, None) + } + + class TestableGcpBatchJobExecutionActor(params: StandardAsyncExecutionActorParams, functions: BatchExpressionFunctions) + extends GcpBatchAsyncBackendJobExecutionActor(params) { + + def this(jobDescriptor: BackendJobDescriptor, + promise: Promise[BackendJobExecutionResponse], + batchConfiguration: GcpBatchConfiguration, + functions: BatchExpressionFunctions = TestableGcpBatchExpressionFunctions, + batchSingletonActor: ActorRef = emptyActor, + ioActor: ActorRef = mockIoActor, + serviceRegistryActor: ActorRef = kvService) = { + + this( + DefaultStandardAsyncExecutionActorParams( + jobIdKey = GcpBatchAsyncBackendJobExecutionActor.GcpBatchOperationIdKey, + serviceRegistryActor = serviceRegistryActor, + ioActor = ioActor, + jobDescriptor = jobDescriptor, + configurationDescriptor = batchConfiguration.configurationDescriptor, + backendInitializationDataOption = Option(buildInitializationData(jobDescriptor, batchConfiguration)), + backendSingletonActorOption = Option(batchSingletonActor), + completionPromise = promise, + minimumRuntimeSettings = MinimumRuntimeSettings() + ), + functions + ) + } + + override lazy val jobLogger: JobLogger = new JobLogger( + loggerName = "TestLogger", + workflowIdForLogging = workflowId.toPossiblyNotRoot, + rootWorkflowIdForLogging = workflowId.toRoot, + jobTag = jobTag, + akkaLogger = Option(log) + ) { + override def tag: String = s"$name [UUID(${workflowId.shortString})$jobTag]" + override val slf4jLoggers: Set[Logger] = Set.empty + } + + override lazy val backendEngineFunctions: BatchExpressionFunctions = functions + } + + + private val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(gcpBatchConfiguration) + private val workingDisk = GcpBatchWorkingDisk(DiskType.SSD, 200) + 
+ val DockerAndDiskRuntime: String = + """ + |runtime { + | docker: "ubuntu:latest" + | disks: "local-disk 200 SSD" + |} + """.stripMargin + + private def buildPreemptibleJobDescriptor(preemptible: Int, previousPreemptions: Int, previousUnexpectedRetries: Int, failedRetriesCountOpt: Option[Int] = None): BackendJobDescriptor = { + val attempt = previousPreemptions + previousUnexpectedRetries + 1 + val wdlNamespace = WdlNamespaceWithWorkflow.load(YoSup.replace("[PREEMPTIBLE]", s"preemptible: $preemptible"), + Seq.empty[Draft2ImportResolver]).get + val womDefinition = wdlNamespace.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")) + + wdlNamespace.toWomExecutable(Option(Inputs.toJson.compactPrint), NoIoFunctionSet, strictValidation = true) match { + case Right(womExecutable) => + val inputs = for { + combined <- womExecutable.resolvedExecutableInputs + (port, resolvedInput) = combined + value <- resolvedInput.select[WomValue] + } yield port -> value + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId.randomId(), + womDefinition, + inputs, + NoOptions, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val job = workflowDescriptor.callable.taskCallNodes.head + val key = BackendJobDescriptorKey(job, None, attempt) + val runtimeAttributes = makeRuntimeAttributes(job) + val prefetchedKvEntries = Map( + GcpBatchBackendLifecycleActorFactory.preemptionCountKey -> KvPair(ScopedKey(workflowDescriptor.id, KvJobKey(key), GcpBatchBackendLifecycleActorFactory.preemptionCountKey), previousPreemptions.toString), + GcpBatchBackendLifecycleActorFactory.unexpectedRetryCountKey -> KvPair(ScopedKey(workflowDescriptor.id, KvJobKey(key), GcpBatchBackendLifecycleActorFactory.unexpectedRetryCountKey), previousUnexpectedRetries.toString)) + val prefetchedKvEntriesUpd = if(failedRetriesCountOpt.isEmpty) { + prefetchedKvEntries + } else { + prefetchedKvEntries + (BackendLifecycleActorFactory.FailedRetryCountKey -> KvPair(ScopedKey(workflowDescriptor.id, KvJobKey(key), BackendLifecycleActorFactory.FailedRetryCountKey), failedRetriesCountOpt.get.toString )) + } + BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, fqnWdlMapToDeclarationMap(Inputs), NoDocker, None, prefetchedKvEntriesUpd) + case Left(badtimes) => fail(badtimes.toList.mkString(", ")) + } + } + + private case class DockerImageCacheTestingParameters(dockerImageCacheDiskOpt: Option[String], + dockerImageAsSpecifiedByUser: String, + isDockerImageCacheUsageRequested: Boolean) + + private def executionActor(jobDescriptor: BackendJobDescriptor, + promise: Promise[BackendJobExecutionResponse], + batchSingletonActor: ActorRef, + shouldBePreemptible: Boolean, + serviceRegistryActor: ActorRef = kvService, + referenceInputFilesOpt: Option[Set[GcpBatchInput]] = None, + dockerImageCacheTestingParamsOpt: Option[DockerImageCacheTestingParameters] = None + ): ActorRef = { + + val job = generateStandardAsyncJob + val run = Run(job) + val handle = new GcpBatchPendingExecutionHandle(jobDescriptor, run.job, Option(run), None) + + class ExecuteOrRecoverActor extends TestableGcpBatchJobExecutionActor(jobDescriptor, promise, gcpBatchConfiguration, batchSingletonActor = batchSingletonActor, serviceRegistryActor = serviceRegistryActor) { + override def executeOrRecover(mode: ExecutionMode)(implicit ec: ExecutionContext): Future[ExecutionHandle] = { + sendIncrementMetricsForReferenceFiles(referenceInputFilesOpt) + dockerImageCacheTestingParamsOpt.foreach { 
dockerImageCacheTestingParams => + sendIncrementMetricsForDockerImageCache( + dockerImageCacheTestingParams.dockerImageCacheDiskOpt, + dockerImageCacheTestingParams.dockerImageAsSpecifiedByUser, + dockerImageCacheTestingParams.isDockerImageCacheUsageRequested + ) + } + + if (preemptible == shouldBePreemptible) Future.successful(handle) + else Future.failed(new Exception(s"Test expected preemptible to be $shouldBePreemptible but got $preemptible")) + } + } + + system.actorOf(Props(new ExecuteOrRecoverActor), "ExecuteOrRecoverActor-" + UUID.randomUUID) + } + + def runAndFail(previousPreemptions: Int, previousUnexpectedRetries: Int, preemptible: Int, errorCode: Status, innerErrorMessage: String, expectPreemptible: Boolean): BackendJobExecutionResponse = { + + val runStatus: RunStatus = RunStatus.Failed(List.empty) + // val runStatus = UnsuccessfulRunStatus(errorCode, Option(innerErrorMessage), Seq.empty, Option("fakeMachine"), Option("fakeZone"), Option("fakeInstance"), expectPreemptible) + val statusPoller = TestProbe("statusPoller") + + val promise = Promise[BackendJobExecutionResponse]() + val jobDescriptor = buildPreemptibleJobDescriptor(preemptible, previousPreemptions, previousUnexpectedRetries) + + // TODO: Use this to check the new KV entries are there! From PAPI + //val kvProbe = TestProbe("kvProbe") + + val backend = executionActor(jobDescriptor, promise, statusPoller.ref, expectPreemptible) + backend ! Execute + statusPoller.expectMsgPF(max = Timeout, hint = "awaiting status poll") { + case GcpBatchBackendSingletonActor.Action.QueryJob(jobName) => + println(s"Message received to query job: $jobName") + val internalStatus = runStatus match { + case RunStatus.Failed(_) => com.google.cloud.batch.v1.JobStatus.State.FAILED + case RunStatus.Succeeded(_) => com.google.cloud.batch.v1.JobStatus.State.SUCCEEDED + case RunStatus.Running => com.google.cloud.batch.v1.JobStatus.State.RUNNING + case RunStatus.DeletionInProgress => com.google.cloud.batch.v1.JobStatus.State.DELETION_IN_PROGRESS + case RunStatus.StateUnspecified => com.google.cloud.batch.v1.JobStatus.State.STATE_UNSPECIFIED + case RunStatus.Unrecognized => com.google.cloud.batch.v1.JobStatus.State.UNRECOGNIZED + } + + backend ! 
GcpBatchBackendSingletonActor.Event.JobStatusRetrieved(Job.newBuilder.setStatus(com.google.cloud.batch.v1.JobStatus.newBuilder.setState(internalStatus).build()).build()) + } + + Await.result(promise.future, Timeout) + } + + def buildPreemptibleTestActorRef(attempt: Int, preemptible: Int, failedRetriesCountOpt: Option[Int] = None): TestActorRef[TestableGcpBatchJobExecutionActor] = { + // For this test we say that all previous attempts were preempted: + val jobDescriptor = buildPreemptibleJobDescriptor(preemptible, attempt - 1, previousUnexpectedRetries = 0, failedRetriesCountOpt = failedRetriesCountOpt) + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), + gcpBatchConfiguration, + TestableGcpBatchExpressionFunctions, + emptyActor, + failIoActor)) + TestActorRef(props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + } + + behavior of "GcpBatchAsyncBackendJobExecutionActor" + + private val timeout = 25 seconds + + it should "group files by bucket" in { + + def makeInput(bucket: String, name: String): GcpBatchFileInput = { + val mockCloudPath = mock[cromwell.core.path.Path] + when(mockCloudPath.pathAsString) thenReturn s"gs://$bucket/$name" + + GcpBatchFileInput( + name = name, + cloudPath = mockCloudPath, + relativeHostPath = DefaultPathBuilder.build(Paths.get(s"$bucket/$name")), + mount = null + ) + } + + val inputs: List[GcpBatchFileInput] = List( + ("foo", "file1"), + ("foo", "file2"), + ("bar", "file1"), + ("bar", "file2"), + ("bar", "file3"), + ("baz", "file1") + ) map (makeInput _).tupled.apply + + val expected = + Map("foo" -> (NonEmptyList.of(0, 1) map inputs.apply)) ++ + Map("bar" -> (NonEmptyList.of(2, 3, 4) map inputs.apply)) ++ + Map("baz" -> NonEmptyList.of(inputs(5))) + + GcpBatchAsyncBackendJobExecutionActor.groupParametersByGcsBucket(inputs) shouldEqual expected + } + + it should "send proper value for \"number of reference files used gauge\" metric, or don't send anything if reference disks feature is disabled" in { + + val expectedInput1 = GcpBatchFileInput(name = "testfile1", relativeHostPath = DefaultPathBuilder.build(Paths.get(s"test/reference/path/file1")), mount = null, cloudPath = null) + val expectedInput2 = GcpBatchFileInput(name = "testfile2", relativeHostPath = DefaultPathBuilder.build(Paths.get(s"test/reference/path/file2")), mount = null, cloudPath = null) + val expectedReferenceInputFiles = Set[GcpBatchInput](expectedInput1, expectedInput2) + + val expectedMsg1 = InstrumentationServiceMessage(CromwellIncrement(CromwellBucket(List.empty, NonEmptyList.of("referencefiles", expectedInput1.relativeHostPath.pathAsString)))) + val expectedMsg2 = InstrumentationServiceMessage(CromwellIncrement(CromwellBucket(List.empty, NonEmptyList.of("referencefiles", expectedInput2.relativeHostPath.pathAsString)))) + + val jobDescriptor = buildPreemptibleJobDescriptor(0, 0, 0) + val serviceRegistryProbe = TestProbe() + + val backend1 = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + referenceInputFilesOpt = Option(expectedReferenceInputFiles) + ) + backend1 ! Execute + serviceRegistryProbe.expectMsgAllOf(expectedMsg1, expectedMsg2) + + val backend2 = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + referenceInputFilesOpt = None + ) + backend2 ! 
Execute + serviceRegistryProbe.expectNoMessage(timeout) + } + + it should "sends proper metrics for docker image cache feature" in { + + val jobDescriptor = buildPreemptibleJobDescriptor(0, 0, 0) + val serviceRegistryProbe = TestProbe() + val madeUpDockerImageName = "test_madeup_docker_image_name" + + val expectedMessageWhenRequestedNotFound = InstrumentationServiceMessage(CromwellIncrement(CromwellBucket(List.empty, NonEmptyList("docker", List("image", "cache", "image_not_in_cache", madeUpDockerImageName))))) + val backendDockerCacheRequestedButNotFound = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + dockerImageCacheTestingParamsOpt = + Option( + DockerImageCacheTestingParameters( + None, + "test_madeup_docker_image_name", + isDockerImageCacheUsageRequested = true + ) + ) + ) + backendDockerCacheRequestedButNotFound ! Execute + serviceRegistryProbe.expectMsg(expectedMessageWhenRequestedNotFound) + + val expectedMessageWhenRequestedAndFound = InstrumentationServiceMessage(CromwellIncrement(CromwellBucket(List.empty, NonEmptyList("docker", List("image", "cache", "used_image_from_cache", madeUpDockerImageName))))) + val backendDockerCacheRequestedAndFound = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + dockerImageCacheTestingParamsOpt = + Option( + DockerImageCacheTestingParameters( + Option("test_madeup_disk_image_name"), + "test_madeup_docker_image_name", + isDockerImageCacheUsageRequested = true + ) + ) + ) + backendDockerCacheRequestedAndFound ! Execute + serviceRegistryProbe.expectMsg(expectedMessageWhenRequestedAndFound) + + val expectedMessageWhenNotRequestedButFound = InstrumentationServiceMessage(CromwellIncrement(CromwellBucket(List.empty, NonEmptyList("docker", List("image", "cache", "cached_image_not_used", madeUpDockerImageName))))) + val backendDockerCacheNotRequestedButFound = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + dockerImageCacheTestingParamsOpt = + Option( + DockerImageCacheTestingParameters( + Option("test_madeup_disk_image_name"), + "test_madeup_docker_image_name", + isDockerImageCacheUsageRequested = false + ) + ) + ) + backendDockerCacheNotRequestedButFound ! Execute + serviceRegistryProbe.expectMsg(expectedMessageWhenNotRequestedButFound) + + val backendDockerCacheNotRequestedNotFound = executionActor( + jobDescriptor, + Promise[BackendJobExecutionResponse](), + TestProbe().ref, + shouldBePreemptible = false, + serviceRegistryActor = serviceRegistryProbe.ref, + dockerImageCacheTestingParamsOpt = + Option( + DockerImageCacheTestingParameters( + None, + "test_madeup_docker_image_name", + isDockerImageCacheUsageRequested = false + ) + ) + ) + backendDockerCacheNotRequestedNotFound ! 
Execute + serviceRegistryProbe.expectNoMessage(timeout) + } + + it should "not restart 2 of 1 unexpected shutdowns without another preemptible VM" in { + + val actorRef = buildPreemptibleTestActorRef(2, 1) + val batchBackend = actorRef.underlyingActor + val runId = generateStandardAsyncJob + val handle = new GcpBatchPendingExecutionHandle(null, runId, None, None) + + val failedStatus = RunStatus.Failed(List.empty) + val executionResult = batchBackend.handleExecutionResult(failedStatus, handle) + val result = Await.result(executionResult, timeout) + result.isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + val failedHandle = result.asInstanceOf[FailedNonRetryableExecutionHandle] + failedHandle.returnCode shouldBe None + } + + it should "handle Failure Status for various errors" in { + + val actorRef = buildPreemptibleTestActorRef(1, 1) + val batchBackend = actorRef.underlyingActor + val runId = generateStandardAsyncJob + val handle = new GcpBatchPendingExecutionHandle(null, runId, None, None) + + def checkFailedResult(errorCode: Status, errorMessage: Option[String]): ExecutionHandle = { + val failed = RunStatus.Failed(List.empty) + Await.result(batchBackend.handleExecutionResult(failed, handle), timeout) + } + + checkFailedResult(Status.ABORTED, Option("15: Other type of error.")) + .isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + checkFailedResult(Status.OUT_OF_RANGE, Option("14: Wrong errorCode.")).isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + checkFailedResult(Status.ABORTED, Option("Weird error message.")).isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + checkFailedResult(Status.ABORTED, Option("UnparsableInt: Even weirder error message.")) + .isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + checkFailedResult(Status.ABORTED, None).isInstanceOf[FailedNonRetryableExecutionHandle] shouldBe true + actorRef.stop() + } + + it should "map GCS paths and *only* GCS paths to local" in { + val wdlString = + s"""|workflow wf { + | call t + |} + | + |task t { + | String abc + | File lf + | File gcsf + | command {} + | runtime { docker: "ubuntu" } + |} + |""".stripMargin + + val stringKey = "wf.t.abc" + val stringVal = WomString("abc") + val localFileKey = "wf.t.lf" + val localFileVal = WomSingleFile("/blah/abc") + val gcsFileKey = "wf.t.gcsf" + val gcsFileVal = WomSingleFile("gs://blah/abc") + + val inputs: Map[String, WomValue] = Map( + stringKey -> stringVal, + localFileKey -> localFileVal, + gcsFileKey -> gcsFileVal + ) + + val wdlNamespace = WdlNamespaceWithWorkflow.load( + wdlString, + Seq.empty[Draft2ImportResolver], + ).get + val womWorkflow = wdlNamespace.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")) + wdlNamespace.toWomExecutable(Option(inputs.toJson.compactPrint), NoIoFunctionSet, strictValidation = true) match { + case Right(womExecutable) => + val wdlInputs = womExecutable.resolvedExecutableInputs.flatMap({case (port, v) => v.select[WomValue] map { port -> _ }}) + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId.randomId(), + womWorkflow, + wdlInputs, + NoOptions, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val call: CommandCallNode = workflowDescriptor.callable.graph.nodes.collectFirst({ case t: CommandCallNode => t }).get + val key = BackendJobDescriptorKey(call, None, 1) + val runtimeAttributes = makeRuntimeAttributes(call) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, 
runtimeAttributes, fqnWdlMapToDeclarationMap(inputs), NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + + def gcsPathToLocal(womValue: WomValue): WomValue = { + WomFileMapper.mapWomFiles(testActorRef.underlyingActor.mapCommandLineWomFile)(womValue).get + } + + val mappedInputs = jobDescriptor.localInputs safeMapValues gcsPathToLocal + + mappedInputs(stringKey) match { + case WomString(v) => assert(v.equalsIgnoreCase(stringVal.value)) + case _ => fail("test setup error") + } + + mappedInputs(localFileKey) match { + case wdlFile: WomSingleFile => assert(wdlFile.value.equalsIgnoreCase(localFileVal.value)) + case _ => fail("test setup error") + } + + mappedInputs(gcsFileKey) match { + case wdlFile: WomSingleFile => wdlFile.value shouldBe "/mnt/disks/cromwell_root/blah/abc" + case _ => fail("test setup error") + } + case Left(badtimes) => fail(badtimes.toList.mkString(", ")) + } + } + + it should "create a GcpBatchFileInput for the monitoring script, when specified" in { + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId.randomId(), + WdlNamespaceWithWorkflow.load(SampleWdl.EmptyString.asWorkflowSources(DockerAndDiskRuntime).workflowSource.get, + Seq.empty[Draft2ImportResolver]).get.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")), + Map.empty, + WorkflowOptions.fromJsonString("""{"monitoring_script": "gs://path/to/script"}""").get, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val job: CommandCallNode = workflowDescriptor.callable.taskCallNodes.head + val runtimeAttributes = makeRuntimeAttributes(job) + val key = BackendJobDescriptorKey(job, None, 1) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, Map.empty, NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + testActorRef.underlyingActor.monitoringScript shouldBe + Option(GcpBatchFileInput("monitoring-in", gcsPath("gs://path/to/script"), DefaultPathBuilder.get("monitoring.sh"), workingDisk)) + } + + it should "not create a GcpBatchFileInput for the monitoring script, when not specified" in { + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId.randomId(), + WdlNamespaceWithWorkflow.load(SampleWdl.EmptyString.asWorkflowSources(DockerAndDiskRuntime).workflowSource.get, + Seq.empty[Draft2ImportResolver]).get.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")), + Map.empty, + NoOptions, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val job: CommandCallNode = workflowDescriptor.callable.graph.nodes.collectFirst({case t: CommandCallNode => t}).get + val key = BackendJobDescriptorKey(job, None, 1) + val runtimeAttributes = makeRuntimeAttributes(job) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, Map.empty, NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = 
TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + testActorRef.underlyingActor.monitoringScript shouldBe None + } + + it should "return GCP Batch log paths for non-scattered call" in { + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId(UUID.fromString("e6236763-c518-41d0-9688-432549a8bf7c")), + WdlNamespaceWithWorkflow.load( + SampleWdl.HelloWorld.asWorkflowSources(""" runtime {docker: "ubuntu:latest"} """).workflowSource.get, + Seq.empty[Draft2ImportResolver]).get.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")), + Map.empty, + WorkflowOptions.fromJsonString(s""" {"${GcpBatchWorkflowPaths.GcsRootOptionKey}": "gs://path/to/gcs_root"} """).get, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val call: CommandCallNode = workflowDescriptor.callable.taskCallNodes.find(_.localName == "hello").get + val key = BackendJobDescriptorKey(call, None, 1) + val runtimeAttributes = makeRuntimeAttributes(call) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, Map.empty, NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + val batchBackend = testActorRef.underlyingActor + + batchBackend.gcpBatchCallPaths.stdout should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.stdout.pathAsString shouldBe + "gs://path/to/gcs_root/wf_hello/e6236763-c518-41d0-9688-432549a8bf7c/call-hello/stdout" + batchBackend.gcpBatchCallPaths.stderr should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.stderr.pathAsString shouldBe + "gs://path/to/gcs_root/wf_hello/e6236763-c518-41d0-9688-432549a8bf7c/call-hello/stderr" + batchBackend.gcpBatchCallPaths.batchLogPath should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.batchLogPath.pathAsString shouldBe + "gs://path/to/gcs_root/wf_hello/e6236763-c518-41d0-9688-432549a8bf7c/call-hello/hello.log" + } + + it should "return Batch log paths for scattered call" in { + + val workflowDescriptor = BackendWorkflowDescriptor( + WorkflowId(UUID.fromString("e6236763-c518-41d0-9688-432549a8bf7d")), + WdlNamespaceWithWorkflow.load( + new SampleWdl.ScatterWdl().asWorkflowSources(""" runtime {docker: "ubuntu:latest"} """).workflowSource.get, + Seq.empty[Draft2ImportResolver]).get.workflow.toWomWorkflowDefinition(isASubworkflow = false).getOrElse(fail("failed to get WomDefinition from WdlWorkflow")), + Map.empty, + WorkflowOptions.fromJsonString(s""" {"${GcpBatchWorkflowPaths.GcsRootOptionKey}": "gs://path/to/gcs_root"} """).get, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val call: CommandCallNode = workflowDescriptor.callable.taskCallNodes.find(_.localName == "B").get + val key = BackendJobDescriptorKey(call, Option(2), 1) + val runtimeAttributes = makeRuntimeAttributes(call) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, Map.empty, NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + val batchBackend = testActorRef.underlyingActor + + 
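+      // For a scattered call the shard index appears both in the call directory ("shard-2") and in the Batch log file name ("B-2.log"), as the assertions below verify.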
batchBackend.gcpBatchCallPaths.stdout should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.stdout.pathAsString shouldBe + "gs://path/to/gcs_root/w/e6236763-c518-41d0-9688-432549a8bf7d/call-B/shard-2/stdout" + batchBackend.gcpBatchCallPaths.stderr should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.stderr.pathAsString shouldBe + "gs://path/to/gcs_root/w/e6236763-c518-41d0-9688-432549a8bf7d/call-B/shard-2/stderr" + batchBackend.gcpBatchCallPaths.batchLogPath should be(a[GcsPath]) + batchBackend.gcpBatchCallPaths.batchLogPath.pathAsString shouldBe + "gs://path/to/gcs_root/w/e6236763-c518-41d0-9688-432549a8bf7d/call-B/shard-2/B-2.log" + } + + it should "return preemptible = true only in the correct cases" in { + + def attempt(max: Int, attempt: Int): GcpBatchAsyncBackendJobExecutionActor = { + buildPreemptibleTestActorRef(attempt, max).underlyingActor + } + def attempt1(max: Int) = attempt(max, 1) + def attempt2(max: Int) = attempt(max, 2) + + val descriptorWithMax0AndKey1 = attempt1(max = 0) + descriptorWithMax0AndKey1.preemptible shouldBe false + + val descriptorWithMax1AndKey1 = attempt1(max = 1) + descriptorWithMax1AndKey1.preemptible shouldBe true + + val descriptorWithMax2AndKey1 = attempt1(max = 2) + descriptorWithMax2AndKey1.preemptible shouldBe true + + val descriptorWithMax1AndKey2 = attempt2(max = 1) + descriptorWithMax1AndKey2.preemptible shouldBe false + + val descriptorWithMax2AndKey2 = attempt2(max = 2) + descriptorWithMax2AndKey2.preemptible shouldBe true + } + + it should "return the project from the workflow options in the start metadata" in { + + val googleProject = "baa-ram-ewe" + val batchGcsRoot = "gs://anorexic/duck" + val workflowId = WorkflowId.randomId() + val workflowDescriptor = BackendWorkflowDescriptor( + workflowId, + WdlNamespaceWithWorkflow + .load( + SampleWdl.EmptyString.asWorkflowSources(DockerAndDiskRuntime).workflowSource.get, + Seq.empty[Draft2ImportResolver] + ) + .get + .workflow + .toWomWorkflowDefinition(isASubworkflow = false) + .getOrElse(fail("failed to get WomDefinition from WdlWorkflow")), + Map.empty, + WorkflowOptions.fromJsonString( + s"""|{ + | "google_project": "$googleProject", + | "${GcpBatchWorkflowPaths.GcsRootOptionKey}": "$batchGcsRoot" + |} + |""".stripMargin + ).get, + Labels.empty, + HogGroup("foo"), + List.empty, + None + ) + + val call: CommandCallNode = workflowDescriptor.callable.taskCallNodes.find(_.localName == "goodbye").get + val key = BackendJobDescriptorKey(call, None, 1) + val runtimeAttributes = makeRuntimeAttributes(call) + val jobDescriptor = BackendJobDescriptor(workflowDescriptor, key, runtimeAttributes, Map.empty, NoDocker, None, Map.empty) + + val props = Props(new TestableGcpBatchJobExecutionActor(jobDescriptor, Promise(), gcpBatchConfiguration)) + val testActorRef = TestActorRef[TestableGcpBatchJobExecutionActor]( + props, s"TestableGcpBatchJobExecutionActor-${jobDescriptor.workflowDescriptor.id}") + + val batchBackend = testActorRef.underlyingActor + + // NOTE: The commented-out entries are not yet provided by the Batch backend; we still need to confirm whether they are required. + val actual = batchBackend.startMetadataKeyValues.safeMapValues(_.toString) + actual should be( + Map( + // "backendLogs:log" -> s"$batchGcsRoot/wf_hello/$workflowId/call-goodbye/goodbye.log", + "callRoot" -> s"$batchGcsRoot/wf_hello/$workflowId/call-goodbye", + "gcpBatch:executionBucket" -> batchGcsRoot, + "gcpBatch:googleProject" -> googleProject, + "labels:cromwell-workflow-id" -> s"cromwell-$workflowId", + "labels:wdl-task-name" -> "goodbye", + 
"preemptible" -> "false", + "runtimeAttributes:bootDiskSizeGb" -> "10", + "runtimeAttributes:continueOnReturnCode" -> "0", + "runtimeAttributes:cpu" -> "1", + "runtimeAttributes:cpuMin" -> "1", + "runtimeAttributes:disks" -> "local-disk 200 SSD", + "runtimeAttributes:docker" -> "ubuntu:latest", + "runtimeAttributes:failOnStderr" -> "false", + "runtimeAttributes:memory" -> "2 GB", + "runtimeAttributes:memoryMin" -> "2 GB", + "runtimeAttributes:noAddress" -> "false", + "runtimeAttributes:preemptible" -> "0", + "runtimeAttributes:zones" -> "us-central1-b,us-central1-a", + "runtimeAttributes:maxRetries" -> "0", + "stderr" -> s"$batchGcsRoot/wf_hello/$workflowId/call-goodbye/stderr", + "stdout" -> s"$batchGcsRoot/wf_hello/$workflowId/call-goodbye/stdout" + ) + ) + + } + + private def makeRuntimeAttributes(job: CommandCallNode) = { + val evaluatedAttributes = RuntimeAttributeDefinition.evaluateRuntimeAttributes(job.callable.runtimeAttributes, null, Map.empty) + RuntimeAttributeDefinition.addDefaultsToAttributes( + runtimeAttributesBuilder.definitions.toSet, NoOptions)(evaluatedAttributes.getOrElse(fail("Failed to evaluate runtime attributes"))) + } + + private def generateStandardAsyncJob = { + StandardAsyncJob(JobName.newBuilder().setJob(UUID.randomUUID().toString).setProject("test").setLocation("local").build().toString) + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala new file mode 100644 index 00000000000..d3bf3a7a3d1 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala @@ -0,0 +1,64 @@ +package cromwell.backend.google.batch.actors + +import cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory; +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes; +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.refineV +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +import scala.concurrent.duration._ +import scala.language.postfixOps + + +class GcpBatchBackendLifecycleActorFactorySpec extends AnyFlatSpecLike with Matchers with TableDrivenPropertyChecks { + + "GcpBatchBackendLifecycleActorFactory" should "robustly build configuration attributes" in { + + val attributes = new GcpBatchConfigurationAttributes( + project = "project", + computeServiceAccount = "computeServiceAccount", + auths = null, + restrictMetadataAccess = true, + dockerhubToken = "test", + enableFuse = true, + executionBucket = "executionBucket", + location = "location", + maxPollingInterval = 0, + qps = refineV[Positive](1).toOption.get, + cacheHitDuplicationStrategy = null, + requestWorkers = refineV[Positive](1).toOption.get, + batchTimeout = 1 second, + logFlushPeriod = Option(1 second), + gcsTransferConfiguration = null, + virtualPrivateCloudConfiguration = null, + batchRequestTimeoutConfiguration = null, + referenceFileToDiskImageMappingOpt = None, + dockerImageToCacheDiskImageMappingOpt = None, + checkpointingInterval = 1 second) + + GcpBatchBackendLifecycleActorFactory.robustBuildAttributes(() => attributes) shouldBe attributes + } + + { + // The message string actually observed during construction failure. 
+ val actualRetryMessage = s"We encountered an internal error. Please try again." + val fails = Table( + ("attempts", "description", "function"), + (3, "no exception message", () => throw new RuntimeException()), + (3, "exception message", () => throw new RuntimeException(actualRetryMessage)), + (1, "error not exception, no message", () => throw new Error()), + (1, "error not exception", () => throw new Error(actualRetryMessage)) + ) + forAll(fails) { (attempts, description, function) => + it should s"$description: make $attempts attribute creation attempts before giving up" in { + val e = the[RuntimeException] thrownBy { + GcpBatchBackendLifecycleActorFactory.robustBuildAttributes(function, initialIntervalMillis = 1, maxIntervalMillis = 5) + } + e.getMessage should startWith(s"Failed to build GcpBatchConfigurationAttributes on attempt $attempts of 3") + } + } + } + +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActorSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActorSpec.scala new file mode 100644 index 00000000000..c57902531c8 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchInitializationActorSpec.scala @@ -0,0 +1,185 @@ +package cromwell.backend.google.batch.actors + +import java.util.UUID +import akka.actor.Props +import akka.testkit._ +import com.typesafe.config.{Config, ConfigFactory} +import cromwell.backend.BackendWorkflowInitializationActor.{InitializationFailed, InitializationSuccess, Initialize} +import cromwell.backend.async.RuntimeAttributeValidationFailures +import cromwell.backend.google.batch.models.GcpBatchConfiguration +import cromwell.backend.google.batch.actors.GcpBatchInitializationActorSpec._ +import cromwell.backend.google.batch.models.GcpBatchTestConfig.{BatchGlobalConfig, googleConfiguration, batchAttributes} +import cromwell.backend.{BackendConfigurationDescriptor, BackendSpec, BackendWorkflowDescriptor} +import cromwell.core.Dispatcher.BackendDispatcher +import cromwell.core.TestKitSuite +import cromwell.core.filesystem.CromwellFileSystems +import cromwell.core.logging.LoggingTest._ +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import wom.graph.CommandCallNode + +import scala.concurrent.duration._ + +class GcpBatchInitializationActorSpec extends TestKitSuite with AnyFlatSpecLike with Matchers + with ImplicitSender { + val Timeout: FiniteDuration = 30.second.dilated + + import BackendSpec._ + + val HelloWorld: String = + s""" + |task hello { + | String addressee = "you" + | command { + | echo "Hello $${addressee}!" 
+ | } + | output { + | String salutation = read_string(stdout()) + | } + | + | RUNTIME + |} + | + |workflow wf_hello { + | call hello + |} + """.stripMargin + + private def getJesBackendProps(workflowDescriptor: BackendWorkflowDescriptor, + calls: Set[CommandCallNode], + jesConfiguration: GcpBatchConfiguration): Props = { + val ioActor = mockIoActor + val params = GcpBatchInitializationActorParams(workflowDescriptor, ioActor, calls, jesConfiguration, emptyActor, restarting = false) + Props(new GcpBatchInitializationActor(params)).withDispatcher(BackendDispatcher) + } + + private def getJesBackend(workflowDescriptor: BackendWorkflowDescriptor, calls: Set[CommandCallNode], conf: BackendConfigurationDescriptor) = { + val props = getJesBackendProps(workflowDescriptor, calls, new GcpBatchConfiguration(conf, googleConfiguration, batchAttributes)) + system.actorOf(props, "TestableJesInitializationActor-" + UUID.randomUUID) + } + + behavior of "GcpBatchInitializationActor" + + it should "log a warning message when there are unsupported runtime attributes" in { + + within(Timeout) { + val workflowDescriptor = buildWdlWorkflowDescriptor(HelloWorld, + runtime = """runtime { docker: "ubuntu/latest" test: true }""") + val backend = getJesBackend(workflowDescriptor, workflowDescriptor.callable.taskCallNodes, + defaultBackendConfig) + val eventPattern = + "Key/s [test] is/are not supported by backend. Unsupported attributes will not be part of job executions." + EventFilter.warning(pattern = escapePattern(eventPattern), occurrences = 1) intercept { + backend ! Initialize + } + expectMsgPF() { + case InitializationSuccess(_) => //Docker entry is present. + case InitializationFailed(failure) => fail(s"InitializationSuccess was expected but got $failure") + } + } + } + + it should "return InitializationFailed when docker runtime attribute key is not present" in { + within(Timeout) { + val workflowDescriptor = buildWdlWorkflowDescriptor(HelloWorld, runtime = """runtime { }""") + val backend = getJesBackend(workflowDescriptor, workflowDescriptor.callable.taskCallNodes, + defaultBackendConfig) + backend ! Initialize + expectMsgPF() { + case InitializationFailed(failure) => + failure match { + case exception: RuntimeAttributeValidationFailures => + if (!exception.getMessage.equals("Runtime validation failed:\nTask hello has an invalid runtime attribute docker = !! NOT FOUND !!")) + fail("Exception message is not equal to 'Runtime validation failed:\nTask hello has an invalid runtime attribute docker = !! NOT FOUND !!'.") + } + } + } + } +} + +object GcpBatchInitializationActorSpec { + val globalConfig: Config = ConfigFactory.parseString( + """ + |google { + | + | application-name = "cromwell" + | + | auths = [ + | { + | name = "application-default" + | scheme = "mock" + | } + | ] + |} + |""".stripMargin) + + val backendConfigTemplate: String = + """ + | // Google project + | project = "my-cromwell-workflows" + | + | // Base bucket for workflow executions + | root = "gs://my-cromwell-workflows-bucket" + | + | // Polling for completion backs-off gradually for slower-running jobs. + | // This is the maximum polling interval (in seconds): + | maximum-polling-interval = 600 + | + | genomics { + | // A reference to an auth defined in the `google` stanza at the top. This auth is used to create + | // Pipelines and manipulate auth JSONs. + | auth = "application-default" + | // Endpoint for APIs, no reason to change this unless directed by Google. 
+ | endpoint-url = "https://genomics.googleapis.com/" + | } + | + | default-runtime-attributes { + | cpu: 1 + | failOnStderr: false + | # Allowed to be a boolean, or a list of Ints, or an Int + | continueOnReturnCode: 0 + | memory: "2 GB" + | bootDiskSizeGb: 10 + | # Allowed to be a String, or a list of Strings + | disks: "local-disk 10 SSD" + | noAddress: false + | preemptible: 0 + | zones: ["us-central1-a", "us-central1-b"] + | } + | filesystems { + | gcs { + | // A reference to a potentially different auth for manipulating files via engine functions. + | auth = "application-default" + | } + | } + | + |[VPCCONFIG] + | + |[DOCKERHUBCONFIG] + |""".stripMargin + + val backendConfig: Config = ConfigFactory.parseString(backendConfigTemplate.replace("[VPCCONFIG]", "").replace("[DOCKERHUBCONFIG]", "")) + + val dockerBackendConfig: Config = ConfigFactory.parseString(backendConfigTemplate.replace("[VPCCONFIG]", "").replace("[DOCKERHUBCONFIG]", + """ + |dockerhub { + | account = "my@docker.account" + | # no secrets here guys this is just `echo -n username:password | base64` + | token = "dXNlcm5hbWU6cGFzc3dvcmQ=" + |} + | """.stripMargin)) + + val vpcBackendConfig: Config = ConfigFactory.parseString(backendConfigTemplate.replace("[DOCKERHUBCONFIG]", "").replace("[VPCCONFIG]", + """ + |virtual-private-cloud { + | network-label-key = "cromwell-ci-network" + | subnetwork-label-key = "cromwell-ci-subnetwork" + | auth = "service_account" + |} + | """.stripMargin)) + + private val defaultBackendConfig = new BackendConfigurationDescriptor(backendConfig, globalConfig) { + override private[backend] lazy val cromwellFileSystems = new CromwellFileSystems(BatchGlobalConfig) + } +} + diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDiskSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDiskSpec.scala new file mode 100644 index 00000000000..a168743db8f --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/io/GcpBatchAttachedDiskSpec.scala @@ -0,0 +1,47 @@ +package cromwell.backend.google.batch.io + +import common.assertion.CromwellTimeoutSpec +import cromwell.core.path.DefaultPathBuilder +import org.scalatest.TryValues +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks._ +import org.scalatest.prop.Tables.Table + +import scala.util.Failure + +class GcpBatchAttachedDiskSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers with TryValues { + val validTable = Table( + ("unparsed", "parsed"), + ("/mnt 3 SSD", PipelinesApiEmptyMountedDisk(DiskType.SSD, 3, DefaultPathBuilder.get("/mnt"))), + ("/mnt/my_path 10 HDD", PipelinesApiEmptyMountedDisk(DiskType.HDD, 10, DefaultPathBuilder.get("/mnt/my_path"))), + ("local-disk 100 SSD", GcpBatchWorkingDisk(DiskType.SSD, 100)), + ("local-disk 100 LOCAL", GcpBatchWorkingDisk(DiskType.LOCAL, 100)) + ) + + it should "parse" in { + forAll(validTable) { (unparsed, parsed) => + GcpBatchAttachedDisk.parse(unparsed).get shouldEqual parsed + } + } + + it should "stringify" in { + forAll(validTable) { (unparsed, parsed) => + parsed.toString shouldEqual unparsed + } + } + + val invalidTable = Table( + "unparsed", + "local-disk BAD HDD", + "local-disk 10 BAD", + "BAD 100 SSD", + "foobar" + ) + + it should "reject malformed disk mounts" in { + forAll(invalidTable) { (unparsed) => + GcpBatchAttachedDisk.parse(unparsed) should 
be(a[Failure[_]]) + } + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchAttributeSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchAttributeSpec.scala new file mode 100644 index 00000000000..6743962a4e0 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchAttributeSpec.scala @@ -0,0 +1,80 @@ +package cromwell.backend.google.batch.models + +import cromwell.backend.google.batch.models.GpuResource.GpuType +import cromwell.backend.google.batch.models.GcpBatchTestConfig.gcpBatchConfiguration +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpecLike +import wom.values.{WomFloat, WomInteger, WomSingleFile, WomString, WomValue} + +class GcpBatchGpuAttributesSpec + extends AnyWordSpecLike + with Matchers + with GcpBatchRuntimeAttributesSpecsMixin { + + val validGpuTypes = List( + (Option(WomString("nvidia-tesla-k80")), Option(GpuType.NVIDIATeslaK80)), + (Option(WomString("nvidia-tesla-p100")), Option( GpuType.NVIDIATeslaP100)), + (Option(WomString("custom-gpu-24601")), Option( GpuType("custom-gpu-24601"))), + (None, None)) + val invalidGpuTypes = List( + WomSingleFile("nvidia-tesla-k80"), + WomInteger(100)) + + val validGpuCounts = List( + (Option(WomInteger(1)), Option(1)), + (Option(WomInteger(100)), Option(100)), + (None, None) + ) + val invalidGpuCounts = List( + WomString("ten"), + WomFloat(1.0)) + + validGpuTypes foreach { case (validGpuType, expectedGpuTypeValue) => + validGpuCounts foreach { case (validGpuCount, expectedGpuCountValue) => + s"validate the valid gpu type '$validGpuType' and count '$validGpuCount'" in { + val runtimeAttributes = Map( + "docker" -> WomString("ubuntu:latest") + ) ++ validGpuType.map(t => "gpuType" -> t) ++ validGpuCount.map(c => "gpuCount" -> c) + + val actualRuntimeAttributes = toBatchRuntimeAttributes(runtimeAttributes, emptyWorkflowOptions, gcpBatchConfiguration) + + expectedGpuTypeValue match { + case Some(v) => actualRuntimeAttributes.gpuResource.exists(_.gpuType == v) + case None => actualRuntimeAttributes.gpuResource.foreach(_.gpuType == GpuType.DefaultGpuType) + } + + expectedGpuCountValue match { + case Some(v) => actualRuntimeAttributes.gpuResource.exists(_.gpuCount.value == v) + case None => actualRuntimeAttributes.gpuResource.foreach(_.gpuCount.value == GpuType.DefaultGpuCount.value) + } + + } + } + + invalidGpuCounts foreach { invalidGpuCount => + s"not validate a valid gpu type '$validGpuType' but an invalid gpu count '$invalidGpuCount'" in { + val runtimeAttributes: Map[String, WomValue] = Map( + "docker" -> WomString("ubuntu:latest") + ) ++ validGpuType.map(t => "gpuType" -> t) + ("gpuCount" -> invalidGpuCount) + + assertBatchRuntimeAttributesFailedCreation( + runtimeAttributes, + s"Invalid gpu count. Expected positive Int but got") + } + } + } + + invalidGpuTypes foreach { invalidGpuType => + invalidGpuCounts foreach { invalidGpuCount => + s"not validate a invalid gpu type '$invalidGpuType' and invalid gpu count '$invalidGpuCount'" in { + val runtimeAttributes: Map[String, WomValue] = Map( + "docker" -> WomString("ubuntu:latest") + ) + ("gpuType" -> invalidGpuType) + ("gpuCount" -> invalidGpuCount) + + assertBatchRuntimeAttributesFailedCreation( + runtimeAttributes, + s"Invalid gpu count. 
Expected positive Int but got") + } + } + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributesSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributesSpec.scala new file mode 100644 index 00000000000..d914e9f2978 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributesSpec.scala @@ -0,0 +1,469 @@ +package cromwell.backend.google.batch.models + +import cats.data.Validated.{Invalid, Valid} +import cats.syntax.validated._ +import com.typesafe.config.{Config, ConfigFactory} +import common.assertion.CromwellTimeoutSpec +import common.exception.MessageAggregation +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes._ +import cromwell.backend.google.batch.models.GcpBatchTestConfig.BatchGlobalConfig +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.cloudsupport.gcp.auth.MockAuthMode +import cromwell.filesystems.gcs.GcsPathBuilder +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +//import java.net.URL +import scala.concurrent.duration._ + +class GcpBatchConfigurationAttributesSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers + with TableDrivenPropertyChecks { + + behavior of "GcpBatchAttributes" + + val googleConfig: GoogleConfiguration = GoogleConfiguration(BatchGlobalConfig) + val runtimeConfig: Config = ConfigFactory.load() + + it should "parse correct Batch config" in { + + val backendConfig = ConfigFactory.parseString(configString()) + println(backendConfig) + + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + println(gcpBatchAttributes) + gcpBatchAttributes.project should be("myProject") + gcpBatchAttributes.executionBucket should be("gs://myBucket") + gcpBatchAttributes.maxPollingInterval should be(600) + gcpBatchAttributes.computeServiceAccount should be("default") + gcpBatchAttributes.restrictMetadataAccess should be(false) + gcpBatchAttributes.referenceFileToDiskImageMappingOpt.isEmpty should be(true) + } + + it should "parse correct preemptible config" in { + + val backendConfig = ConfigFactory.parseString(configString(customContent = "preemptible = 3")) + + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + gcpBatchAttributes.project should be("myProject") + gcpBatchAttributes.executionBucket should be("gs://myBucket") + gcpBatchAttributes.maxPollingInterval should be(600) + } + + it should "parse batch-requests.timeouts values correctly" in { + + val customContent = + """ + |batch-requests { + | timeouts { + | read = 100 hours + | connect = 10 seconds + | } + |} + """.stripMargin + + val backendConfig = ConfigFactory.parseString(configString(customContent = customContent)) + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + + gcpBatchAttributes.batchRequestTimeoutConfiguration.readTimeoutMillis.get.value should be(100.hours.toMillis.toInt) + gcpBatchAttributes.batchRequestTimeoutConfiguration.connectTimeoutMillis.get.value should be(10.seconds.toMillis.toInt) + } + + it should "parse an empty batch-requests.timeouts section correctly" in { + + val customContent = + """ + |batch-requests { + | timeouts { + | # read = 100 hours + | # connect = 
10 seconds + | } + |} + """.stripMargin + + val backendConfig = ConfigFactory.parseString(configString(customContent = customContent)) + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + + gcpBatchAttributes.batchRequestTimeoutConfiguration should be(BatchRequestTimeoutConfiguration(None, None)) + } + + it should "parse batch-timeout" in { + + val backendConfig = ConfigFactory.parseString(configString(customContent = "batch-timeout = 3 days")) + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + + gcpBatchAttributes.batchTimeout should be(3.days) + } + + it should "parse an undefined batch-timeout" in { + + val backendConfig = ConfigFactory.parseString(configString()) + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + + gcpBatchAttributes.batchTimeout should be(7.days) + } + + it should "parse compute service account" in { + + val backendConfig = ConfigFactory.parseString(configString(genomics = """compute-service-account = "testing" """)) + + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + gcpBatchAttributes.computeServiceAccount should be("testing") + } + + it should "parse restrict-metadata-access" in { + + val backendConfig = ConfigFactory.parseString(configString(genomics = "restrict-metadata-access = true")) + + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + gcpBatchAttributes.restrictMetadataAccess should be(true) + + } + + it should "parse localization-attempts" in { + + val backendConfig = ConfigFactory.parseString(configString(genomics = "localization-attempts = 31380")) + + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + gcpBatchAttributes.gcsTransferConfiguration.transferAttempts.value should be(31380) + } + + private val mockAuth = MockAuthMode("mock") + + private val validVpcConfigTests = Table( + ("description", "customConfig", "vpcConfig"), + ("empty body", "virtual-private-cloud {}", VirtualPrivateCloudConfiguration(None, None)), + ( + "labels config", + """virtual-private-cloud { + | network-label-key = my-network + | subnetwork-label-key = my-subnetwork + | auth = mock + |} + |""".stripMargin, + VirtualPrivateCloudConfiguration( + Option(VirtualPrivateCloudLabels("my-network", Option("my-subnetwork"), mockAuth)), + None, + ), + ), + ( + "labels config without subnetwork key", + """virtual-private-cloud { + | network-label-key = my-network + | auth = mock + |} + |""".stripMargin, + VirtualPrivateCloudConfiguration( + Option(VirtualPrivateCloudLabels("my-network", None, mockAuth)), + None, + ), + ), + ( + "literal config", + """virtual-private-cloud { + | network-name = my-network + | subnetwork-name = my-subnetwork + |} + |""".stripMargin, + VirtualPrivateCloudConfiguration( + None, + Option(VirtualPrivateCloudLiterals("my-network", Option("my-subnetwork"))), + ), + ), + ( + "literal config without subnetwork name", + """virtual-private-cloud { + | network-name = my-network + |} + |""".stripMargin, + VirtualPrivateCloudConfiguration( + None, + Option(VirtualPrivateCloudLiterals("my-network", None)), + ), + ), + ) + + private val invalidVPCConfigTests = Table( + ("description", "customConfig", "messages"), + ( + "without auth", + """virtual-private-cloud { + | network-label-key = my-network + |} + |""".stripMargin, + List("Virtual Private Cloud configuration is invalid. 
Missing keys: `auth`."), + ), + ( + "without network label-key", + """virtual-private-cloud { + | auth = mock + |} + |""".stripMargin, + List("Virtual Private Cloud configuration is invalid. Missing keys: `network-label-key`."), + ), + ( + "with just a subnetwork label key", + """virtual-private-cloud { + | subnetwork-label-key = my-subnetwork + |} + |""".stripMargin, + List("Virtual Private Cloud configuration is invalid. Missing keys: `network-label-key,auth`."), + ), + ( + "with subnetwork label network key and auth", + """ virtual-private-cloud { + | subnetwork-label-key = my-subnetwork + | auth = mock + | } + |""".stripMargin, + List("Virtual Private Cloud configuration is invalid. Missing keys: `network-label-key`."), + ), + ) + + forAll(validVpcConfigTests) { (description, customConfig, vpcConfig) => + it should s"parse virtual-private-cloud $description" in { + + val backendConfig = ConfigFactory.parseString(configString(customConfig)) + val gcpBatchAttributes = GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + gcpBatchAttributes.virtualPrivateCloudConfiguration should be(vpcConfig) + } + } + + forAll(invalidVPCConfigTests) { (description, customConfig, errorMessages) => + it should s"not parse invalid virtual-private-cloud config $description" in { + + val backendConfig = ConfigFactory.parseString(configString(customConfig)) + val exception = intercept[IllegalArgumentException with MessageAggregation] { + GcpBatchConfigurationAttributes(googleConfig, backendConfig, "batch") + } + exception.errorMessages.toList should be(errorMessages) + } + } + + it should "not parse invalid config" in { + + val nakedConfig = + ConfigFactory.parseString( + """ + |{ + | genomics { + | + | } + |} + """.stripMargin) + + val exception = intercept[IllegalArgumentException with MessageAggregation] { + GcpBatchConfigurationAttributes(googleConfig, nakedConfig, "batch") + } + val errorsList = exception.errorMessages.toList + errorsList should contain("String: 2: No configuration setting found for key 'project'") + errorsList should contain("String: 2: No configuration setting found for key 'root'") + errorsList should contain("String: 3: No configuration setting found for key 'genomics.auth'") + errorsList should contain("String: 2: No configuration setting found for key 'filesystems'") + } + + def configString(customContent: String = "", genomics: String = ""): String = + s""" + |{ + | project = "myProject" + | root = "gs://myBucket" + | maximum-polling-interval = 600 + | $customContent + | genomics { + | // A reference to an auth defined in the `google` stanza at the top. This auth is used to create + | // Pipelines and manipulate auth JSONs. + | auth = "mock" + | $genomics + | endpoint-url = "http://myEndpoint" + | location = "us-central1" + | } + | + | filesystems = { + | gcs { + | // A reference to a potentially different auth for manipulating files via engine functions. 
+ | auth = "mock" + | } + | } + |} + | """.stripMargin + + it should "parse gsutil memory specifications" in { + + val valids = List("0", "150M", "14 PIBIT", "6kib") + + valids foreach { + case GcpBatchConfigurationAttributes.GsutilHumanBytes(_, _) => + case bad => fail(s"'$bad' was expected to be a valid gsutil memory specification") + } + } + + it should "reject invalid memory specifications" in { + + val invalids = List("-1", "150MB", "14PB") + + invalids foreach { + case invalid@GcpBatchConfigurationAttributes.GsutilHumanBytes(_, _) => fail(s"Memory specification $invalid not expected to be accepted") + case _ => + } + } + + it should "parse a missing \"reference-disk-localization-manifests\"" in { + + val backendConfig = ConfigFactory.parseString(configString()) + + val validation = GcpBatchConfigurationAttributes.validateReferenceDiskManifestConfigs(backendConfig, "batch") + + validation shouldBe None.validNel + } + + it should "parse a present but empty \"reference-disk-localization-manifests\"" in { + + val manifestConfig = "reference-disk-localization-manifests = []" + + val backendConfig = ConfigFactory.parseString(configString(customContent = manifestConfig)) + + val validation = GcpBatchConfigurationAttributes.validateReferenceDiskManifestConfigs(backendConfig, "batch") + + validation shouldBe Option(List.empty).validNel + } + + it should "parse a present and populated \"reference-disk-localization-manifests\"" in { + + // Highly abridged versions of hg19 and hg38 manifests just to test for correctness + // of parsing. + val manifestConfig = + """ + |reference-disk-localization-manifests = [ + |{ + | "imageIdentifier" : "hg19-public-2020-10-26", + | "diskSizeGb" : 10, + | "files" : [ { + | "path" : "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + | "crc32c" : 159565724 + | }, { + | "path" : "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict", + | "crc32c" : 1679459712 + | }] + |}, + |{ + | "imageIdentifier" : "hg38-public-2020-10-26", + | "diskSizeGb" : 20, + | "files" : [ { + | "path" : "gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + | "crc32c" : 930173616 + | }, { + | "path" : "gcp-public-data--broad-references/hg38/v0/exome_evaluation_regions.v1.interval_list", + | "crc32c" : 289077232 + | }] + |} + |] + |""".stripMargin + val backendConfig = ConfigFactory.parseString(configString(manifestConfig)) + val validation = GcpBatchConfigurationAttributes.validateReferenceDiskManifestConfigs(backendConfig, "batch") + val manifests: List[ManifestFile] = validation.toEither.toOption.get.get + + manifests shouldBe List( + ManifestFile( + imageIdentifier = "hg19-public-2020-10-26", + diskSizeGb = 10, + files = List( + ReferenceFile( + path = "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + crc32c = 159565724 + ), + ReferenceFile( + path = "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict", + crc32c = 1679459712 + ) + ) + ), + ManifestFile( + imageIdentifier = "hg38-public-2020-10-26", + diskSizeGb = 20, + files = List( + ReferenceFile( + path = "gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + crc32c = 930173616 + ), + ReferenceFile( + path = "gcp-public-data--broad-references/hg38/v0/exome_evaluation_regions.v1.interval_list", + crc32c = 289077232 + ) + ) + ) + ) + } + + it should "parse a present and invalid \"reference-disk-localization-manifests\"" in { + + val badValues = List( + 
"\"foo\"", + "{ foo: bar }", + s""" + |[{ + | # missing imageIdentifier + | "diskSizeGb" : 10, + | "files" : [ { + | "path" : "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + | "crc32c" : 159565724 + | }] + |}]""", + s""" + |[{ + | "imageIdentifier" : "hg19-public-2020-10-26", + | # missing diskSizeGb + | "files" : [ { + | "path" : "gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + | "crc32c" : 159565724 + | }] + |}]""", + s""" + |[{ + | "imageIdentifier" : "hg19-public-2020-10-26", + | "diskSizeGb" : 10, + | # missing files + |}]""", + ) + + badValues foreach { badValue => + val customContent = s""""reference-disk-localization-manifests" = $badValue""" + val backendConfig = ConfigFactory.parseString(configString(customContent)) + val validation = GcpBatchConfigurationAttributes.validateReferenceDiskManifestConfigs(backendConfig, "batch") + validation.isInvalid shouldBe true + } + } + + + it should "parse correct existing docker-image-cache-manifest-file config" in { + + val dockerImageCacheManifest1Path = "gs://bucket/manifest1.json" + val dockerImageCacheManifestConfigStr = s"""docker-image-cache-manifest-file = "$dockerImageCacheManifest1Path"""" + val backendConfig = ConfigFactory.parseString(configString(dockerImageCacheManifestConfigStr)) + + val validatedGcsPathToDockerImageCacheManifestFileErrorOr = GcpBatchConfigurationAttributes.validateGcsPathToDockerImageCacheManifestFile(backendConfig) + validatedGcsPathToDockerImageCacheManifestFileErrorOr match { + case Valid(validatedGcsPathToDockerImageCacheManifestFileOpt) => + validatedGcsPathToDockerImageCacheManifestFileOpt match { + case Some(validatedGcsPathToDockerCacheManifestFile) => + validatedGcsPathToDockerCacheManifestFile shouldBe GcsPathBuilder.validateGcsPath(dockerImageCacheManifest1Path) + case None => + fail("GCS paths to docker image cache manifest files, parsed from config, should not be empty") + } + case Invalid(ex) => + fail(s"Error while parsing GCS paths to docker image cache manifest files from config: $ex") + } + } + + it should "parse correct missing docker-image-cache-manifest-file config" in { + + val backendConfig = ConfigFactory.parseString(configString()) + + val validatedGcsPathsToDockerImageCacheManifestFilesErrorOr = GcpBatchConfigurationAttributes.validateReferenceDiskManifestConfigs(backendConfig, "unit-test-backend") + validatedGcsPathsToDockerImageCacheManifestFilesErrorOr match { + case Valid(validatedGcsPathsToDockerImageCacheManifestFilesOpt) => + validatedGcsPathsToDockerImageCacheManifestFilesOpt shouldBe None + case Invalid(ex) => + fail(s"Error while parsing GCS paths to docker image cache manifest files from config: $ex") + } + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationSpec.scala new file mode 100644 index 00000000000..46b7c147b34 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationSpec.scala @@ -0,0 +1,124 @@ +package cromwell.backend.google.batch.models + +import com.typesafe.config.ConfigFactory +import common.assertion.CromwellTimeoutSpec +import cromwell.backend.BackendConfigurationDescriptor +import cromwell.backend.google.batch.models.GcpBatchTestConfig._ +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.core.path.DefaultPathBuilder 
+import org.scalatest.BeforeAndAfterAll +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +class GcpBatchConfigurationSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers with TableDrivenPropertyChecks with BeforeAndAfterAll { + + behavior of "GcpBatchConfigurationSpec" + + val mockFile = DefaultPathBuilder.createTempFile() + + override def afterAll(): Unit = { + mockFile.delete(swallowIOExceptions = true) + () + } + + val globalConfig = ConfigFactory.parseString( + s""" + |google { + | + | application-name = "cromwell" + | + | auths = [ + | { + | name = "application-default" + | scheme = "application_default" + | }, + | { + | name = "service-account" + | scheme = "service_account" + | service-account-id = "my-service-account" + | pem-file = "${mockFile.pathAsString}" + | } + | ] + |} + | + """.stripMargin) + + val backendConfig = ConfigFactory.parseString( + """ + | // Google project + | project = "my-cromwell-workflows" + | + | // Base bucket for workflow executions + | root = "gs://my-cromwell-workflows-bucket" + | + | // Polling for completion backs-off gradually for slower-running jobs. + | // This is the maximum polling interval (in seconds): + | maximum-polling-interval = 600 + | + | genomics { + | // A reference to an auth defined in the `google` stanza at the top. This auth is used to create + | // Pipelines and manipulate auth JSONs. + | auth = "application-default" + | location = "us-central1" + | } + | + | default-runtime-attributes { + | failOnStderr: false + | continueOnReturnCode: 0 + | cpu: 1 + | memory: "2 GB" + | bootDiskSizeGb: 10 + | disks: "local-disk 10 SSD" + | noAddress: false + | preemptible: 3 + | zones:["us-central1-a", "us-central1-b"] + | } + | + | dockerhub { + | account = "dockerAccount" + | token = "dockerToken" + | } + | + | filesystems { + | gcs { + | // A reference to a potentially different auth for manipulating files via engine functions. 
+ | auth = "application-default" + | } + | } + | + """.stripMargin) + + it should "fail to instantiate if any required configuration is missing" in { + + val configs = Table( + ("backendConfig", "globalConfig"), + (backendConfig, globalConfig.withoutPath("google")), + (backendConfig.withoutPath("project"), globalConfig), + (backendConfig.withoutPath("root"), globalConfig), + (backendConfig.withoutPath("genomics"), globalConfig), + (backendConfig.withoutPath("genomics.location"), globalConfig), + (backendConfig.withoutPath("filesystems"), globalConfig), + (backendConfig.withoutPath("filesystems.gcs"), globalConfig), + (backendConfig.withoutPath("filesystems.gcs.auth"), globalConfig) + ) + + forAll(configs) { (backend, global) => + an[Exception] shouldBe thrownBy { + val failingGoogleConf = GoogleConfiguration(global) + val failingAttributes = GcpBatchConfigurationAttributes(failingGoogleConf, backend, "papi") + new GcpBatchConfiguration(BackendConfigurationDescriptor(backend, global), failingGoogleConf, failingAttributes) + } + } + } + + it should "have correct root" in { + new GcpBatchConfiguration(BackendConfigurationDescriptor(backendConfig, globalConfig), googleConfiguration, batchAttributes).root shouldBe "gs://my-cromwell-workflows-bucket" + } + + //it should "have correct docker" in { + // val dockerConf = new GcpBatchConfiguration(BackendConfigurationDescriptor(backendConfig, globalConfig), googleConfiguration, batchAttributes).dockerCredentials + // dockerConf shouldBe defined + // dockerConf.get.token shouldBe "dockerToken" + //} +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchJobPathsSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchJobPathsSpec.scala new file mode 100644 index 00000000000..ed2baf4fd06 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchJobPathsSpec.scala @@ -0,0 +1,81 @@ +package cromwell.backend.google.batch.models + +import com.google.cloud.NoCredentials +import common.collections.EnhancedCollections._ +import cromwell.backend.BackendSpec +import cromwell.backend.google.batch.actors.GcpBatchInitializationActor +import cromwell.backend.google.batch.models.GcpBatchTestConfig.{gcpBatchConfiguration, pathBuilders} +import cromwell.backend.io.JobPathsSpecHelper._ +import cromwell.core.TestKitSuite +import cromwell.util.SampleWdl +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import spray.json.{JsObject, JsString} + +import scala.concurrent.ExecutionContext.Implicits.global + +class GcpBatchJobPathsSpec extends TestKitSuite with AnyFlatSpecLike with Matchers { + + import BackendSpec._ + + behavior of "GcpBatchCallPaths" + + it should "map the correct filenames" in { + val workflowDescriptor = buildWdlWorkflowDescriptor( + SampleWdl.HelloWorld.workflowSource(), + inputFileAsJson = Option(JsObject(SampleWdl.HelloWorld.rawInputs.safeMapValues(JsString.apply)).compactPrint) + ) + val jobDescriptorKey = firstJobDescriptorKey(workflowDescriptor) + + val workflowPaths = GcpBatchWorkflowPaths(workflowDescriptor, NoCredentials.getInstance(), NoCredentials.getInstance(), gcpBatchConfiguration, pathBuilders(), GcpBatchInitializationActor.defaultStandardStreamNameToFileNameMetadataMapper) + + val callPaths = GcpBatchJobPaths(workflowPaths, jobDescriptorKey) + + callPaths.returnCodeFilename should be("rc") + callPaths.stderr.getFileName.pathAsString should 
be("gs://my-cromwell-workflows-bucket/stderr") + callPaths.stdout.getFileName.pathAsString should be("gs://my-cromwell-workflows-bucket/stdout") + callPaths.batchLogFilename should be("hello.log") + } + + it should "map the correct paths" in { + val workflowDescriptor = buildWdlWorkflowDescriptor( + SampleWdl.HelloWorld.workflowSource(), + inputFileAsJson = Option(JsObject(SampleWdl.HelloWorld.rawInputs.safeMapValues(JsString.apply)).compactPrint) + ) + val jobDescriptorKey = firstJobDescriptorKey(workflowDescriptor) + + val workflowPaths = GcpBatchWorkflowPaths(workflowDescriptor, NoCredentials.getInstance(), NoCredentials.getInstance(), gcpBatchConfiguration, pathBuilders(), GcpBatchInitializationActor.defaultStandardStreamNameToFileNameMetadataMapper) + + val callPaths = GcpBatchJobPaths(workflowPaths, jobDescriptorKey) + + callPaths.returnCode.pathAsString should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/rc") + callPaths.stdout.pathAsString should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/stdout") + callPaths.stderr.pathAsString should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/stderr") + callPaths.batchLogPath.pathAsString should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/hello.log") + } + + it should "map the correct call context" in { + + val workflowDescriptor = buildWdlWorkflowDescriptor( + SampleWdl.HelloWorld.workflowSource(), + inputFileAsJson = Option(JsObject(SampleWdl.HelloWorld.rawInputs.safeMapValues(JsString.apply)).compactPrint) + ) + val jobDescriptorKey = firstJobDescriptorKey(workflowDescriptor) + + val workflowPaths = GcpBatchWorkflowPaths(workflowDescriptor, NoCredentials.getInstance(), NoCredentials.getInstance(), gcpBatchConfiguration, pathBuilders(), GcpBatchInitializationActor.defaultStandardStreamNameToFileNameMetadataMapper) + + val callPaths = GcpBatchJobPaths(workflowPaths, jobDescriptorKey) + + callPaths.callContext.root.pathAsString should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello") + callPaths.callContext.stdout should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/stdout") + callPaths.callContext.stderr should + be(s"gs://my-cromwell-workflows-bucket/wf_hello/${workflowDescriptor.id}/call-hello/stderr") + } + +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchLabelSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchLabelSpec.scala new file mode 100644 index 00000000000..8ae97f4d0ca --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchLabelSpec.scala @@ -0,0 +1,38 @@ +package cromwell.backend.google.batch.models + +import cats.data.Validated.{Invalid, Valid} +import common.assertion.CromwellTimeoutSpec +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class GcpBatchLabelSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers { + + behavior of "GoogleLabels" + + /** + * In the format 'to validate', 'expected result' + */ + val googleLabelConversions = List( + "11f2468c-39d6-4be3-85c8-32735c01e66b" -> "x--11f2468c-39d6-4be3-85c8-32735c01e66b", + "0-cromwell-root-workflow-id" -> "x--0-cromwell-root-workflow-id", + "cromwell-root-workflow-id-" -> "cromwell-root-workflow-id---x", + 
"0-cromwell-root-workflow-id-" -> "x--0-cromwell-root-workflow-id---x", + "Cromwell-root-workflow-id" -> "cromwell-root-workflow-id", + "cromwell_root_workflow_id" -> "cromwell-root-workflow-id", + "too-long-too-long-too-long-too-long-too-long-too-long-too-long-t" -> "too-long-too-long-too-long-too---g-too-long-too-long-too-long-t", + "0-too-long-and-invalid-too-long-and-invalid-too-long-and-invali+" -> "x--0-too-long-and-invalid-too----nvalid-too-long-and-invali---x" + ) + + googleLabelConversions foreach { case (label: String, conversion: String) => + it should s"not validate the bad label key '$label'" in { + GcpLabel.validateLabelRegex(label) match { + case Invalid(_) => // Good! + case Valid(_) => fail(s"Label validation succeeded but should have failed.") + } + } + + it should s"convert the bad label string '$label' into the safe label string '$conversion'" in { + GcpLabel.safeGoogleName(label) should be(conversion) + } + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala new file mode 100644 index 00000000000..569a2af58eb --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala @@ -0,0 +1,298 @@ +package cromwell.backend.google.batch.models + +import cats.data.NonEmptyList +import cromwell.backend.RuntimeAttributeDefinition +import cromwell.backend.google.batch.models.GcpBatchTestConfig._ +import cromwell.backend.validation.ContinueOnReturnCodeSet +//import cromwell.backend.google.batch.io.{DiskType, GcpBatchAttachedDisk} +import cromwell.backend.google.batch.io.{DiskType, GcpBatchWorkingDisk} +import cromwell.core.WorkflowOptions +import eu.timepit.refined.refineMV +import org.scalatest.TestSuite +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpecLike +import org.slf4j.helpers.NOPLogger +import spray.json._ +import wdl4s.parser.MemoryUnit +import wom.format.MemorySize +import wom.types._ +import wom.values._ + +import scala.util.{Failure, Success, Try} + +final class GcpBatchRuntimeAttributesSpec + extends AnyWordSpecLike + with Matchers + with GcpBatchRuntimeAttributesSpecsMixin { + + "GcpBatchRuntimeAttributes" should { + + "throw an exception when there are no runtime attributes defined." 
in { + val runtimeAttributes = Map.empty[String, WomValue] + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Can't find an attribute value for key docker") + } + + "use hardcoded defaults if not declared in task, workflow options, or config (except for docker)" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest")) + val expectedRuntimeAttributes = expectedDefaults + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes, batchConfiguration = noDefaultsBatchConfiguration) + } + + "validate a valid Docker entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest")) + val expectedRuntimeAttributes = expectedDefaults + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid Docker entry" in { + pending + val runtimeAttributes = Map("docker" -> WomInteger(1)) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting docker runtime attribute to be String") + } + + "validate a valid failOnStderr entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "failOnStderr" -> WomBoolean(true)) + val expectedRuntimeAttributes = expectedDefaults.copy(failOnStderr = true) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid failOnStderr entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "failOnStderr" -> WomString("yes")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting failOnStderr runtime attribute to be a Boolean or a String with values of 'true' or 'false'") + } + + "fail to validate an invalid continueOnReturnCode entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "continueOnReturnCode" -> WomString("value")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting continueOnReturnCode runtime attribute to be either a Boolean, a String 'true' or 'false', or an Array[Int]") + } + + "validate a valid cpu entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "cpu" -> WomInteger(2)) + val expectedRuntimeAttributes = expectedDefaults.copy(cpu = refineMV(2)) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "validate a valid cpu string entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "cpu" -> WomString("2")) + val expectedRuntimeAttributes = expectedDefaults.copy(cpu = refineMV(2)) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid cpu entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "cpu" -> WomString("value")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting cpu runtime attribute to be an Integer") + } + + "validate a valid zones entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "zones" -> WomString("us-central-z")) + val expectedRuntimeAttributes = expectedDefaults.copy(zones = Vector("us-central-z")) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid zones entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "zones" -> WomInteger(1)) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting zones runtime 
attribute to be either a whitespace separated String or an Array[String]") + } + + "validate a valid array zones entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "zones" -> WomArray(WomArrayType(WomStringType), List(WomString("us-central1-y"), WomString("us-central1-z")))) + val expectedRuntimeAttributes = expectedDefaults.copy(zones = Vector("us-central1-y", "us-central1-z")) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid array zones entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "zones" -> WomArray(WomArrayType(WomIntegerType), List(WomInteger(1), WomInteger(2)))) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting zones runtime attribute to be either a whitespace separated String or an Array[String]") + } + + "validate a valid preemptible entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "preemptible" -> WomInteger(3)) + val expectedRuntimeAttributes = expectedDefaults.copy(preemptible = 3) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid preemptible entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "preemptible" -> WomString("value")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, + "Expecting preemptible runtime attribute to be an Integer") + } + + "validate a valid bootDiskSizeGb entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "bootDiskSizeGb" -> WomInteger(4)) + val expectedRuntimeAttributes = expectedDefaults.copy(bootDiskSize = 4) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid bootDiskSizeGb entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "bootDiskSizeGb" -> WomString("4GB")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting bootDiskSizeGb runtime attribute to be an Integer") + } + + // "validate a valid disks entry" in { + // val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "disks" -> WomString("local-disk 20 SSD")) + // val expectedRuntimeAttributes = expectedDefaults.copy(disks = Seq(GcpBatchAttachedDisk.parse("local-disk 20 SSD").get)) + // assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + // } + + //"fail to validate an invalid disks entry" in { + // val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "disks" -> WomInteger(10)) + // assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting disks runtime attribute to be a comma separated String or Array[String]") + //} + + //"fail to validate a valid disks array entry" in { + // val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "disks" -> WomArray(WomArrayType(WomStringType), List(WomString("blah"), WomString("blah blah")))) + // assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Disk strings should be of the format 'local-disk SIZE TYPE' or '/mount/point SIZE TYPE'") + //} + + "validate a valid memory entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "memory" -> WomString("1 GB")) + val expectedRuntimeAttributes = expectedDefaults.copy(memory = MemorySize.parse("1 GB").get) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, 
expectedRuntimeAttributes) + } + + "fail to validate an invalid memory entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "memory" -> WomString("blah")) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, "Expecting memory runtime attribute to be an Integer or String with format '8 GB'") + } + + "validate a valid noAddress entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "noAddress" -> WomBoolean(true)) + val expectedRuntimeAttributes = expectedDefaults.copy(noAddress = true) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes) + } + + "fail to validate an invalid noAddress entry" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "noAddress" -> WomInteger(1)) + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, + "Expecting noAddress runtime attribute to be a Boolean") + } + + "override config default attributes with default attributes declared in workflow options" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest")) + + val workflowOptionsJson = + """{ + | "default_runtime_attributes": { "cpu": 2 } + |} + """.stripMargin.parseJson.asInstanceOf[JsObject] + + val workflowOptions = WorkflowOptions.fromJsonObject(workflowOptionsJson).get + val expectedRuntimeAttributes = expectedDefaults.copy(cpu = refineMV(2)) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes, workflowOptions) + } + + "override config default runtime attributes with task runtime attributes" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "cpu" -> WomInteger(4)) + + val workflowOptionsJson = + """{ + | "default_runtime_attributes": { "cpu": 2 } + |} + """.stripMargin.parseJson.asInstanceOf[JsObject] + + val workflowOptions = WorkflowOptions.fromJsonObject(workflowOptionsJson).get + val expectedRuntimeAttributes = expectedDefaults.copy(cpu = refineMV(4)) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes, workflowOptions) + } + + "override invalid config default attributes with task runtime attributes" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest"), "cpu" -> WomInteger(4)) + + val workflowOptionsJson = + """{ + | "default_runtime_attributes": { "cpu": 2.2 } + |} + """.stripMargin.parseJson.asInstanceOf[JsObject] + + val workflowOptions = WorkflowOptions.fromJsonObject(workflowOptionsJson).get + val expectedRuntimeAttributes = expectedDefaults.copy(cpu = refineMV(4)) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes, workflowOptions) + } + + "parse cpuPlatform correctly" in { + val runtimeAttributes = Map("docker" -> WomString("ubuntu:latest")) + val workflowOptionsJson = + """{ + | "default_runtime_attributes": { "cpuPlatform": "the platform" } + |} + """.stripMargin.parseJson.asInstanceOf[JsObject] + val workflowOptions = WorkflowOptions.fromJsonObject(workflowOptionsJson).get + val expectedRuntimeAttributes = expectedDefaults.copy(cpuPlatform = Option("the platform")) + assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes, expectedRuntimeAttributes, workflowOptions) + } + } +} + +trait GcpBatchRuntimeAttributesSpecsMixin { + this: TestSuite => + + def workflowOptionsWithDefaultRA(defaults: Map[String, JsValue]): WorkflowOptions = { + WorkflowOptions(JsObject(Map( + "default_runtime_attributes" -> JsObject(defaults) + ))) + } + + val expectedDefaults = new 
GcpBatchRuntimeAttributes( + cpu = refineMV(1), + cpuPlatform = None, + gpuResource = None, + zones = Vector("us-central1-b", "us-central1-a"), + preemptible = 0, + bootDiskSize = 10, + memory = MemorySize(2, MemoryUnit.GB), + disks = Vector(GcpBatchWorkingDisk(DiskType.SSD, 10)), + dockerImage = "ubuntu:latest", + failOnStderr = false, + continueOnReturnCode = ContinueOnReturnCodeSet(Set(0)), + noAddress = false, + useDockerImageCache = None, + checkpointFilename = None + ) + + def assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes: Map[String, WomValue], + expectedRuntimeAttributes: GcpBatchRuntimeAttributes, + workflowOptions: WorkflowOptions = emptyWorkflowOptions, + defaultZones: NonEmptyList[String] = defaultZones, + batchConfiguration: GcpBatchConfiguration = gcpBatchConfiguration): Unit = { + try { + val actualRuntimeAttributes = toBatchRuntimeAttributes(runtimeAttributes, workflowOptions, batchConfiguration) + assert(actualRuntimeAttributes == expectedRuntimeAttributes) + } catch { + case ex: RuntimeException => fail(s"Exception was not expected but received: ${ex.getMessage}") + } + () + } + + def assertBatchRuntimeAttributesFailedCreation(runtimeAttributes: Map[String, WomValue], + exMsgs: List[String], + workflowOptions: WorkflowOptions): Unit = { + Try(toBatchRuntimeAttributes(runtimeAttributes, workflowOptions, gcpBatchConfiguration)) match { + case Success(oops) => + fail(s"Expected error containing strings: ${exMsgs.map(s => s"'$s'").mkString(", ")} but instead got Success($oops)") + case Failure(ex) => exMsgs foreach { exMsg => assert(ex.getMessage.contains(exMsg)) } + } + () + } + + def assertBatchRuntimeAttributesFailedCreation(runtimeAttributes: Map[String, WomValue], + exMsg: String, + workflowOptions: WorkflowOptions = emptyWorkflowOptions): Unit = { + assertBatchRuntimeAttributesFailedCreation(runtimeAttributes, List(exMsg), workflowOptions) + } + + def toBatchRuntimeAttributes(runtimeAttributes: Map[String, WomValue], + workflowOptions: WorkflowOptions, + batchConfiguration: GcpBatchConfiguration): GcpBatchRuntimeAttributes = { + val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(batchConfiguration) + val defaultedAttributes = RuntimeAttributeDefinition.addDefaultsToAttributes( + staticRuntimeAttributeDefinitions, workflowOptions)(runtimeAttributes) + val validatedRuntimeAttributes = runtimeAttributesBuilder.build(defaultedAttributes, NOPLogger.NOP_LOGGER) + GcpBatchRuntimeAttributes(validatedRuntimeAttributes, batchConfiguration.runtimeConfig) + } + + val emptyWorkflowOptions: WorkflowOptions = WorkflowOptions.fromMap(Map.empty).get + val defaultZones: NonEmptyList[String] = NonEmptyList.of("us-central1-b", "us-central1-a") + val noDefaultsBatchConfiguration = new GcpBatchConfiguration(GcpBatchTestConfig.NoDefaultsConfigurationDescriptor, googleConfiguration, batchAttributes) + val staticRuntimeAttributeDefinitions: Set[RuntimeAttributeDefinition] = + GcpBatchRuntimeAttributes.runtimeAttributesBuilder(GcpBatchTestConfig.gcpBatchConfiguration).definitions.toSet +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchTestConfig.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchTestConfig.scala new file mode 100644 index 00000000000..fc8b1333170 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchTestConfig.scala @@ -0,0 +1,114 @@ +package 
cromwell.backend.google.batch.models + +import akka.actor.ActorSystem +import com.typesafe.config.{Config, ConfigFactory} +import cromwell.backend.BackendConfigurationDescriptor +import cromwell.cloudsupport.gcp.GoogleConfiguration +import cromwell.core.WorkflowOptions +import cromwell.core.filesystem.CromwellFileSystems +import cromwell.core.path.PathBuilder + +import scala.concurrent.Await +import scala.concurrent.duration._ + +object GcpBatchTestConfig { + + private val BatchBackendConfigString = + """ + |project = "my-cromwell-workflows" + |root = "gs://my-cromwell-workflows-bucket" + | + |genomics { + | auth = "application-default" + | location = "us-central1" + |} + | + |filesystems.gcs.auth = "application-default" + | + |request-workers = 1 + | + |default-runtime-attributes { + | cpu: 1 + | failOnStderr: false + | continueOnReturnCode: 0 + | docker: "ubuntu:latest" + | memory: "2048 MB" + | bootDiskSizeGb: 10 + | disks: "local-disk 10 SSD" + | noAddress: false + | preemptible: 0 + | zones:["us-central1-b", "us-central1-a"] + |} + | + |""".stripMargin + + private val NoDefaultsConfigString = + """ + |project = "my-cromwell-workflows" + |root = "gs://my-cromwell-workflows-bucket" + | + |genomics { + | auth = "application-default" + |} + | + |filesystems { + | gcs { + | auth = "application-default" + | } + |} + |""".stripMargin + + private val BatchGlobalConfigString = + s""" + |google { + | application-name = "cromwell" + | auths = [ + | { + | name = mock + | scheme = mock + | } + | { + | # legacy `application-default` auth that actually just mocks + | name = "application-default" + | scheme = "mock" + | } + | ] + |} + | + |filesystems { + | gcs { + | class = "cromwell.filesystems.gcs.GcsPathBuilderFactory" + | } + |} + | + |backend { + | default = "batch" + | providers { + | batch { + | actor-factory = "cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory" + | config { + | $BatchBackendConfigString + | } + | } + | } + |} + | + |""".stripMargin + + val BatchBackendConfig: Config = ConfigFactory.parseString(BatchBackendConfigString) + val BatchGlobalConfig: Config = ConfigFactory.parseString(BatchGlobalConfigString) + val BatchBackendNoDefaultConfig: Config = ConfigFactory.parseString(NoDefaultsConfigString) + val BatchBackendConfigurationDescriptor: BackendConfigurationDescriptor = { + new BackendConfigurationDescriptor(BatchBackendConfig, BatchGlobalConfig) { + override private[backend] lazy val cromwellFileSystems = new CromwellFileSystems(BatchGlobalConfig) + } + } + val NoDefaultsConfigurationDescriptor: BackendConfigurationDescriptor = + BackendConfigurationDescriptor(BatchBackendNoDefaultConfig, BatchGlobalConfig) + def pathBuilders()(implicit as: ActorSystem): List[PathBuilder] = + Await.result(BatchBackendConfigurationDescriptor.pathBuilders(WorkflowOptions.empty), 5.seconds) + val googleConfiguration: GoogleConfiguration = GoogleConfiguration(BatchGlobalConfig) + val batchAttributes: GcpBatchConfigurationAttributes = + GcpBatchConfigurationAttributes(googleConfiguration, BatchBackendConfig, "batch") + val gcpBatchConfiguration = new GcpBatchConfiguration(BatchBackendConfigurationDescriptor, googleConfiguration, batchAttributes) +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchVpcAndSubnetworkProjectLabelValuesSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchVpcAndSubnetworkProjectLabelValuesSpec.scala new file mode
100644 index 00000000000..e24b3fc809e --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchVpcAndSubnetworkProjectLabelValuesSpec.scala @@ -0,0 +1,42 @@ +package cromwell.backend.google.batch.models + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +class GcpBatchVpcAndSubnetworkProjectLabelValuesSpec extends AnyFlatSpec with Matchers with TableDrivenPropertyChecks { + + behavior of "VpcAndSubnetworkProjectLabelValues" + + private val myProjectId = "my-project" + + private val labelsTests = Table( + ("description", "network", "subnetOption", "networkName", "subnetNameOption"), + ("a network with a slash", "slash/net", None, "slash/net", None), + ("a network without a slash", "net", None, "projects/my-project/global/networks/net/", None), + ("a subnet with a slash", "slashed/net", Option("slashed/sub"), "slashed/net", Option("slashed/sub")), + ("a subnet without a slash", "slashed/net", Option("sub"), "slashed/net", Option("sub")), + ( + "a network with a project token", + s"slashed/$${projectId}/net", + None, + "slashed/my-project/net", + None, + ), + ( + "a subnet with a project token", + "slashed/net", + Option(s"slashed/$${projectId}/sub"), + "slashed/net", + Option("slashed/my-project/sub"), + ), + ) + + forAll(labelsTests) { (description, network, subnetOption, networkName, subnetNameOption) => + it should description in { + val labels = VpcAndSubnetworkProjectLabelValues(network, subnetOption) + labels.networkName(myProjectId) should be(networkName) + labels.subnetNameOption(myProjectId) should be(subnetNameOption) + } + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/LocalizationSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/LocalizationSpec.scala new file mode 100644 index 00000000000..12be65744e0 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/LocalizationSpec.scala @@ -0,0 +1,67 @@ +package cromwell.backend.google.batch.runnable + +import common.assertion.CromwellTimeoutSpec +import cromwell.backend.google.batch.models.GcpBatchJobPaths +import cromwell.core.path.DefaultPathBuilder +import org.scalatest.OptionValues._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import scala.jdk.CollectionConverters._ + +class LocalizationSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers { + + behavior of "Localization" + + it should "create the right runnable to localize DRS files using a manifest" in { + val manifestPathString = s"path/to/${GcpBatchJobPaths.DrsLocalizationManifestName}" + val manifestPath = DefaultPathBuilder.get(manifestPathString) + val tagKey = "tag" + val tagLabel = "myLabel" + + val container = Localization.drsRunnable(manifestPath, Map(tagKey -> tagLabel), None).getContainer + val fields = container.getAllFields.asScala + fields.keySet.map(_.getName) should contain theSameElementsAs Set("commands", "image_uri") + // Set("commands", "environment", "imageUri", "labels", "mounts") + + fields.find(_._1.getName == "commands").value.asInstanceOf[(_, java.util.List[_])]._2 should contain theSameElementsAs List( + "-m", manifestPathString + ) + + // runnable.get("mounts") should be(a[java.util.List[_]]) + // runnable.get("mounts").asInstanceOf[java.util.List[_]] should be (empty) + // + 
fields.find(_._1.getName == "image_uri").value.asInstanceOf[(_, String)]._2 should be("somerepo/drs-downloader:tagged") + // + // val actionLabels = runnable.get("labels").asInstanceOf[java.util.Map[_, _]] + // actionLabels.keySet.asScala should contain theSameElementsAs List("tag") + // actionLabels.get(tagKey) should be(tagLabel) + } + + it should "create the right runnable to localize DRS files using a manifest with requester pays" in { + + val manifestPathString = s"path/to/${GcpBatchJobPaths.DrsLocalizationManifestName}" + val manifestPath = DefaultPathBuilder.get(manifestPathString) + val tagKey = "tag" + val tagLabel = "myLabel" + val requesterPaysProjectId = "123" + + val container = Localization.drsRunnable(manifestPath, Map(tagKey -> tagLabel), Option(requesterPaysProjectId)).getContainer + val fields = container.getAllFields.asScala + fields.keySet.map(_.getName) should contain theSameElementsAs Set("commands", "image_uri") + // Set("commands", "environment", "imageUri", "labels", "mounts") + + fields.find(_._1.getName == "commands").value.asInstanceOf[(_, java.util.List[_])]._2 should contain theSameElementsAs List( + "-m", manifestPathString, "-r", requesterPaysProjectId + ) + + // runnable.get("mounts") should be(a[java.util.List[_]]) + // runnable.get("mounts").asInstanceOf[java.util.List[_]] should be (empty) + + fields.find(_._1.getName == "image_uri").value.asInstanceOf[(_, String)]._2 should be("somerepo/drs-downloader:tagged") + + // val actionLabels = runnable.get("labels").asInstanceOf[java.util.Map[_, _]] + // actionLabels.keySet.asScala should contain theSameElementsAs List("tag") + // actionLabels.get(tagKey) should be(tagLabel) + } +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableBuilderSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableBuilderSpec.scala new file mode 100644 index 00000000000..d0e4eac0b87 --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableBuilderSpec.scala @@ -0,0 +1,94 @@ +package cromwell.backend.google.batch.runnable + +import com.google.cloud.batch.v1.{Runnable, Volume} +import common.assertion.CromwellTimeoutSpec +import cromwell.backend.google.batch.runnable.RunnableBuilder.EnhancedRunnableBuilder +import cromwell.backend.google.batch.runnable.RunnableLabels._ +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks + +import scala.jdk.CollectionConverters._ + +class RunnableBuilderSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers with TableDrivenPropertyChecks { + + behavior of "RunnableBuilder" + + private val dockerRunRunnables = Table( + ("description", "runnable", "command"), + ("a cloud sdk runnable", RunnableBuilder.cloudSdkRunnable, s"docker run ${RunnableUtils.CloudSdkImage}"), + ("a cloud sdk runnable with args", + RunnableBuilder.cloudSdkRunnable.withCommand("bash", "-c", "echo hello"), + s"docker run ${RunnableUtils.CloudSdkImage} bash -c echo\\ hello" + ), + ("a cloud sdk runnable with quotes in the args", + RunnableBuilder.cloudSdkRunnable.withCommand("bash", "-c", "echo hello m'lord"), + s"docker run ${RunnableUtils.CloudSdkImage} bash -c echo\\ hello\\ m\\'lord" + ), + ("a cloud sdk runnable with a newline in the args", + RunnableBuilder.cloudSdkRunnable.withCommand("bash", "-c", "echo hello\\\nworld"), + s"docker run ${RunnableUtils.CloudSdkImage} bash 
-c echo\\ hello\\\\world" + ), + ("a runnable with multiple args", + Runnable.newBuilder() + .setContainer(Runnable.Container.newBuilder.setImageUri("ubuntu")) + .withEntrypointCommand("") + .withCommand("bash", "-c", "echo hello") + .withAlwaysRun(true) + .withVolumes(List( + Volume.newBuilder().setDeviceName("read-only-disk").setMountPath("/mnt/read/only/container").addMountOptions("ro"), + Volume.newBuilder().setDeviceName("read-write-disk").setMountPath("/mnt/read/write/container").addMountOptions("rw"), + ).map(_.build())), + "docker run" + + " -v /mnt/read/only/container:/mnt/read/only/container -v /mnt/read/write/container:/mnt/read/write/container" + + " ubuntu bash -c echo\\ hello" + ), + ) + + forAll(dockerRunRunnables) { (description, runnable, command) => + it should s"convert $description" in { + RunnableBuilder.toDockerRun(runnable) should be(command) + } + } + + private val memoryRetryExpectedEntrypoint = "/bin/sh" + + def memoryRetryExpectedCommand(lookupString: String): List[String] = { + List( + "-c", + s"grep -E -q '$lookupString' /cromwell_root/stderr ; echo $$? > /cromwell_root/memory_retry_rc" + ) + } + + val volumes = List( + Volume.newBuilder().setDeviceName("read-only-disk").setMountPath("/mnt/read/only/container")/*.addMountOptions("ro")*/ + ).map(_.build()) + + private val memoryRetryRunnableExpectedLabels = Map(Key.Tag -> Value.RetryWithMoreMemory).asJava + + it should "return cloud sdk runnable for one key in retry-with-double-memory" in { + val lookupKeyList = List("OutOfMemory") + val expectedCommand = memoryRetryExpectedCommand(lookupKeyList.mkString("|")) + + val runnable = RunnableBuilder.checkForMemoryRetryRunnable(lookupKeyList, volumes) + + runnable.getContainer.getEntrypoint shouldBe memoryRetryExpectedEntrypoint + runnable.getContainer.getCommandsList.asScala shouldBe expectedCommand + runnable.getAlwaysRun shouldBe true + runnable.getLabelsMap shouldBe memoryRetryRunnableExpectedLabels + runnable.getContainer.getVolumesList.asScala.toList shouldBe volumes.map(v => s"${v.getMountPath}:${v.getMountPath}:") + } + + it should "return cloud sdk runnable for multiple keys in retry-with-double-memory" in { + val lookupKeyList = List("OutOfMemory", "Killed", "Exit123") + val expectedCommand = memoryRetryExpectedCommand(lookupKeyList.mkString("|")) + + val runnable = RunnableBuilder.checkForMemoryRetryRunnable(lookupKeyList, volumes) + + runnable.getContainer.getEntrypoint shouldBe memoryRetryExpectedEntrypoint + runnable.getContainer.getCommandsList.asScala shouldBe expectedCommand + runnable.getAlwaysRun shouldBe true + runnable.getLabelsMap shouldBe memoryRetryRunnableExpectedLabels + runnable.getContainer.getVolumesList.asScala.toList shouldBe volumes.map(v => s"${v.getMountPath}:${v.getMountPath}:") + } +} \ No newline at end of file diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableCommandSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableCommandSpec.scala new file mode 100644 index 00000000000..6e6c9f2d0af --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/runnable/RunnableCommandSpec.scala @@ -0,0 +1,67 @@ +package cromwell.backend.google.batch.runnable + + +import java.nio.file.Path +import common.assertion.CromwellTimeoutSpec +import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration +import cromwell.backend.google.batch.runnable.RunnableCommands._
+import cromwell.filesystems.gcs.GcsPath +import eu.timepit.refined.refineMV +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import common.mock.MockSugar + +class RunnableCommandsSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers with MockSugar { + behavior of "RunnableCommands" + + it should "inject project flag when request fails because of requester pays" in { + val path = GcsPath( + mock[Path], + mock[com.google.api.services.storage.Storage], + mock[com.google.cloud.storage.Storage], + "my-project", + ) + val recovered = recoverRequesterPaysError(path) { flag => + s"flag is $flag" + } + + recovered shouldBe """flag is > gsutil_output.txt 2>&1 + |# Record the exit code of the gsutil command without project flag + |RC_GSUTIL=$? + |if [ "$RC_GSUTIL" != "0" ]; then + | printf '%s %s\n' "$(date -u '+%Y/%m/%d %H:%M:%S')" flag\ is\ \ failed + | # Print the reason of the failure + | cat gsutil_output.txt + | + | # Check if it matches the BucketIsRequesterPaysErrorMessage + | if grep -q "requester pays bucket but no user project" gsutil_output.txt; then + | printf '%s %s\n' "$(date -u '+%Y/%m/%d %H:%M:%S')" Retrying\ with\ user\ project + | flag is -u my-project + | else + | exit "$RC_GSUTIL" + | fi + |else + | exit 0 + |fi""".stripMargin + } + + it should "use GcsTransferConfiguration to set the number of localization retries" in { + implicit val gcsTransferConfiguration: GcsTransferConfiguration = GcsTransferConfiguration( + transferAttempts = refineMV(31380), parallelCompositeUploadThreshold = "0") + retry("I'm very flaky") shouldBe """for i in $(seq 31380); do + | ( + | I'm very flaky + | ) + | RC=$? + | if [ "$RC" = "0" ]; then + | break + | fi + | if [ $i -lt 31380 ]; then + | printf '%s %s\n' "$(date -u '+%Y/%m/%d %H:%M:%S')" Waiting\ 5\ seconds\ and\ retrying + | sleep 5 + | fi + |done + |exit "$RC"""".stripMargin + } +} + diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala new file mode 100644 index 00000000000..968ab545cbf --- /dev/null +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala @@ -0,0 +1,90 @@ +package cromwell.backend.google.batch.util + +import common.assertion.CromwellTimeoutSpec +import cromwell.backend.google.batch.models.GcpBatchRuntimeAttributes +import eu.timepit.refined.numeric.Positive +import eu.timepit.refined.refineMV +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks._ +import org.scalatest.prop.Tables.Table +import org.slf4j.helpers.NOPLogger +import wdl4s.parser.MemoryUnit +import wom.format.MemorySize + +class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers { + behavior of "MachineConstraints" + + private val n2Option = Option(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) + + private val n2dOption = Option(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) + + it should "generate valid machine types" in { + val validTypes = Table( + ("memory", "cpu", "cpuPlatformOption", "googleLegacyMachineSelection", "machineTypeString"), + // Already ok tuple + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1024"), + // CPU must be even (except if it's 1) + (MemorySize(4,
MemoryUnit.GB), refineMV[Positive](3), None, false, "custom-4-4096"), + // Memory must be a multiple of 256 + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, false, "custom-1-1024"), + // Memory / cpu ratio must be > 0.9GB, increase memory + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, false, "custom-4-3840"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, false, "custom-16-14848"), + // Memory / cpu ratio must be < 6.5GB, increase CPU + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, false, "custom-4-14080"), + // Memory should be an int + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1536"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1024"), + // Increase to a cpu selection not valid for n2 below + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, false, "custom-34-31488"), + + // Same tests as above but with legacy machine type selection (cpu and memory as specified. No 'custom machine + // requirement' adjustments are expected this time, except float->int) + + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1024"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, true, "predefined-3-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, true, "predefined-1-1024"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, true, "predefined-4-1024"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, true, "predefined-16-14336"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, true, "predefined-1-13977"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1520"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1024"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, true, "predefined-33-2048"), + + // Same tests but with cascade lake (n2) + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2Option, false, "n2-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2Option, false, "n2-custom-4-4096"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2Option, false, "n2-custom-16-16384"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2Option, false, "n2-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2Option, false, "n2-custom-36-36864"), + + // Same tests but with AMD Rome (n2d) #cpu > 16 are in increments of 16 + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2dOption, false, "n2d-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), + (MemorySize(1 , MemoryUnit.GB), refineMV[Positive](4), n2dOption, false, "n2d-custom-4-2048"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2dOption, false, "n2d-custom-16-14336"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-14080"), + 
(MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1536"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2dOption, false, "n2d-custom-48-24576"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](81), n2dOption, false, "n2d-custom-96-49152"), + (MemorySize(256, MemoryUnit.GB), refineMV[Positive](128), n2dOption, false, "n2d-custom-96-262144") + ) + + forAll(validTypes) { (memory, cpu, cpuPlatformOption, googleLegacyMachineSelection, expected) => + GcpBatchMachineConstraints.machineType( + memory = memory, + cpu = cpu, + cpuPlatformOption = cpuPlatformOption, + googleLegacyMachineSelection = googleLegacyMachineSelection, + jobLogger = NOPLogger.NOP_LOGGER, + ) shouldBe expected + } + } +}
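Note on the machine-type expectations above: the table's comments describe the custom machine-type rules being exercised (CPU count must be 1 or an even number, memory a whole multiple of 256 MB, and memory per vCPU kept roughly between 0.9 GB and 6.5 GB for n1 custom types). The standalone sketch below is not the GcpBatchMachineConstraints implementation; it is an illustration assuming only those three rules (the object name and the two bound constants are invented for the example), and it reproduces the n1 "custom-<cpu>-<mb>" rows of the table.

// Illustrative sketch only -- assumes the three n1 custom-machine rules noted in the test table;
// it is not the GcpBatchMachineConstraints production code.
object CustomMachineTypeSketch {
  private val MinMemoryPerCpuMb = 0.9 * 1024 // assumed lower bound on memory per vCPU
  private val MaxMemoryPerCpuMb = 6.5 * 1024 // assumed upper bound on memory per vCPU

  private def roundUpTo256Mb(mb: Double): Int = (math.ceil(mb / 256.0) * 256).toInt

  def machineType(requestedMemoryMb: Double, requestedCpu: Int): String = {
    // CPU count must be 1 or an even number
    var cpu = if (requestedCpu > 1 && requestedCpu % 2 != 0) requestedCpu + 1 else requestedCpu
    // Memory must be a whole multiple of 256 MB
    var memoryMb = roundUpTo256Mb(requestedMemoryMb)
    // Too little memory per vCPU: raise the memory
    if (memoryMb < cpu * MinMemoryPerCpuMb) memoryMb = roundUpTo256Mb(cpu * MinMemoryPerCpuMb)
    // Too much memory per vCPU: raise the vCPU count, keeping it even
    if (memoryMb > cpu * MaxMemoryPerCpuMb) {
      cpu = math.ceil(memoryMb / MaxMemoryPerCpuMb).toInt
      if (cpu > 1 && cpu % 2 != 0) cpu += 1
    }
    s"custom-$cpu-$memoryMb"
  }
}

// For example, machineType(1024, 4) gives "custom-4-3840" (memory raised to meet the per-vCPU floor)
// and machineType(13977.6, 1) gives "custom-4-14080" (CPU raised to meet the per-vCPU ceiling),
// matching the corresponding rows of the table above.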