From b14ae65380269154d7647c2f79c59ad29443443e Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Mon, 17 Jun 2024 14:40:31 -0700 Subject: [PATCH] runner.aws_batch: Download .snakemake/metadata/ too Snakemake stores state information per input/output here and uses it to determine whether it needs to re-run rules. It seems akin to the file mtimes, which we already take care to preserve on upload/download. Additionally, the metadata recorded is used in Snakemake's report generation and is generally useful for looking at workflow statistics. Continue to not download all of .snakemake/ en masse because it can potentially contain files that interfere with local usage and/or are large and unnecessary. Resolves: Related-to: --- CHANGES.md | 15 +++++++++++++++ nextstrain/cli/runner/aws_batch/s3.py | 6 +++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 513c159d..1ed8d175 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,21 @@ development source code and as such may not be routinely kept up to date. # __NEXT__ +## Improvements + +* Snakemake's per-input/output file metadata (stored in `.snakemake/metadata/`) + is now downloaded from AWS Batch builds by default. Like file modification + times (mtimes), which are already preserved from the remote build, this + additional metadata is used by Snakemake to track when inputs have changed + and when it should regenerate outputs. The metadata is also used in + [Snakemake report generation](https://snakemake.readthedocs.io/en/v8.14.0/snakefiles/reporting.html#rendering-reports) + and can be useful for gathering ad-hoc workflow statistics. + + The runtime image used must be at least `nextstrain/base:build-20240617T235011Z` + for these Snakemake metadata files to be available for download from the AWS + Batch job. 
+ ([#374](https://github.com/nextstrain/cli/pull/374)) + # 8.4.0 (29 May 2024) diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py index 02343e1f..7d07fb9f 100644 --- a/nextstrain/cli/runner/aws_batch/s3.py +++ b/nextstrain/cli/runner/aws_batch/s3.py @@ -119,8 +119,12 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str ]) included = path_matcher([ - # But we do want the Snakemake logs to come over. + # But we do want the Snakemake logs to come over… ".snakemake/log/", + + # …and the input/output metadata Snakemake tracks (akin to mtimes, + # which we also preserve). + ".snakemake/metadata/", ]) if patterns: