diff --git a/nextstrain/cli/command/build.py b/nextstrain/cli/command/build.py
index 1c489304..32f56f9a 100644
--- a/nextstrain/cli/command/build.py
+++ b/nextstrain/cli/command/build.py
@@ -117,6 +117,30 @@ def register_parser(subparser):
         dest   = "download",
         action = "store_false")
 
+    # XXX FIXME: --no-upload: discuss corresponding download behaviour; use
+    # of --download '!...' (or making --no-download accept patterns too?).
+    parser.add_argument(
+        "--no-upload",
+        metavar = "<pattern>",
+        help    = dedent(f"""\
+            Exclude files matching ``<pattern>`` from being uploaded as part of
+            the remote build.  Shell-style advanced globbing is supported, but
+            be sure to escape wildcards or quote the whole pattern so your
+            shell doesn't expand them.  May be passed more than once.
+            Currently only supported when also using :option:`--aws-batch`.
+            Default is to upload the entire pathogen build directory (except
+            for some ancillary files which are always excluded).
+
+            Besides basic glob features like single-part wildcards (``*``),
+            character classes (``[…]``), and brace expansion (``{{…, …}}``),
+            several advanced globbing features are also supported: multi-part
+            wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.),
+            and negation (``!…``).
+
+            {SKIP_AUTO_DEFAULT_IN_HELP}
+            """),
+        action  = "append")
+
     # A --logs option doesn't make much sense right now for most of our
     # runtimes, but I can see how it might in the future.  So we're ready if
     # that future comes to pass, set up --no-logs as if there's a --logs option
diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py
index 78ad13a4..78987b1e 100644
--- a/nextstrain/cli/runner/aws_batch/__init__.py
+++ b/nextstrain/cli/runner/aws_batch/__init__.py
@@ -213,7 +213,7 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None
         print_stage("Uploading %s to S3" % local_workdir)
 
         bucket = s3.bucket(opts.s3_bucket)
-        remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id)
+        remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.no_upload)
 
         print("uploaded:", s3.object_url(remote_workdir))
 
diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py
index 6aa7331a..487acd6f 100644
--- a/nextstrain/cli/runner/aws_batch/s3.py
+++ b/nextstrain/cli/runner/aws_batch/s3.py
@@ -38,17 +38,20 @@ def object_from_url(s3url: str) -> S3Object:
     return bucket(url.netloc).Object(key)
 
 
-def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object:
+def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object:
     """
     Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for
     the given *run_id*.
 
+    An optional list of *patterns* (shell-style advanced globs) can be passed
+    to selectively exclude part of the local *workdir* from being uploaded.
+
     Returns the S3.Object instance of the uploaded archive.
     """
 
     remote_workdir = bucket.Object(run_id + ".zip")
 
-    excluded = path_matcher([
+    always_excluded = path_matcher([
         # Jobs don't use .git, so save the bandwidth/space/time.  It may also
         # contain information in history that shouldn't be uploaded.
         ".git/",
@@ -65,6 +68,13 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object:
         "__pycache__/",
     ])
 
+    if patterns:
+        deselected = glob_matcher(patterns)
+    else:
+        deselected = lambda path: False
+
+    excluded = lambda path: always_excluded(path) or deselected(path)
+
     # Stream writes directly to the remote ZIP file
     remote_file: Any
     with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file:
@@ -86,6 +96,8 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str
     to selectively download only part of the remote workdir.
     """
 
+    # XXX FIXME: --no-upload: how does this interact with downloads?
+
     excluded = path_matcher([
         # Jobs don't use .git and it may also contain information that
         # shouldn't be uploaded.