From 1a1fc20b0f4b71b3ba6ed166b1a0ca3cd039480d Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 18 Jul 2024 10:48:42 -0700 Subject: [PATCH 1/7] =?UTF-8?q?remote.nextstrain=5Fdot=5Forg:=20Refactor?= =?UTF-8?q?=20download=20destination=20logic=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …into a separate function so that I can add doctests to it. --- nextstrain/cli/remote/nextstrain_dot_org.py | 34 ++++++++++++--------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index b64f515a..60c7f77f 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -383,20 +383,7 @@ def download(url: URL, local_path: Path, recursively: bool = False, dry_run: boo raise UserError(f"Path {path} does not seem to be a {subresource}.") # Local destination - if local_path.is_dir(): - local_name = ( - str(resource.path.relative_to(namespace(resource.path))) - .lstrip("/") - .replace("/", "_")) - - destination = local_path / local_name - else: - destination = local_path - - if not subresource.primary: - destination = destination.with_name(f"{destination.with_suffix('').name}_{sidecar_suffix(subresource.media_type)}") - - destination = destination.with_suffix(subresource.file_extension) + destination = _download_destination(resource, subresource, local_path) yield source, destination @@ -409,6 +396,25 @@ def download(url: URL, local_path: Path, recursively: bool = False, dry_run: boo local_file.write(chunk) +def _download_destination(resource: Resource, subresource: SubResource, local_path: Path) -> Path: + if local_path.is_dir(): + local_name = ( + str(resource.path.relative_to(namespace(resource.path))) + .lstrip("/") + .replace("/", "_")) + + destination = local_path / local_name + else: + destination = local_path + + if not subresource.primary: + destination = destination.with_name(f"{destination.with_suffix('').name}_{sidecar_suffix(subresource.media_type)}") + + destination = destination.with_suffix(subresource.file_extension) + + return destination + + def ls(url: URL) -> Iterable[str]: """ List the datasets and narratives deployed at the given nextstrain.org *url*. From 200003f81c1f111958a2e95a719da9d8a333c56f Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 18 Jul 2024 11:29:25 -0700 Subject: [PATCH 2/7] remote.nextstrain_dot_org: Add doctests for download destinations Demonstrate and validate our basic expectations. In the next commit, I'll add a failing test for a bug. --- nextstrain/cli/remote/nextstrain_dot_org.py | 43 +++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index 60c7f77f..9b611e08 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -397,6 +397,49 @@ def download(url: URL, local_path: Path, recursively: bool = False, dry_run: boo def _download_destination(resource: Resource, subresource: SubResource, local_path: Path) -> Path: + """ + These examples show all potential file names. + + >>> def names(r, d = Path.cwd()): + ... return [_download_destination(r, s, d).name for s in r.subresources] + + Dataset files. + + >>> names(Dataset("/ncov/open/global/6m")) # doctest: +NORMALIZE_WHITESPACE + ['ncov_open_global_6m.json', + 'ncov_open_global_6m_root-sequence.json', + 'ncov_open_global_6m_tip-frequencies.json', + 'ncov_open_global_6m_measurements.json'] + + Narrative files. + + >>> names(Narrative("/narratives/ncov/sit-rep/2020-01-23")) # doctest: +NORMALIZE_WHITESPACE + ['ncov_sit-rep_2020-01-23.md'] + + Namespace is omitted. + + >>> names(Dataset("/groups/blab/ncov-king-county/omicron")) # doctest: +NORMALIZE_WHITESPACE + ['ncov-king-county_omicron.json', + 'ncov-king-county_omicron_root-sequence.json', + 'ncov-king-county_omicron_tip-frequencies.json', + 'ncov-king-county_omicron_measurements.json'] + + When a non-directory local path is given. + + >>> names(Dataset("/mpox/clade-IIb"), Path("foo")) # doctest: +NORMALIZE_WHITESPACE + ['foo.json', + 'foo_root-sequence.json', + 'foo_tip-frequencies.json', + 'foo_measurements.json'] + + When a non-directory local path with extension is given. + + >>> names(Dataset("/mpox/clade-IIb"), Path("bar.json")) # doctest: +NORMALIZE_WHITESPACE + ['bar.json', + 'bar_root-sequence.json', + 'bar_tip-frequencies.json', + 'bar_measurements.json'] + """ if local_path.is_dir(): local_name = ( str(resource.path.relative_to(namespace(resource.path))) From 3a754d8d63a68955cf969e7523a34b7305402719 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 23 Jul 2024 12:43:35 -0700 Subject: [PATCH 3/7] remote.nextstrain_dot_org: Add failing doctests for download dots-in-destination bug The root of the problem is the same: dots are not being handled properly. They're assumed to always indicate an extension, but we can and should support non-extension dots. Fails with: Expected: ['mpox.clade-IIb.json', 'mpox.clade-IIb_root-sequence.json', 'mpox.clade-IIb_tip-frequencies.json', 'mpox.clade-IIb_measurements.json'] Got: ['mpox.json', 'mpox_root-sequence.json', 'mpox_tip-frequencies.json', 'mpox_measurements.json'] and Expected: ['2022.04.29-ncov_omicron-BA-two.json', '2022.04.29-ncov_omicron-BA-two_root-sequence.json', '2022.04.29-ncov_omicron-BA-two_tip-frequencies.json', '2022.04.29-ncov_omicron-BA-two_measurements.json'] Got: ['2022.04.json', '2022.json', '2022.json', '2022.json'] The latter failure reproduces a user report of issues downloading. The former was a closely-related failure case I noticed while debugging the latter. Related-to: --- nextstrain/cli/remote/nextstrain_dot_org.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index 9b611e08..f1646025 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -439,6 +439,22 @@ def _download_destination(resource: Resource, subresource: SubResource, local_pa 'bar_root-sequence.json', 'bar_tip-frequencies.json', 'bar_measurements.json'] + + When a local path with non-extension dotted segment is given. + + >>> names(Dataset("/mpox/clade-IIb"), Path("mpox.clade-IIb")) # doctest: +NORMALIZE_WHITESPACE + ['mpox.clade-IIb.json', + 'mpox.clade-IIb_root-sequence.json', + 'mpox.clade-IIb_tip-frequencies.json', + 'mpox.clade-IIb_measurements.json'] + + When there are dots in the remote dataset name. + + >>> names(Dataset("/groups/niph/2022.04.29-ncov/omicron-BA-two")) # doctest: +NORMALIZE_WHITESPACE + ['2022.04.29-ncov_omicron-BA-two.json', + '2022.04.29-ncov_omicron-BA-two_root-sequence.json', + '2022.04.29-ncov_omicron-BA-two_tip-frequencies.json', + '2022.04.29-ncov_omicron-BA-two_measurements.json'] """ if local_path.is_dir(): local_name = ( From 0655ba29c8c7719e1690777c243cae6b03150e6d Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 23 Jul 2024 12:44:26 -0700 Subject: [PATCH 4/7] remote.nextstrain_dot_org: Fix download dots-in-destination bug Avoids careless Path.with_suffix() usage. Resolves: --- nextstrain/cli/remote/nextstrain_dot_org.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index f1646025..69d45b1d 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -464,12 +464,24 @@ def _download_destination(resource: Resource, subresource: SubResource, local_pa destination = local_path / local_name else: - destination = local_path + # We assume a bit about subresource ordering here, so assert it. Down + # the road, it'd be better to enforce it structurally in Resource. + # -trs, 23 July 2024 + assert resource.subresources[0].primary, "first subresource is primary" + assert all(not s.primary for s in resource.subresources[1:]), "subsequent subresources are not primary" + + # Strip the suffix provided by the user *iff* it matches our expected + # *primary* extension; otherwise we assume they're intending to include + # dots in their desired filename. + if local_path.suffix == resource.subresources[0].file_extension: + destination = local_path.with_suffix('') + else: + destination = local_path if not subresource.primary: - destination = destination.with_name(f"{destination.with_suffix('').name}_{sidecar_suffix(subresource.media_type)}") + destination = destination.with_name(f"{destination.name}_{sidecar_suffix(subresource.media_type)}") - destination = destination.with_suffix(subresource.file_extension) + destination = destination.with_name(destination.name + subresource.file_extension) return destination From c3331c82c91636c0d09efd0c5efafa21928742a8 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 23 Jul 2024 13:01:58 -0700 Subject: [PATCH 5/7] remote.nextstrain_dot_org: Add failing doctests for destination sidecar suffix handling Mixed subresource extensions work due to the previous bug fix in "remote.nextstrain_dot_org: Fix download dots-in-destination bug" (5d9533f) but empty sidecar suffixes on non-primary subresources do not. I noted this edge case when working on other fixes in this code. We don't currently run into this edge case, but it's good to consider and prevent anyway to avoid future surprises if we ever do. Our data model makes it possible and it's not an unreasonable use case. The first test fails with: Expected: ['baz.md', 'baz.json', 'baz_root-sequence.json'] Got: ['baz.md', 'baz_.json', 'baz_root-sequence.json'] and the subsequent similar tests fail similarly. --- nextstrain/cli/remote/nextstrain_dot_org.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index 69d45b1d..cc1d1d57 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -455,6 +455,26 @@ def _download_destination(resource: Resource, subresource: SubResource, local_pa '2022.04.29-ncov_omicron-BA-two_root-sequence.json', '2022.04.29-ncov_omicron-BA-two_tip-frequencies.json', '2022.04.29-ncov_omicron-BA-two_measurements.json'] + + When subresources don't share the same extension and may not have a sidecar + suffix. This is a hypothetical (though possible) use case for now, but + demonstrates an edge case to consider in the code below. + + >>> r = Resource("/foo/bar") + >>> r.subresources = [ + ... SubResource("text/vnd.nextstrain.narrative+markdown", ".md", True), + ... SubResource("application/vnd.nextstrain.dataset.main+json", ".json"), + ... SubResource("application/vnd.nextstrain.dataset.root-sequence+json", ".json"), + ... ] + + >>> names(r, Path("baz")) + ['baz.md', 'baz.json', 'baz_root-sequence.json'] + + >>> names(r, Path("baz.md")) + ['baz.md', 'baz.json', 'baz_root-sequence.json'] + + >>> names(r, Path("baz.bam")) + ['baz.bam.md', 'baz.bam.json', 'baz.bam_root-sequence.json'] """ if local_path.is_dir(): local_name = ( From 62f01037aabf38b1f94abc2a9831344844914c41 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 23 Jul 2024 13:03:53 -0700 Subject: [PATCH 6/7] remote.nextstrain_dot_org: Fix bug in destination sidecar suffix handling Don't assume that all non-primary subresources will have sidecar suffixes. --- nextstrain/cli/remote/nextstrain_dot_org.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index cc1d1d57..c57c83d9 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -498,8 +498,8 @@ def _download_destination(resource: Resource, subresource: SubResource, local_pa else: destination = local_path - if not subresource.primary: - destination = destination.with_name(f"{destination.name}_{sidecar_suffix(subresource.media_type)}") + if not subresource.primary and (suffix := sidecar_suffix(subresource.media_type)): + destination = destination.with_name(f"{destination.name}_{suffix}") destination = destination.with_name(destination.name + subresource.file_extension) From 5660acd91db8754f14ff949c7cb053f1347b1f26 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 23 Jul 2024 13:26:46 -0700 Subject: [PATCH 7/7] CHANGES: Document `nextstrain remote download` destination file name bug fixes --- CHANGES.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 1ed8d175..270cc06e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -28,6 +28,15 @@ development source code and as such may not be routinely kept up to date. Batch job. ([#374](https://github.com/nextstrain/cli/pull/374)) +## Bug fixes + +* `nextstrain remote download` now produces the expected local file names when + there are periods (dots) in the remote dataset name (e.g. `nextstrain remote + download /a/b.c/d` now produces `a_b.c_d.json` instead of `a_b.json`) and + when there are periods in the given local file name (e.g. `nextstrain remote + download /x/y/z x.y.z` now produces `x.y.z.json` instead of `x.y.json`). + ([#381](https://github.com/nextstrain/cli/pull/381)) + # 8.4.0 (29 May 2024)