Bsweger/expand moto fixtures (#66)

* Update test fixture used to simulate AWS S3. Originally, the s3_setup fixture created in conftest.py was designed to unit test cladetime's ability to pull the correct versionId of S3 objects when provided with a specific date. Thus, the content of the objects was irrelevant. Since then, we've added Cladetime features that also require testing the content of the files on S3. This changeset updates the s3_setup pytest fixture to include realistic metadata, sequence, and ncov_metadata files. Rather than using file content to test the version, we can now check a metadata field called "version". * Don't use moto fixture when directly accessing URLs via Polars. This seems obvious in hindsight, but for .zst files, the sequence.get_metadata function uses polars to access URLs directly (via scan_csv). Polars uses fsspec to open remote files, so if we pass it a URL to a mock, moto-created S3 bucket, it will simply try to access a real S3 bucket (hence the 403 errors). The moto setup works for .xz files, because in that case, the actual file-handling is done by requests, which then feeds the data to polars. * Add more checks to sequence metadata obtained via URL. These additional checks do some basic asserts to ensure the schema of the metadata columns used by Cladetime and to ensure completeness/uniqueness of the strain column (which acts, essentially, as a primary key).
reichlab · Dec 6, 2024 · cba8c60 · cba8c60
1 parent 290cc5d
commit cba8c60
Show file tree

Hide file tree

Showing 10 changed files with 69 additions and 75 deletions.
diff --git a/src/cladetime/sequence.py b/src/cladetime/sequence.py
@@ -70,7 +70,7 @@ def get_metadata(
         # get sequence metadata from a URL
         file_suffix = Path(urlparse(metadata_url).path).suffix
         if file_suffix in [".tsv", ".zst"]:
-            metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)
+            metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows, infer_schema_length=100000)
         elif file_suffix == ".xz":
             # pytyon's lzma module doesn't support opening via HTTP, so use requests
             # to download the file in chunks and then decompress it
@@ -83,7 +83,7 @@ def get_metadata(
                         decompressed_chunk = decompressor.decompress(chunk)
                         buffer.write(decompressed_chunk)
                 buffer.seek(0)
-                metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows)
+                metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows, infer_schema_length=100000)
         else:
             raise ValueError(f"Unsupported compression type: {file_suffix}")
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,6 +1,5 @@
-import json
-import lzma
 from datetime import datetime, timezone
+from pathlib import Path
 
 import boto3
 import pytest
@@ -11,6 +10,15 @@
 from cladetime.util.config import Config
 
 
+@pytest.fixture
+def moto_file_path() -> Path:
+    """
+    Return path to the unit test files.
+    """
+    moto_file_path = Path(__file__).parent.joinpath("data").joinpath("moto_fixture")
+    return moto_file_path
+
+
 @pytest.fixture(scope="function")
 def demo_mode(monkeypatch):
     "Set demo mode to True for tests using the Nextstrain 100K sequence files."
@@ -37,22 +45,13 @@ def test_sequences():
     return (file_name, set(sequences))
 
 
-@pytest.fixture
-def ncov_metadata():
-    return {
-        "schema_version": "v1",
-        "nextclade_dataset_name": "SARS-CoV-2",
-        "nextclade_dataset_version": "",
-    }
-
-
 @pytest.fixture
 def s3_object_keys():
     return {
-        "sequence_metadata": "data/object-key/metadata.tsv.zst",
-        "sequence_metadata_xz": "data/object-key/metadata.tsv.xz",
-        "sequence": "data/object-key/sequences.fasta.zst",
-        "ncov_metadata": "data/object-key/metadata_version.json",
+        "sequence_metadata_zst": "data/metadata.tsv.zst",
+        "sequence_metadata_xz": "data/metadata.tsv.xz",
+        "sequences_xz": "data/sequences.fasta.xz",
+        "ncov_metadata": "data/metadata_version.json",
     }
 
 
@@ -65,7 +64,7 @@ def mock_session(mocker):
 
 
 @pytest.fixture
-def s3_setup(s3_object_keys, ncov_metadata):
+def s3_setup(moto_file_path, s3_object_keys):
     """
     Setup mock S3 bucket with versioned objects that represent testing files for
     sequence data, sequence metadata, and ncov pipeline metadata.
@@ -93,27 +92,12 @@ def s3_setup(s3_object_keys, ncov_metadata):
         # Add versioned sequence, sequence metadata, and ncov metadata test objects
         versions = ["2023-01-01 03:05:01", "2023-02-05 03:33:06", "2023-02-05 14:33:06", "2023-03-22 22:55:12"]
         for i, version in enumerate(versions, start=1):
-            for key, value in s3_object_keys.items():
-                if key == "ncov_metadata":
-                    ncov_metadata["nextclade_dataset_version"] = f"version-{i}"
-                    ncov_metadata["nextclade_dataset_name"] = "sars-cov-2"
-                    ncov_metadata["nextclade_dataset_name_full"] = "data/clades"
-                    ncov_metadata["nextclade_version"] = "nexclade 3.8.2"
-                    ncov_metadata["nextclade_version_num"] = "3.8.2"
-                    ncov_metadata["greeting"] = "hello from pytest and moto"
-                    content = json.dumps(ncov_metadata)
-                elif key == "sequence_metadata_xz":
-                    content = lzma.compress(str.encode(f"{value} version {i}"))
-                else:
-                    content = f"{value} version {i}"
-                # use freezegun to override system date, which in
-                # turn sets S3 object version LastModified date
-                with freeze_time(version):
-                    s3_client.put_object(
-                        Bucket=bucket_name,
-                        Key=value,
-                        Body=content,
-                    )
+            extra_args = {"Metadata": {"version": str(i)}}
+            # use freezegun to override system date, which in
+            # turn sets S3 object version LastModified date
+            with freeze_time(version):
+                for file in moto_file_path.iterdir():
+                    s3_client.upload_file(file, bucket_name, f"data/{file.name}", ExtraArgs=extra_args)
 
         yield s3_client, bucket_name, s3_object_keys
 
@@ -127,8 +111,8 @@ def test_config(s3_setup):
     test_config = Config()
     test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc)
     test_config.nextstrain_ncov_bucket = "versioned-bucket"
-    test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"]
-    test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"]
+    test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata_zst"]
+    test_config.nextstrain_genome_sequence_key = s3_object_keys["sequences_xz"]
     test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"]
 
     return test_config
diff --git a/tests/data/README.md b/tests/data/README.md
@@ -2,11 +2,12 @@
 
 This directory contains test files used by CladeTime's test suite.
 
+* `moto_fixture` directory contains files used when recreating Nextstrain/Nextclade data in the moto mocked S3 bucket
 * `test_metadata.tsv` was used to test `get_clade_list` before that functionality moved to variant-nowcast-hub
 * `metadata.tsv.xz` and `metadata.tsv.xz` are used to test setting CladeTime's sequence_metadata property.
 * `test_sequence.xz` is used to test the sequence filter function
-* `test_sequences.fasta`, `test_sequences.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
-* `test_sequences_evolving.fasta` is used to test clade assignments with prior reference trees
+* `test_sequences.fasta`, `test_sequences_fake.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
+* `test_sequences_updated.fasta` is used to test clade assignments with prior reference trees
   * it contains 3 sequence strains with clade assignments that changed between 2024-08-02 and 2024-11-07
   * differing clade assignments were determined by comparing the 2024-08-02 and 2024-11-07 versions of Nexstrain's sequence metadata
   * `USA/VA-CDC-LC1109961/2024` is assigned to `24C` as of 2024-08-02 and `24E` as of 2024-11-07

diff --git a/tests/data/moto_fixture/metadata.tsv.xz b/tests/data/moto_fixture/metadata.tsv.xz
diff --git a/tests/data/moto_fixture/metadata.tsv.zst b/tests/data/moto_fixture/metadata.tsv.zst
diff --git a/tests/data/moto_fixture/metadata_version.json b/tests/data/moto_fixture/metadata_version.json
@@ -0,0 +1 @@
+{"schema_version":"v1","nextclade_version":"nextclade 3.8.2","nextclade_dataset_name":"SARS-CoV-2","nextclade_dataset_version":"2024-11-19--14-18-53Z","nextclade_tsv_sha256sum":"1800155490bd925a85fbcb4a46d19c72311a0ed6d1cd58d7d26899673cca83f1","metadata_tsv_sha256sum":"dae40f81f1cef7cb4a246c4ad483d20bda91ed3c79f7bfb81de4f67cd4797156"}
diff --git a/tests/data/moto_fixture/sequences.fasta.xz b/tests/data/moto_fixture/sequences.fasta.xz
diff --git a/tests/unit/test_cladetime.py b/tests/unit/test_cladetime.py
@@ -114,27 +114,27 @@ def test_cladetime_future_date():
 
 
 @pytest.mark.parametrize(
-    "sequence_as_of, expected_content",
+    "sequence_as_of, expected_metadata",
     [
         (
             "2024-09-01",
-            "version 4",
+            {"version": "4"},
         ),
         (
             None,
-            "version 4",
+            {"version": "4"},
         ),
         (
             datetime(2023, 2, 5, 5, 55),
-            "version 2",
+            {"version": "2"},
         ),
         (
             datetime(2023, 2, 5, 1, 22),
-            "version 1",
+            {"version": "1"},
         ),
     ],
 )
-def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content):
+def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_metadata):
     s3_client, bucket_name, s3_object_keys = s3_setup
 
     mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
@@ -147,7 +147,7 @@ def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content)
                 key = parsed_url.path.strip("/")
                 version_id = parse_qs(parsed_url.query)["versionId"][0]
                 object = s3_client.get_object(Bucket=bucket_name, Key=key, VersionId=version_id)
-                assert expected_content in object["Body"].read().decode("utf-8").lower()
+                assert object.get("Metadata") == expected_metadata
 
             if ct.sequence_as_of < test_config.nextstrain_min_ncov_metadata_date:
                 assert ct.url_ncov_metadata is None
@@ -170,7 +170,6 @@ def test_cladetime_ncov_metadata(s3_setup, s3_object_keys, test_config):
             )
             ct.url_ncov_metadata = presigned_url
 
-    assert ct.ncov_metadata.get("nextclade_dataset_version") == "version-4"
     assert ct.ncov_metadata.get("nextclade_dataset_name_full") == "nextstrain/sars-cov-2/wuhan-hu-1/orfs"
     assert ct.ncov_metadata.get("nextclade_version_num") == "3.8.2"
 

diff --git a/tests/unit/test_sequence.py b/tests/unit/test_sequence.py
@@ -53,41 +53,49 @@ def test_get_metadata(test_file_path, metadata_file):
     assert expected_cols.issubset(metadata_cols)
 
 
-def test_get_metadata_url(s3_setup):
+def test_get_metadata_url(s3_setup, test_file_path):
     """
     Test get_metadata when used with an S3 URL instead of a local file.
     Needs additional research into moto and S3 url access.
     """
     s3_client, bucket_name, s3_object_keys = s3_setup
 
-    # get metadata file from S3 using ZSTD compression
-    presigned_url = s3_client.generate_presigned_url(
-        "get_object",
-        Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata"]},
-        ExpiresIn=3600,
-    )
-    metadata = sequence.get_metadata(metadata_url=presigned_url)
-    assert isinstance(metadata, pl.LazyFrame)
-    # ZNK 2024-11-25: I would like to test this, but I am not sure what the
-    #   output should be and I am getting 403: no body errors with this.
-    # expected_metadata = pl.DataFrame(
-    #         {"data/object-key/metadata.tsv.zst version 4": []}
-    #         ).cast({"data/object-key/metadata.tsv.zst version 4": str})
-    # assert_frame_equal(expected_metadata, metadata.collect_schema(), check_column_order=False, check_row_order=False)
-
-    # get metadata file from S3 using XZ compression
+    # For .zst files, get_metadata uses polars to access the file directly via scan_csv
+    # However, that is difficult to test, because polars doesn't use requests or boto
+    # under the hood, so it doesn't work with moto. Thus, this hacky test passes a
+    # test file path as the metadata_url param.
+    test_file = test_file_path / "metadata.tsv.zst"
+    metadata = sequence.get_metadata(metadata_url=str(test_file))
+    # ensure lazyframe can be collected and check its shape and columns
+    metadata_df = metadata.collect()
+    assert metadata_df.shape == (99373, 58)
+    # focus on a handful of columns that an integral to cladetime
+    metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
+    # strain column is required and should be unique
+    assert metadata_df.select("strain").n_unique() == len(metadata_df)
+    # all columns should have a string data type
+    for data_type in metadata_df.schema.to_python().values():
+        assert data_type is str
+
+    # Get metadata file from S3 using XZ compression. Here we can use a presigned S3 URL
+    # because for .xz files, get_metadata uses requests to download the file in chunks
+    # before polars processes it.
     presigned_url = s3_client.generate_presigned_url(
         "get_object",
         Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata_xz"]},
         ExpiresIn=3600,
     )
     metadata = sequence.get_metadata(metadata_url=presigned_url)
-    assert isinstance(metadata, pl.LazyFrame)
-    expected_metadata = pl.DataFrame(
-            {"data/object-key/metadata.tsv.xz version 4": []}
-            ).cast({"data/object-key/metadata.tsv.xz version 4": str})
-
-    assert_frame_equal(expected_metadata, metadata.collect(), check_column_order=False, check_row_order=False)
+    # ensure lazyframe can be collected and check its shape and columns
+    metadata_df = metadata.collect()
+    assert metadata_df.shape == (99373, 58)
+    # focus on a handful of columns that an integral to cladetime
+    metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
+    # strain column is required and should be unique
+    assert metadata_df.select("strain").n_unique() == len(metadata_df)
+    # all columns should have a string data type
+    for data_type in metadata_df.schema.to_python().values():
+        assert data_type is str
 
 
 def test_filter_metadata():

diff --git a/tests/unit/util/test_reference.py b/tests/unit/util/test_reference.py
@@ -7,14 +7,15 @@ def test__get_s3_object_url(s3_setup):
     s3_client, bucket_name, s3_object_keys = s3_setup
 
     target_date = datetime.strptime("2023-02-15", "%Y-%m-%d").replace(tzinfo=timezone.utc)
-    object_key = s3_object_keys["sequence_metadata"]
+    object_key = s3_object_keys["sequence_metadata_zst"]
 
     version_id, version_url = _get_s3_object_url(bucket_name, object_key, target_date)
 
     assert version_id is not None
     s3_object = s3_client.get_object(Bucket=bucket_name, Key=object_key, VersionId=version_id)
     last_modified = s3_object["LastModified"]
 
+    assert s3_object.get("Metadata") == {"version": "3"}
     assert last_modified <= target_date
     assert last_modified == datetime.strptime("2023-02-05 14:33:06", "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
     assert version_url == f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"schema_version":"v1","nextclade_version":"nextclade 3.8.2","nextclade_dataset_name":"SARS-CoV-2","nextclade_dataset_version":"2024-11-19--14-18-53Z","nextclade_tsv_sha256sum":"1800155490bd925a85fbcb4a46d19c72311a0ed6d1cd58d7d26899673cca83f1","metadata_tsv_sha256sum":"dae40f81f1cef7cb4a246c4ad483d20bda91ed3c79f7bfb81de4f67cd4797156"}