Skip to content

Commit

Permalink
Bsweger/expand moto fixtures (#66)
Browse files Browse the repository at this point in the history
* Update test fixture used to simulate AWS S3

Originally, the s3_setup fixture created in conftest.py
was designed to unit test cladetime's ability to pull
the correct versionId of S3 objects when provided with
a specific date. Thus, the content of the objects was
irrelevant.

Since then, we've added Cladetime features that also require
testing the content of the files on S3. This changeset
updates the s3_setup pytest fixture to include realistic
metadata, sequence, and ncov_metadata files. Rather than
using file content to test the version, we can now
check a metadata field called "version".

* Don't use moto fixture when directly accessing URLs via Polars

This seems obvious in hindsight, but for .zst files, the
sequence.get_metadata function uses polars to access URLs
directly (via scan_csv). Polars uses fsspec to open remote
files, so if we pass it a url to a mock, moto-created S3
bucket, it will simply try to access a real S3 bucket
(hence the 403 errors)

The moto setup works for .xz files, because in that case,
the actual file-handling is done by requests, which then
feeds the data to polars.

* Add more checks to sequence metadata obtained via URL

These additional checks do some basic asserts to ensure
the schema of the metadata columns used by Cladetime
and to ensure completeness/uniqueness of the strain
column (which acts, essentially, as a primary key)
  • Loading branch information
bsweger authored Dec 6, 2024
1 parent 290cc5d commit cba8c60
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 75 deletions.
4 changes: 2 additions & 2 deletions src/cladetime/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_metadata(
# get sequence metadata from a URL
file_suffix = Path(urlparse(metadata_url).path).suffix
if file_suffix in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)
metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows, infer_schema_length=100000)
elif file_suffix == ".xz":
# Python's lzma module doesn't support opening via HTTP, so use requests
# to download the file in chunks and then decompress it
Expand All @@ -83,7 +83,7 @@ def get_metadata(
decompressed_chunk = decompressor.decompress(chunk)
buffer.write(decompressed_chunk)
buffer.seek(0)
metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows)
metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows, infer_schema_length=100000)
else:
raise ValueError(f"Unsupported compression type: {file_suffix}")

Expand Down
62 changes: 23 additions & 39 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import lzma
from datetime import datetime, timezone
from pathlib import Path

import boto3
import pytest
Expand All @@ -11,6 +10,15 @@
from cladetime.util.config import Config


@pytest.fixture
def moto_file_path() -> Path:
    """
    Locate the ``data/moto_fixture`` directory that sits next to this file.
    """
    # Resolve relative to this module rather than the working directory.
    return Path(__file__).parent / "data" / "moto_fixture"


@pytest.fixture(scope="function")
def demo_mode(monkeypatch):
"Set demo mode to True for tests using the Nextstrain 100K sequence files."
Expand All @@ -37,22 +45,13 @@ def test_sequences():
return (file_name, set(sequences))


@pytest.fixture
def ncov_metadata():
return {
"schema_version": "v1",
"nextclade_dataset_name": "SARS-CoV-2",
"nextclade_dataset_version": "",
}


@pytest.fixture
def s3_object_keys():
return {
"sequence_metadata": "data/object-key/metadata.tsv.zst",
"sequence_metadata_xz": "data/object-key/metadata.tsv.xz",
"sequence": "data/object-key/sequences.fasta.zst",
"ncov_metadata": "data/object-key/metadata_version.json",
"sequence_metadata_zst": "data/metadata.tsv.zst",
"sequence_metadata_xz": "data/metadata.tsv.xz",
"sequences_xz": "data/sequences.fasta.xz",
"ncov_metadata": "data/metadata_version.json",
}


Expand All @@ -65,7 +64,7 @@ def mock_session(mocker):


@pytest.fixture
def s3_setup(s3_object_keys, ncov_metadata):
def s3_setup(moto_file_path, s3_object_keys):
"""
Setup mock S3 bucket with versioned objects that represent testing files for
sequence data, sequence metadata, and ncov pipeline metadata.
Expand Down Expand Up @@ -93,27 +92,12 @@ def s3_setup(s3_object_keys, ncov_metadata):
# Add versioned sequence, sequence metadata, and ncov metadata test objects
versions = ["2023-01-01 03:05:01", "2023-02-05 03:33:06", "2023-02-05 14:33:06", "2023-03-22 22:55:12"]
for i, version in enumerate(versions, start=1):
for key, value in s3_object_keys.items():
if key == "ncov_metadata":
ncov_metadata["nextclade_dataset_version"] = f"version-{i}"
ncov_metadata["nextclade_dataset_name"] = "sars-cov-2"
ncov_metadata["nextclade_dataset_name_full"] = "data/clades"
ncov_metadata["nextclade_version"] = "nexclade 3.8.2"
ncov_metadata["nextclade_version_num"] = "3.8.2"
ncov_metadata["greeting"] = "hello from pytest and moto"
content = json.dumps(ncov_metadata)
elif key == "sequence_metadata_xz":
content = lzma.compress(str.encode(f"{value} version {i}"))
else:
content = f"{value} version {i}"
# use freezegun to override system date, which in
# turn sets S3 object version LastModified date
with freeze_time(version):
s3_client.put_object(
Bucket=bucket_name,
Key=value,
Body=content,
)
extra_args = {"Metadata": {"version": str(i)}}
# use freezegun to override system date, which in
# turn sets S3 object version LastModified date
with freeze_time(version):
for file in moto_file_path.iterdir():
s3_client.upload_file(file, bucket_name, f"data/{file.name}", ExtraArgs=extra_args)

yield s3_client, bucket_name, s3_object_keys

Expand All @@ -127,8 +111,8 @@ def test_config(s3_setup):
test_config = Config()
test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc)
test_config.nextstrain_ncov_bucket = "versioned-bucket"
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"]
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata_zst"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequences_xz"]
test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"]

return test_config
5 changes: 3 additions & 2 deletions tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

This directory contains test files used by CladeTime's test suite.

* `moto_fixture` directory contains files used when recreating Nextstrain/Nextclade data in the moto mocked S3 bucket
* `test_metadata.tsv` was used to test `get_clade_list` before that functionality moved to variant-nowcast-hub
* `metadata.tsv.zst` and `metadata.tsv.xz` are used to test setting CladeTime's sequence_metadata property.
* `test_sequence.xz` is used to test the sequence filter function
* `test_sequences.fasta`, `test_sequences.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
* `test_sequences_evolving.fasta` is used to test clade assignments with prior reference trees
* `test_sequences.fasta`, `test_sequences_fake.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
* `test_sequences_updated.fasta` is used to test clade assignments with prior reference trees
* it contains 3 sequence strains with clade assignments that changed between 2024-08-02 and 2024-11-07
* differing clade assignments were determined by comparing the 2024-08-02 and 2024-11-07 versions of Nextstrain's sequence metadata
* `USA/VA-CDC-LC1109961/2024` is assigned to `24C` as of 2024-08-02 and `24E` as of 2024-11-07
Expand Down
Binary file added tests/data/moto_fixture/metadata.tsv.xz
Binary file not shown.
Binary file added tests/data/moto_fixture/metadata.tsv.zst
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/moto_fixture/metadata_version.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"schema_version":"v1","nextclade_version":"nextclade 3.8.2","nextclade_dataset_name":"SARS-CoV-2","nextclade_dataset_version":"2024-11-19--14-18-53Z","nextclade_tsv_sha256sum":"1800155490bd925a85fbcb4a46d19c72311a0ed6d1cd58d7d26899673cca83f1","metadata_tsv_sha256sum":"dae40f81f1cef7cb4a246c4ad483d20bda91ed3c79f7bfb81de4f67cd4797156"}
Binary file added tests/data/moto_fixture/sequences.fasta.xz
Binary file not shown.
15 changes: 7 additions & 8 deletions tests/unit/test_cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,27 @@ def test_cladetime_future_date():


@pytest.mark.parametrize(
"sequence_as_of, expected_content",
"sequence_as_of, expected_metadata",
[
(
"2024-09-01",
"version 4",
{"version": "4"},
),
(
None,
"version 4",
{"version": "4"},
),
(
datetime(2023, 2, 5, 5, 55),
"version 2",
{"version": "2"},
),
(
datetime(2023, 2, 5, 1, 22),
"version 1",
{"version": "1"},
),
],
)
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content):
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_metadata):
s3_client, bucket_name, s3_object_keys = s3_setup

mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
Expand All @@ -147,7 +147,7 @@ def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content)
key = parsed_url.path.strip("/")
version_id = parse_qs(parsed_url.query)["versionId"][0]
object = s3_client.get_object(Bucket=bucket_name, Key=key, VersionId=version_id)
assert expected_content in object["Body"].read().decode("utf-8").lower()
assert object.get("Metadata") == expected_metadata

if ct.sequence_as_of < test_config.nextstrain_min_ncov_metadata_date:
assert ct.url_ncov_metadata is None
Expand All @@ -170,7 +170,6 @@ def test_cladetime_ncov_metadata(s3_setup, s3_object_keys, test_config):
)
ct.url_ncov_metadata = presigned_url

assert ct.ncov_metadata.get("nextclade_dataset_version") == "version-4"
assert ct.ncov_metadata.get("nextclade_dataset_name_full") == "nextstrain/sars-cov-2/wuhan-hu-1/orfs"
assert ct.ncov_metadata.get("nextclade_version_num") == "3.8.2"

Expand Down
54 changes: 31 additions & 23 deletions tests/unit/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,41 +53,49 @@ def test_get_metadata(test_file_path, metadata_file):
assert expected_cols.issubset(metadata_cols)


def test_get_metadata_url(s3_setup):
def test_get_metadata_url(s3_setup, test_file_path):
"""
Test get_metadata when used with an S3 URL instead of a local file.
Needs additional research into moto and S3 url access.
"""
s3_client, bucket_name, s3_object_keys = s3_setup

# get metadata file from S3 using ZSTD compression
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata"]},
ExpiresIn=3600,
)
metadata = sequence.get_metadata(metadata_url=presigned_url)
assert isinstance(metadata, pl.LazyFrame)
# ZNK 2024-11-25: I would like to test this, but I am not sure what the
# output should be and I am getting 403: no body errors with this.
# expected_metadata = pl.DataFrame(
# {"data/object-key/metadata.tsv.zst version 4": []}
# ).cast({"data/object-key/metadata.tsv.zst version 4": str})
# assert_frame_equal(expected_metadata, metadata.collect_schema(), check_column_order=False, check_row_order=False)

# get metadata file from S3 using XZ compression
# For .zst files, get_metadata uses polars to access the file directly via scan_csv
# However, that is difficult to test, because polars doesn't use requests or boto
# under the hood, so it doesn't work with moto. Thus, this hacky test passes a
# test file path as the metadata_url param.
test_file = test_file_path / "metadata.tsv.zst"
metadata = sequence.get_metadata(metadata_url=str(test_file))
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
# focus on a handful of columns that are integral to cladetime
metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
# strain column is required and should be unique
assert metadata_df.select("strain").n_unique() == len(metadata_df)
# all columns should have a string data type
for data_type in metadata_df.schema.to_python().values():
assert data_type is str

# Get metadata file from S3 using XZ compression. Here we can use a presigned S3 URL
# because for .xz files, get_metadata uses requests to download the file in chunks
# before polars processes it.
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata_xz"]},
ExpiresIn=3600,
)
metadata = sequence.get_metadata(metadata_url=presigned_url)
assert isinstance(metadata, pl.LazyFrame)
expected_metadata = pl.DataFrame(
{"data/object-key/metadata.tsv.xz version 4": []}
).cast({"data/object-key/metadata.tsv.xz version 4": str})

assert_frame_equal(expected_metadata, metadata.collect(), check_column_order=False, check_row_order=False)
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
# focus on a handful of columns that are integral to cladetime
metadata_df = metadata.collect().select("strain", "date", "country", "division", "location", "clade_nextstrain")
# strain column is required and should be unique
assert metadata_df.select("strain").n_unique() == len(metadata_df)
# all columns should have a string data type
for data_type in metadata_df.schema.to_python().values():
assert data_type is str


def test_filter_metadata():
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/util/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ def test__get_s3_object_url(s3_setup):
s3_client, bucket_name, s3_object_keys = s3_setup

target_date = datetime.strptime("2023-02-15", "%Y-%m-%d").replace(tzinfo=timezone.utc)
object_key = s3_object_keys["sequence_metadata"]
object_key = s3_object_keys["sequence_metadata_zst"]

version_id, version_url = _get_s3_object_url(bucket_name, object_key, target_date)

assert version_id is not None
s3_object = s3_client.get_object(Bucket=bucket_name, Key=object_key, VersionId=version_id)
last_modified = s3_object["LastModified"]

assert s3_object.get("Metadata") == {"version": "3"}
assert last_modified <= target_date
assert last_modified == datetime.strptime("2023-02-05 14:33:06", "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
assert version_url == f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}"

0 comments on commit cba8c60

Please sign in to comment.