Add dataset fixture to ease fetching DataFrames for tests #847

Merged

Changes from all commits (35 commits)
a8f5b84
WIP: First pass, untested
dagardner-nv Apr 4, 2023
b5f4036
wip
dagardner-nv Apr 4, 2023
673d45e
wip
dagardner-nv Apr 5, 2023
ffc3035
wip
dagardner-nv Apr 5, 2023
810a0cf
wip
dagardner-nv Apr 5, 2023
4e323a6
wip
dagardner-nv Apr 5, 2023
97c6edb
wip
dagardner-nv Apr 5, 2023
7e299ea
Remove unused imports and remove unnecessary writes to disk
dagardner-nv Apr 5, 2023
3a5d97c
wip
dagardner-nv Apr 5, 2023
f1a7fef
wip
dagardner-nv Apr 5, 2023
c7450a2
Fix formatting of docstring
dagardner-nv Apr 5, 2023
d9348cd
Remove unused import
dagardner-nv Apr 5, 2023
b2c6685
Cache instances of DatasetLoader, add type hints to class vars
dagardner-nv Apr 5, 2023
134e8fc
Use the get_loader method
dagardner-nv Apr 5, 2023
3927eff
Fix comment
dagardner-nv Apr 5, 2023
e69e413
Replace usage of use_pandas as a way to get test data which had the u…
dagardner-nv Apr 5, 2023
b1f9e43
Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into d…
dagardner-nv Apr 13, 2023
f74bd4b
Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into d…
dagardner-nv Apr 13, 2023
0b704f0
Merge branch 'branch-23.07' into david-datasets-fixture-785
dagardner-nv Apr 13, 2023
9386c8d
Rename DatasetLoader to DatasetManager [no ci]
dagardner-nv Apr 13, 2023
92e1a6c
wip [no ci]
dagardner-nv Apr 14, 2023
0c10a7c
wip
dagardner-nv Apr 14, 2023
e06cf15
Mark dfencoder end to end tests as slow
dagardner-nv Apr 14, 2023
da2fb5c
Move test utils into tests/utils [no ci]
dagardner-nv Apr 14, 2023
f135844
Replace get_loader with just depending on the class being a singleton
dagardner-nv Apr 14, 2023
fdcd008
Remove _dataset_mod
dagardner-nv Apr 14, 2023
1c6e30d
Use absolute paths as cache keys to avoid accidental cache misses
dagardner-nv Apr 14, 2023
1586f05
Rename tests to avoid ambiguity
dagardner-nv Apr 14, 2023
da47969
Remove use_cpp fixture from dataset fixture request
dagardner-nv Apr 14, 2023
ac30fcd
wip
dagardner-nv Apr 14, 2023
7a70c2d
Move init logic to new
dagardner-nv Apr 14, 2023
a43e2c3
Leave a comment explainint __new__ instead of __init__
dagardner-nv Apr 14, 2023
193286e
Add comments explaining fixture behavior
dagardner-nv Apr 14, 2023
638e91a
more tests
dagardner-nv Apr 14, 2023
c555fca
Move classes out of tests/utils/__init__.py to their own modules
dagardner-nv Apr 14, 2023
115 changes: 102 additions & 13 deletions tests/conftest.py
@@ -442,22 +442,111 @@ def chdir_tmpdir(request: pytest.FixtureRequest, tmp_path):
os.chdir(request.config.invocation_dir)


@pytest.fixture(scope="session")
def _filter_probs_df():
from morpheus.io.deserializers import read_file_to_df
from utils import TEST_DIRS
input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv")
yield read_file_to_df(input_file, df_type='cudf')
@pytest.fixture(scope="function")
def dataset(df_type: typing.Literal['cudf', 'pandas']):
"""
Yields a DatasetManager instance with `df_type` as the default DataFrame type.
Users of this fixture can still explicitly request either a cudf or pandas DataFrame via the `cudf` and `pandas`
properties:
```
def test_something(dataset: DatasetManager):
df = dataset["filter_probs.csv"] # type will match the df_type parameter
if dataset.default_df_type == 'pandas':
assert isinstance(df, pd.DataFrame)
else:
assert isinstance(df, cudf.DataFrame)

pdf = dataset.pandas["filter_probs.csv"]
cdf = dataset.cudf["filter_probs.csv"]

```

A test that requests this fixture will be parameterized over the type of DataFrame returned by the DatasetManager.
If a test requests both this fixture and the `use_cpp` fixture, either directly or indirectly via the `config`
fixture, then the test will be parameterized over both df_type=[cudf, pandas] and use_cpp=[True, False]. However,
the df_type=pandas & use_cpp=True combination is removed, as it would be an unsupported use of pandas DataFrames
with the C++ implementation of the message classes.

This behavior can also be overridden by using the `use_cudf`, `use_pandas`, `use_cpp` or `use_python` marks, e.g.:
```
# This test will only run once with C++ enabled and cudf dataframes
@pytest.mark.use_cpp
def test_something(dataset: DatasetManager):
...
# This test will run once for each dataframe type, with C++ disabled both times
@pytest.mark.use_python
def test_something(dataset: DatasetManager):
...
# This test will run twice with C++ mode enabled/disabled, using cudf dataframes both times
@pytest.mark.use_cudf
def test_something(use_cpp: bool, dataset: DatasetManager):
...
# This test will run only once
@pytest.mark.use_cudf
@pytest.mark.use_python
def test_something(dataset: DatasetManager):
...
# This test creates an incompatible combination and will raise a RuntimeError without being executed
@pytest.mark.use_pandas
@pytest.mark.use_cpp
def test_something(dataset: DatasetManager):
...
```

Users who don't want to parameterize over the DataFrame type should use the `dataset_pandas` or `dataset_cudf` fixtures.
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type=df_type)


@pytest.fixture(scope="function")
def filter_probs_df(_filter_probs_df, df_type: typing.Literal['cudf', 'pandas'], use_cpp: bool):
if df_type == 'cudf':
yield _filter_probs_df.copy(deep=True)
elif df_type == 'pandas':
yield _filter_probs_df.to_pandas()
else:
assert False, "Unknown df_type type"
def dataset_pandas():
"""
Yields a DatasetManager instance with pandas as the default DataFrame type.

Note: This fixture won't prevent a user from writing a test that requires C++ mode execution while requesting pandas
DataFrames. This is quite useful for tests like `tests/test_add_scores_stage_pipe.py` where we want to test with both
Python & C++ execution modes, but use pandas to build up the expected DataFrame to validate the test against.

In addition, users can use this fixture to explicitly request a cudf DataFrame as well, allowing for a test
that looks like:
```
@pytest.mark.use_cpp
def test_something(dataset_pandas: DatasetManager):
input_df = dataset_pandas.cudf["filter_probs.csv"] # Feed our source stage a cudf DF

# Perform pandas transformations to mimic the add scores stage
expected_df = dataset_pandas["filter_probs.csv"]
expected_df = expected_df.rename(columns=dict(zip(expected_df.columns, class_labels)))
```
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type='pandas')


@pytest.fixture(scope="function")
def dataset_cudf():
"""
Yields a DatasetManager instance with cudf as the default DataFrame type.

Users who wish to have both cudf and pandas DataFrames can do so with this fixture by using the `pandas` property:
```
def test_something(dataset_cudf: DatasetManager):
cdf = dataset_cudf["filter_probs.csv"]
pdf = dataset_cudf.pandas["filter_probs.csv"]
```
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type='cudf')


@pytest.fixture(scope="function")
def filter_probs_df(dataset, use_cpp: bool):
"""
Shortcut fixture for loading the filter_probs.csv dataset.

Unless your test uses the `use_pandas` or `use_cudf` marks, this fixture will parameterize over the two DataFrame
types. Similarly, unless your test uses the `use_cpp` or `use_python` marks, it will also parameterize over the
execution mode, excluding the combination of C++ execution and pandas DataFrames.
"""
yield dataset["filter_probs.csv"]


def wait_for_camouflage(host="localhost", port=8000, timeout=5):
…
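Not shown in this diff is the `DatasetManager` class itself, which lives in `tests/utils/dataset_manager.py`. The commit messages above describe its key design choices: one cached instance per `df_type` (with the init logic moved into `__new__`), and loaded files keyed by absolute path to avoid accidental cache misses. A minimal sketch of that pattern, assuming bare filenames resolve against `TEST_DIRS.tests_data_dir` and that each lookup returns a deep copy (the real class differs in detail):
```
import os
import typing

from morpheus.io.deserializers import read_file_to_df
from utils import TEST_DIRS


class DatasetManager:
    """Sketch: a per-df_type singleton that caches DataFrames by absolute path."""

    # Class-level caches shared across instances, with type hints on the class vars
    __df_cache: typing.Dict[typing.Tuple[str, str], typing.Any] = {}
    __instances: typing.Dict[str, "DatasetManager"] = {}

    def __new__(cls, df_type: typing.Literal['cudf', 'pandas']):
        # Construction doubles as lookup: repeated calls with the same df_type
        # return the cached instance, which is why __init__ isn't used here.
        try:
            return cls.__instances[df_type]
        except KeyError:
            instance = super().__new__(cls)
            instance._default_df_type = df_type
            cls.__instances[df_type] = instance
            return instance

    @property
    def default_df_type(self) -> str:
        return self._default_df_type

    @property
    def pandas(self) -> "DatasetManager":
        return DatasetManager(df_type='pandas')

    @property
    def cudf(self) -> "DatasetManager":
        return DatasetManager(df_type='cudf')

    def __getitem__(self, file_path: str):
        # Assumption: bare filenames resolve against the shared test data dir
        if not os.path.isabs(file_path):
            file_path = os.path.join(TEST_DIRS.tests_data_dir, file_path)
        key = (os.path.abspath(file_path), self._default_df_type)
        if key not in self.__df_cache:
            self.__df_cache[key] = read_file_to_df(key[0], df_type=key[1])
        # Hand back a copy so one test's mutations can't leak into another
        return self.__df_cache[key].copy(deep=True)
```
The singleton is what makes the `pandas` and `cudf` properties cheap: they can simply "construct" a manager for the other DataFrame type, since construction always returns the cached instance.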
14 changes: 4 additions & 10 deletions tests/dfencoder/test_autoencoder.py
@@ -22,14 +22,14 @@
import torch

from morpheus.config import AEFeatureScalar
from morpheus.io.deserializers import read_file_to_df
from morpheus.models.dfencoder import ae_module
from morpheus.models.dfencoder import autoencoder
from morpheus.models.dfencoder import scalers
from morpheus.models.dfencoder.dataframe import EncoderDataFrame
from utils import TEST_DIRS
from utils.dataset_manager import DatasetManager

# Only pandas and C++ is supported
# Only pandas and Python are supported
pytestmark = [pytest.mark.use_pandas, pytest.mark.use_python]

BIN_COLS = ['ts_anomaly']
@@ -73,15 +73,9 @@ def train_ae():
progress_bar=False)


@pytest.fixture(scope="module")
def _train_df() -> pd.DataFrame:
input_file = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-role-g-validation-data-input.csv")
yield read_file_to_df(input_file, df_type='pandas')


@pytest.fixture(scope="function")
def train_df(_train_df) -> typing.Generator[pd.DataFrame, None, None]:
yield _train_df.copy(deep=True)
def train_df(dataset_pandas: DatasetManager) -> typing.Iterator[pd.DataFrame]:
yield dataset_pandas[os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-role-g-validation-data-input.csv")]


def compare_numeric_features(features, expected_features):
…
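The removed `_train_df`/`train_df` pair existed to load the CSV once per module while handing each test its own deep copy. The one-line replacement relies on the DatasetManager for the same isolation. A hypothetical test illustrating that assumption, i.e. that each lookup returns an independent copy:
```
from utils.dataset_manager import DatasetManager


def test_mutation_does_not_leak(dataset_pandas: DatasetManager):
    df_first = dataset_pandas["filter_probs.csv"]
    df_first[df_first.columns[0]] = None  # mutate our copy in place

    # A second lookup should be unaffected by the mutation above
    df_second = dataset_pandas["filter_probs.csv"]
    assert not df_second.equals(df_first)
```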
1 change: 1 addition & 0 deletions tests/dfencoder/test_dfencoder_distributed_e2e.py
@@ -105,6 +105,7 @@ def cleanup_dist():
torch.distributed.destroy_process_group()


@pytest.mark.slow
@pytest.mark.usefixtures("manual_seed")
def test_dfencoder_distributed_e2e():

…
1 change: 1 addition & 0 deletions tests/dfencoder/test_dfencoder_e2e.py
@@ -78,6 +78,7 @@
LOSS_TOLERANCE_RATIO = 1.25


@pytest.mark.slow
@pytest.mark.usefixtures("manual_seed")
def test_dfencoder_e2e():
# Load data
…
13 changes: 7 additions & 6 deletions tests/test_abp_kafka.py
@@ -23,11 +23,9 @@
import pandas
import pytest

from morpheus.common import FileTypes
from morpheus.config import Config
from morpheus.config import ConfigFIL
from morpheus.config import PipelineModes
from morpheus.io.deserializers import read_file_to_df
from morpheus.io.utils import filter_null_data
from morpheus.pipeline import LinearPipeline
from morpheus.stages.general.monitor_stage import MonitorStage
@@ -41,6 +39,7 @@
from morpheus.utils.compare_df import compare_df
from utils import TEST_DIRS
from utils import write_file_to_kafka
from utils.dataset_manager import DatasetManager

if (typing.TYPE_CHECKING):
from kafka import KafkaConsumer
@@ -54,7 +53,8 @@
@pytest.mark.slow
@pytest.mark.use_python
@mock.patch('tritonclient.grpc.InferenceServerClient')
def test_abp_no_cpp(mock_triton_client,
def test_abp_no_cpp(mock_triton_client: mock.MagicMock,
dataset_pandas: DatasetManager,
config: Config,
kafka_bootstrap_servers: str,
kafka_topics: typing.Tuple[str, str],
@@ -129,7 +129,7 @@ def async_infer(callback=None, **k):

pipe.run()

val_df = read_file_to_df(val_file_name, file_type=FileTypes.Auto, df_type='pandas')
val_df = dataset_pandas[val_file_name]

output_buf = StringIO()
for rec in kafka_consumer:
@@ -150,7 +150,8 @@ def async_infer(callback=None, **k):
@pytest.mark.slow
@pytest.mark.use_cpp
@pytest.mark.usefixtures("launch_mock_triton")
def test_abp_cpp(config,
def test_abp_cpp(config: Config,
dataset_pandas: DatasetManager,
kafka_bootstrap_servers: str,
kafka_topics: typing.Tuple[str, str],
kafka_consumer: "KafkaConsumer"):
@@ -195,7 +196,7 @@

pipe.run()

val_df = read_file_to_df(val_file_name, file_type=FileTypes.Auto, df_type='pandas')
val_df = dataset_pandas[val_file_name]
output_buf = StringIO()
for rec in kafka_consumer:
output_buf.write("{}\n".format(rec.value.decode("utf-8")))
…
17 changes: 9 additions & 8 deletions tests/test_add_classifications_stage.py
@@ -19,14 +19,15 @@

import cudf

from morpheus.config import Config
from morpheus.messages.memory.tensor_memory import TensorMemory
from morpheus.messages.message_meta import MessageMeta
from morpheus.messages.multi_response_message import MultiResponseMessage
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from utils import assert_df_equal
from utils.dataset_manager import DatasetManager


def test_constructor(config):
def test_constructor(config: Config):
config.class_labels = ['frogs', 'lizards', 'toads']

ac = AddClassificationsStage(config)
@@ -64,9 +65,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])

# Same thing but change the probs tensor name
message = MultiResponseMessage(meta=MessageMeta(df),
@@ -75,9 +76,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])

# Fail in missing probs data
message = MultiResponseMessage(meta=MessageMeta(df),
…
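The free `assert_df_equal` helper from `tests/utils` is now reached through `DatasetManager`. Its body isn't part of this diff; the following is a rough sketch of what such a comparison helper typically does, offered purely as an assumption about its behavior, not the actual implementation:
```
import numpy as np


def assert_df_equal(df_to_check, val_to_check) -> bool:
    """Sketch (assumed behavior): compare a DataFrame/Series to an array-like."""
    # Move cudf objects back to host memory before comparing
    if hasattr(df_to_check, "to_pandas"):
        df_to_check = df_to_check.to_pandas()

    # cupy arrays (e.g. tensors from C++ mode) expose .get() for a host copy
    if type(val_to_check).__module__.startswith("cupy"):
        val_to_check = val_to_check.get()

    # The real helper may also normalize shapes and NaN handling
    return np.array_equal(df_to_check.to_numpy(), np.asarray(val_to_check))
```
Because the helper returns a bool rather than raising, the call sites wrap it in a plain `assert`, as in `assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])`.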
2 changes: 1 addition & 1 deletion tests/test_add_classifications_stage_pipe.py
@@ -28,8 +28,8 @@
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from morpheus.stages.postprocess.serialize_stage import SerializeStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from stages.conv_msg import ConvMsg
from utils import assert_results
from utils.stages.conv_msg import ConvMsg


def build_expected(df: pd.DataFrame, threshold: float, class_labels: typing.List[str]):
…
17 changes: 9 additions & 8 deletions tests/test_add_scores_stage.py
@@ -19,15 +19,16 @@

import cudf

from morpheus.config import Config
from morpheus.messages.memory.tensor_memory import TensorMemory
from morpheus.messages.message_meta import MessageMeta
from morpheus.messages.multi_response_message import MultiResponseMessage
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from morpheus.stages.postprocess.add_scores_stage import AddScoresStage
from utils import assert_df_equal
from utils.dataset_manager import DatasetManager


def test_constructor(config):
def test_constructor(config: Config):
config.class_labels = ['frogs', 'lizards', 'toads']
config.feature_length = 12

@@ -62,9 +63,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])

# Same thing but change the probs tensor name
message = MultiResponseMessage(meta=MessageMeta(df),
@@ -73,9 +74,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])

# Fail in missing probs data
message = MultiResponseMessage(meta=MessageMeta(df),
…