Add dataset fixture to ease fetching DataFrames for tests #847

Merged

Changes from all commits (35 commits)
a8f5b84
WIP: First pass, untested
dagardner-nv Apr 4, 2023
b5f4036
wip
dagardner-nv Apr 4, 2023
673d45e
wip
dagardner-nv Apr 5, 2023
ffc3035
wip
dagardner-nv Apr 5, 2023
810a0cf
wip
dagardner-nv Apr 5, 2023
4e323a6
wip
dagardner-nv Apr 5, 2023
97c6edb
wip
dagardner-nv Apr 5, 2023
7e299ea
Remove unused imports and remove unnecessary writes to disk
dagardner-nv Apr 5, 2023
3a5d97c
wip
dagardner-nv Apr 5, 2023
f1a7fef
wip
dagardner-nv Apr 5, 2023
c7450a2
Fix formatting of docstring
dagardner-nv Apr 5, 2023
d9348cd
Remove unused import
dagardner-nv Apr 5, 2023
b2c6685
Cache instances of DatasetLoader, add type hints to class vars
dagardner-nv Apr 5, 2023
134e8fc
Use the get_loader method
dagardner-nv Apr 5, 2023
3927eff
Fix comment
dagardner-nv Apr 5, 2023
e69e413
Replace usage of use_pandas as a way to get test data which had the u…
dagardner-nv Apr 5, 2023
b1f9e43
Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into d…
dagardner-nv Apr 13, 2023
f74bd4b
Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into d…
dagardner-nv Apr 13, 2023
0b704f0
Merge branch 'branch-23.07' into david-datasets-fixture-785
dagardner-nv Apr 13, 2023
9386c8d
Rename DatasetLoader to DatasetManager [no ci]
dagardner-nv Apr 13, 2023
92e1a6c
wip [no ci]
dagardner-nv Apr 14, 2023
0c10a7c
wip
dagardner-nv Apr 14, 2023
e06cf15
Mark dfencoder end to end tests as slow
dagardner-nv Apr 14, 2023
da2fb5c
Move test utils into tests/utils [no ci]
dagardner-nv Apr 14, 2023
f135844
Replace get_loader with just depending on the class being a singleton
dagardner-nv Apr 14, 2023
fdcd008
Remove _dataset_mod
dagardner-nv Apr 14, 2023
1c6e30d
Use absolute paths as cache keys to avoid accidental cache misses
dagardner-nv Apr 14, 2023
1586f05
Rename tests to avoid ambiguity
dagardner-nv Apr 14, 2023
da47969
Remove use_cpp fixture from dataset fixture request
dagardner-nv Apr 14, 2023
ac30fcd
wip
dagardner-nv Apr 14, 2023
7a70c2d
Move init logic to new
dagardner-nv Apr 14, 2023
a43e2c3
Leave a comment explainint __new__ instead of __init__
dagardner-nv Apr 14, 2023
193286e
Add comments explaining fixture behavior
dagardner-nv Apr 14, 2023
638e91a
more tests
dagardner-nv Apr 14, 2023
c555fca
Move classes out of tests/utils/__init__.py to their own modules
dagardner-nv Apr 14, 2023
115 changes: 102 additions & 13 deletions tests/conftest.py
@@ -442,22 +442,111 @@ def chdir_tmpdir(request: pytest.FixtureRequest, tmp_path):
os.chdir(request.config.invocation_dir)


@pytest.fixture(scope="session")
def _filter_probs_df():
from morpheus.io.deserializers import read_file_to_df
from utils import TEST_DIRS
input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv")
yield read_file_to_df(input_file, df_type='cudf')
@pytest.fixture(scope="function")
def dataset(df_type: typing.Literal['cudf', 'pandas']):
"""
Yields a DatasetManager instance with `df_type` as the default DataFrame type.
Users of this fixture can still explicitly request either a cudf or pandas DataFrame via the `cudf` and `pandas`
properties:
```
def test_something(dataset: DatasetManager):
df = dataset["filter_probs.csv"] # type will match the df_type parameter
if dataset.default_df_type == 'pandas':
assert isinstance(df, pd.DataFrame)
else:
assert isinstance(df, cudf.DataFrame)

pdf = dataset.pandas["filter_probs.csv"]
cdf = dataset.cudf["filter_probs.csv"]

```

A test that requests this fixture will be parameterized over the type of DataFrame returned by the DatasetManager.
If a test requests both this fixture and the `use_cpp` fixture, either directly or indirectly via the `config`
fixture, then the test will be parameterized over both df_type=[cudf, pandas] and use_cpp=[True, False]. However,
the df_type=pandas & use_cpp=True combination is removed, as it would be an unsupported use of pandas DataFrames
with the C++ implementation of the message classes.

This behavior can also be overridden by using the `use_cudf`, `use_pandas`, `use_cpp` or `use_python` marks, e.g.:
```
# This test will only run once with C++ enabled and cudf dataframes
@pytest.mark.use_cpp
def test_something(dataset: DatasetManager):
...
# This test will run once for each dataframe type, with C++ disabled both times
@pytest.mark.use_python
def test_something(dataset: DatasetManager):
...
# This test will run twice with C++ mode enabled/disabled, using cudf dataframes both times
@pytest.mark.use_cudf
def test_something(use_cpp: bool, dataset: DatasetManager):
...
# This test will run only once
@pytest.mark.use_cudf
@pytest.mark.use_python
def test_something(dataset: DatasetManager):
...
# This test creates an incompatible combination and will raise a RuntimeError without being executed
@pytest.mark.use_pandas
@pytest.mark.use_cpp
def test_something(dataset: DatasetManager):
...
```

Users who don't want to parameterize over the DataFrame type should use the `dataset_pandas` or `dataset_cudf` fixtures.
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type=df_type)


@pytest.fixture(scope="function")
def filter_probs_df(_filter_probs_df, df_type: typing.Literal['cudf', 'pandas'], use_cpp: bool):
if df_type == 'cudf':
yield _filter_probs_df.copy(deep=True)
elif df_type == 'pandas':
yield _filter_probs_df.to_pandas()
else:
assert False, "Unknown df_type type"
def dataset_pandas():
"""
Yields a DatasetManager instance with pandas as the default DataFrame type.

Note: This fixture won't prevent a user from writing a test that requires C++ mode execution while requesting pandas
DataFrames. This is quite useful for tests like `tests/test_add_scores_stage_pipe.py` where we want to test with both
Python & C++ execution modes, but use pandas to build up the expected DataFrame to validate the test against.

In addition, users can use this fixture to explicitly request a cudf DataFrame as well, allowing for a test
that looks like:
```
@pytest.mark.use_cpp
def test_something(dataset_pandas: DatasetManager):
input_df = dataset_pandas.cudf["filter_probs.csv"] # Feed our source stage a cudf DF

# Perform pandas transformations to mimic the add scores stage
expected_df = dataset_pandas["filter_probs.csv"]
expected_df = expected_df.rename(columns=dict(zip(expected_df.columns, class_labels)))
```
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type='pandas')


@pytest.fixture(scope="function")
def dataset_cudf():
"""
Yields a DatasetManager instance with cudf as the default DataFrame type.

Users who wish to have both cudf and pandas DataFrames can do so with this fixture by using the `pandas` property:
```
def test_something(dataset_cudf: DatasetManager):
cdf = dataset_cudf["filter_probs.csv"]
pdf = dataset_cudf.pandas["filter_probs.csv"]
```
"""
from utils import dataset_manager
yield dataset_manager.DatasetManager(df_type='cudf')


@pytest.fixture(scope="function")
def filter_probs_df(dataset, use_cpp: bool):
"""
Shortcut fixture for loading the filter_probs.csv dataset.

Unless your test uses the `use_pandas` or `use_cudf` marks, this fixture will parameterize over the two DataFrame
types. Similarly, unless your test uses the `use_cpp` or `use_python` marks, it will also parameterize over the
execution mode, excluding the combination of C++ execution and pandas DataFrames.
"""
yield dataset["filter_probs.csv"]


def wait_for_camouflage(host="localhost", port=8000, timeout=5):
…
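Not shown in this diff is the `DatasetManager` class itself, which lives in `tests/utils/dataset_manager.py`. The commit messages above describe its key design choices: one cached instance per `df_type` (with the init logic moved into `__new__`), and loaded files keyed by absolute path to avoid accidental cache misses. A minimal sketch of that pattern, assuming bare filenames resolve against `TEST_DIRS.tests_data_dir` and that each lookup returns a deep copy (the real class differs in detail):
```
import os
import typing

from morpheus.io.deserializers import read_file_to_df
from utils import TEST_DIRS


class DatasetManager:
    """Sketch: a per-df_type singleton that caches DataFrames by absolute path."""

    # Class-level caches shared across instances, with type hints on the class vars
    __df_cache: typing.Dict[typing.Tuple[str, str], typing.Any] = {}
    __instances: typing.Dict[str, "DatasetManager"] = {}

    def __new__(cls, df_type: typing.Literal['cudf', 'pandas']):
        # Construction doubles as lookup: repeated calls with the same df_type
        # return the cached instance, which is why __init__ isn't used here.
        try:
            return cls.__instances[df_type]
        except KeyError:
            instance = super().__new__(cls)
            instance._default_df_type = df_type
            cls.__instances[df_type] = instance
            return instance

    @property
    def default_df_type(self) -> str:
        return self._default_df_type

    @property
    def pandas(self) -> "DatasetManager":
        return DatasetManager(df_type='pandas')

    @property
    def cudf(self) -> "DatasetManager":
        return DatasetManager(df_type='cudf')

    def __getitem__(self, file_path: str):
        # Assumption: bare filenames resolve against the shared test data dir
        if not os.path.isabs(file_path):
            file_path = os.path.join(TEST_DIRS.tests_data_dir, file_path)
        key = (os.path.abspath(file_path), self._default_df_type)
        if key not in self.__df_cache:
            self.__df_cache[key] = read_file_to_df(key[0], df_type=key[1])
        # Hand back a copy so one test's mutations can't leak into another
        return self.__df_cache[key].copy(deep=True)
```
The singleton is what makes the `pandas` and `cudf` properties cheap: they can simply "construct" a manager for the other DataFrame type, since construction always returns the cached instance.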
14 changes: 4 additions & 10 deletions tests/dfencoder/test_autoencoder.py
@@ -22,14 +22,14 @@
import torch

from morpheus.config import AEFeatureScalar
from morpheus.io.deserializers import read_file_to_df
from morpheus.models.dfencoder import ae_module
from morpheus.models.dfencoder import autoencoder
from morpheus.models.dfencoder import scalers
from morpheus.models.dfencoder.dataframe import EncoderDataFrame
from utils import TEST_DIRS
from utils.dataset_manager import DatasetManager

# Only pandas and C++ is supported
# Only pandas and Python are supported
pytestmark = [pytest.mark.use_pandas, pytest.mark.use_python]

BIN_COLS = ['ts_anomaly']
@@ -73,15 +73,9 @@ def train_ae():
progress_bar=False)


@pytest.fixture(scope="module")
def _train_df() -> pd.DataFrame:
input_file = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-role-g-validation-data-input.csv")
yield read_file_to_df(input_file, df_type='pandas')


@pytest.fixture(scope="function")
def train_df(_train_df) -> typing.Generator[pd.DataFrame, None, None]:
yield _train_df.copy(deep=True)
def train_df(dataset_pandas: DatasetManager) -> typing.Iterator[pd.DataFrame]:
yield dataset_pandas[os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-role-g-validation-data-input.csv")]


def compare_numeric_features(features, expected_features):
…
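The removed `_train_df`/`train_df` pair existed to load the CSV once per module while handing each test its own deep copy. The one-line replacement relies on the DatasetManager for the same isolation. A hypothetical test illustrating that assumption, i.e. that each lookup returns an independent copy:
```
from utils.dataset_manager import DatasetManager


def test_mutation_does_not_leak(dataset_pandas: DatasetManager):
    df_first = dataset_pandas["filter_probs.csv"]
    df_first[df_first.columns[0]] = None  # mutate our copy in place

    # A second lookup should be unaffected by the mutation above
    df_second = dataset_pandas["filter_probs.csv"]
    assert not df_second.equals(df_first)
```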
1 change: 1 addition & 0 deletions tests/dfencoder/test_dfencoder_distributed_e2e.py
@@ -105,6 +105,7 @@ def cleanup_dist():
torch.distributed.destroy_process_group()


@pytest.mark.slow
@pytest.mark.usefixtures("manual_seed")
def test_dfencoder_distributed_e2e():

…
1 change: 1 addition & 0 deletions tests/dfencoder/test_dfencoder_e2e.py
@@ -78,6 +78,7 @@
LOSS_TOLERANCE_RATIO = 1.25


@pytest.mark.slow
@pytest.mark.usefixtures("manual_seed")
def test_dfencoder_e2e():
# Load data
…
13 changes: 7 additions & 6 deletions tests/test_abp_kafka.py
@@ -23,11 +23,9 @@
import pandas
import pytest

from morpheus.common import FileTypes
from morpheus.config import Config
from morpheus.config import ConfigFIL
from morpheus.config import PipelineModes
from morpheus.io.deserializers import read_file_to_df
from morpheus.io.utils import filter_null_data
from morpheus.pipeline import LinearPipeline
from morpheus.stages.general.monitor_stage import MonitorStage
@@ -41,6 +39,7 @@
from morpheus.utils.compare_df import compare_df
from utils import TEST_DIRS
from utils import write_file_to_kafka
from utils.dataset_manager import DatasetManager

if (typing.TYPE_CHECKING):
from kafka import KafkaConsumer
@@ -54,7 +53,8 @@
@pytest.mark.slow
@pytest.mark.use_python
@mock.patch('tritonclient.grpc.InferenceServerClient')
def test_abp_no_cpp(mock_triton_client,
def test_abp_no_cpp(mock_triton_client: mock.MagicMock,
dataset_pandas: DatasetManager,
config: Config,
kafka_bootstrap_servers: str,
kafka_topics: typing.Tuple[str, str],
@@ -129,7 +129,7 @@ def async_infer(callback=None, **k):

pipe.run()

val_df = read_file_to_df(val_file_name, file_type=FileTypes.Auto, df_type='pandas')
val_df = dataset_pandas[val_file_name]

output_buf = StringIO()
for rec in kafka_consumer:
@@ -150,7 +150,8 @@ def async_infer(callback=None, **k):
@pytest.mark.slow
@pytest.mark.use_cpp
@pytest.mark.usefixtures("launch_mock_triton")
def test_abp_cpp(config,
def test_abp_cpp(config: Config,
dataset_pandas: DatasetManager,
kafka_bootstrap_servers: str,
kafka_topics: typing.Tuple[str, str],
kafka_consumer: "KafkaConsumer"):
@@ -195,7 +196,7 @@

pipe.run()

val_df = read_file_to_df(val_file_name, file_type=FileTypes.Auto, df_type='pandas')
val_df = dataset_pandas[val_file_name]
output_buf = StringIO()
for rec in kafka_consumer:
output_buf.write("{}\n".format(rec.value.decode("utf-8")))
…
17 changes: 9 additions & 8 deletions tests/test_add_classifications_stage.py
@@ -19,14 +19,15 @@

import cudf

from morpheus.config import Config
from morpheus.messages.memory.tensor_memory import TensorMemory
from morpheus.messages.message_meta import MessageMeta
from morpheus.messages.multi_response_message import MultiResponseMessage
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from utils import assert_df_equal
from utils.dataset_manager import DatasetManager


def test_constructor(config):
def test_constructor(config: Config):
config.class_labels = ['frogs', 'lizards', 'toads']

ac = AddClassificationsStage(config)
@@ -64,9 +65,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])

# Same thing but change the probs tensor name
message = MultiResponseMessage(meta=MessageMeta(df),
@@ -75,9 +76,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=threshold)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array_bool[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array_bool[:, 2])

# Fail in missing probs data
message = MultiResponseMessage(meta=MessageMeta(df),
…
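The free `assert_df_equal` helper from `tests/utils` is now reached through `DatasetManager`. Its body isn't part of this diff; the following is a rough sketch of what such a comparison helper typically does, offered purely as an assumption about its behavior, not the actual implementation:
```
import numpy as np


def assert_df_equal(df_to_check, val_to_check) -> bool:
    """Sketch (assumed behavior): compare a DataFrame/Series to an array-like."""
    # Move cudf objects back to host memory before comparing
    if hasattr(df_to_check, "to_pandas"):
        df_to_check = df_to_check.to_pandas()

    # cupy arrays (e.g. tensors from C++ mode) expose .get() for a host copy
    if type(val_to_check).__module__.startswith("cupy"):
        val_to_check = val_to_check.get()

    # The real helper may also normalize shapes and NaN handling
    return np.array_equal(df_to_check.to_numpy(), np.asarray(val_to_check))
```
Because the helper returns a bool rather than raising, the call sites wrap it in a plain `assert`, as in `assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array_bool[:, 0])`.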
2 changes: 1 addition & 1 deletion tests/test_add_classifications_stage_pipe.py
@@ -28,8 +28,8 @@
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from morpheus.stages.postprocess.serialize_stage import SerializeStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from stages.conv_msg import ConvMsg
from utils import assert_results
from utils.stages.conv_msg import ConvMsg


def build_expected(df: pd.DataFrame, threshold: float, class_labels: typing.List[str]):
…
17 changes: 9 additions & 8 deletions tests/test_add_scores_stage.py
@@ -19,15 +19,16 @@

import cudf

from morpheus.config import Config
from morpheus.messages.memory.tensor_memory import TensorMemory
from morpheus.messages.message_meta import MessageMeta
from morpheus.messages.multi_response_message import MultiResponseMessage
from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage
from morpheus.stages.postprocess.add_scores_stage import AddScoresStage
from utils import assert_df_equal
from utils.dataset_manager import DatasetManager


def test_constructor(config):
def test_constructor(config: Config):
config.class_labels = ['frogs', 'lizards', 'toads']
config.feature_length = 12

@@ -62,9 +63,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])

# Same thing but change the probs tensor name
message = MultiResponseMessage(meta=MessageMeta(df),
@@ -73,9 +74,9 @@ def test_add_labels():

labeled = AddClassificationsStage._add_labels(message, idx2label=class_labels, threshold=None)

assert assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])
assert DatasetManager.assert_df_equal(labeled.get_meta("frogs"), probs_array[:, 0])
assert DatasetManager.assert_df_equal(labeled.get_meta("lizards"), probs_array[:, 1])
assert DatasetManager.assert_df_equal(labeled.get_meta("toads"), probs_array[:, 2])

# Fail in missing probs data
message = MultiResponseMessage(meta=MessageMeta(df),
…