ludwig-ai · geoffreyangus · Dec 7, 2022 · Dec 7, 2022
@@ -165,6 +165,9 @@ def run_preprocessing(
     dataset_type="parquet",
     num_examples_per_split=20,
     nan_percent=0.0,
+    first_row_none=False,
+    last_row_none=False,
+    nan_cols=None,
 ):
     # Split the dataset manually to avoid randomness in splitting
     split_to_df = {}
@@ -182,7 +185,8 @@ def run_preprocessing(
         split_to_df[split] = dataset_df
     full_df_path = os.path.join(tmpdir, "dataset.csv")
     pd.concat(split_to_df.values()).to_csv(full_df_path, index=False)
-    dataset_path = create_data_set_to_use(dataset_type, full_df_path, nan_percent=nan_percent)
+    dataset = create_data_set_to_use(dataset_type, full_df_path, nan_percent=nan_percent)
+    dataset = augment_dataset_with_none(dataset, first_row_none, last_row_none, nan_cols)
 
     # Configure ray backend
     config = {
@@ -204,15 +208,15 @@ def run_preprocessing(
     ray_model = LudwigModel(config, backend=backend_config)
     *ray_datasets, ray_training_set_metadata = ray_model.preprocess(
         skip_save_processed_input=False,  # Save the processed input to test pyarrow write/read
-        dataset=dataset_path,
+        dataset=dataset,
     )
 
     # Run preprocessing with local backend using the ray_training_set_metadata to ensure parity of
     # token assignments, etc.
     local_model = LudwigModel(config, backend=LOCAL_BACKEND)
     *local_datasets, _ = local_model.preprocess(
         training_set_metadata=ray_training_set_metadata,
-        dataset=dataset_path,
+        dataset=dataset,
     )
 
     for ray_dataset, local_dataset in zip(ray_datasets, local_datasets):
@@ -280,7 +284,7 @@ def run_test_with_features(
     preprocessing=None,
     first_row_none=False,
     last_row_none=False,
-    nan_cols=[],
+    nan_cols=None,
 ):
     preprocessing = preprocessing or {}
     config = {
@@ -573,12 +577,12 @@ def test_ray_image_with_fill_strategy_edge_cases(tmpdir, settings, ray_cluster_2
     output_features = [
         binary_feature(),
     ]
-    run_test_with_features(
+    run_preprocessing(
+        tmpdir,
+        "dask",
         input_features,
         output_features,
-        df_engine="dask",
         dataset_type="pandas+numpy_images",
-        skip_save_processed_input=False,
         first_row_none=first_row_none,
         last_row_none=last_row_none,
         nan_cols=[input_features[0][NAME]],

@@ -25,7 +25,7 @@
 import unittest
 import uuid
 from distutils.util import strtobool
-from typing import List, Union
+from typing import List, Optional, Union
 
 import cloudpickle
 import numpy as np
@@ -824,7 +824,7 @@ def to_fwf(df, fname):
 
 
 def augment_dataset_with_none(
-    df: pd.DataFrame, first_row_none: bool = False, last_row_none: bool = False, nan_cols: List = []
+    df: pd.DataFrame, first_row_none: bool = False, last_row_none: bool = False, nan_cols: Optional[List] = None
 ) -> pd.DataFrame:
     """Optionally sets the first and last rows of nan_cols of the given dataframe to nan.
 
@@ -837,6 +837,8 @@ def augment_dataset_with_none(
     :param nan_cols: a list of columns in the dataframe to explicitly set the first or last rows to np.nan
     :type nan_cols: list
     """
+    nan_cols = nan_cols if nan_cols is not None else []
+
     if first_row_none:
         for col in nan_cols:
             df.iloc[0, df.columns.get_loc(col)] = np.nan