speed up ray image fill tests (#2828)
geoffreyangus authored Dec 7, 2022
1 parent 97f3e66, commit 92d9e9c
Showing 2 changed files with 15 additions and 9 deletions.
tests/integration_tests/test_ray.py (18 changes: 11 additions & 7 deletions)

@@ -165,6 +165,9 @@ def run_preprocessing(
     dataset_type="parquet",
     num_examples_per_split=20,
     nan_percent=0.0,
+    first_row_none=False,
+    last_row_none=False,
+    nan_cols=None,
 ):
     # Split the dataset manually to avoid randomness in splitting
     split_to_df = {}
@@ -182,7 +185,8 @@ def run_preprocessing(
         split_to_df[split] = dataset_df
     full_df_path = os.path.join(tmpdir, "dataset.csv")
     pd.concat(split_to_df.values()).to_csv(full_df_path, index=False)
-    dataset_path = create_data_set_to_use(dataset_type, full_df_path, nan_percent=nan_percent)
+    dataset = create_data_set_to_use(dataset_type, full_df_path, nan_percent=nan_percent)
+    dataset = augment_dataset_with_none(dataset, first_row_none, last_row_none, nan_cols)

     # Configure ray backend
     config = {
@@ -204,15 +208,15 @@ def run_preprocessing(
     ray_model = LudwigModel(config, backend=backend_config)
     *ray_datasets, ray_training_set_metadata = ray_model.preprocess(
         skip_save_processed_input=False,  # Save the processed input to test pyarrow write/read
-        dataset=dataset_path,
+        dataset=dataset,
     )

     # Run preprocessing with local backend using the ray_training_set_metadata to ensure parity of
     # token assignments, etc.
     local_model = LudwigModel(config, backend=LOCAL_BACKEND)
     *local_datasets, _ = local_model.preprocess(
         training_set_metadata=ray_training_set_metadata,
-        dataset=dataset_path,
+        dataset=dataset,
     )

     for ray_dataset, local_dataset in zip(ray_datasets, local_datasets):
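The `*ray_datasets, ray_training_set_metadata = ray_model.preprocess(...)` line uses Python's extended iterable unpacking: `preprocess` returns the processed dataset splits followed by the training-set metadata, and the star collects everything before the last element into a list. A minimal standalone illustration of the pattern, with placeholder return values rather than Ludwig's actual objects:

    def preprocess():
        # Stand-in for LudwigModel.preprocess: processed splits first,
        # training-set metadata last.
        return "train_split", "val_split", "test_split", {"vocab": ...}

    *datasets, metadata = preprocess()
    print(datasets)  # ['train_split', 'val_split', 'test_split']
    print(metadata)  # {'vocab': Ellipsis}

    # Zipping two such lists pairs up corresponding splits, mirroring the
    # ray-vs-local parity loop in the test above.
    for ray_ds, local_ds in zip(datasets, datasets):
        assert ray_ds == local_ds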
@@ -280,7 +284,7 @@ def run_test_with_features(
     preprocessing=None,
     first_row_none=False,
     last_row_none=False,
-    nan_cols=[],
+    nan_cols=None,
 ):
     preprocessing = preprocessing or {}
     config = {
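Replacing `nan_cols=[]` with `nan_cols=None` fixes Python's shared-mutable-default pitfall: a default list is evaluated once, at function definition time, so any call that mutates it leaks state into every later call. A minimal sketch of the bug and the `None`-sentinel idiom (function names here are illustrative, not from the commit):

    def buggy_append(item, items=[]):
        # One list object is shared by every call that omits `items`.
        items.append(item)
        return items

    def safe_append(item, items=None):
        # None is an immutable sentinel; build a fresh list per call.
        items = items if items is not None else []
        items.append(item)
        return items

    print(buggy_append(1))  # [1]
    print(buggy_append(2))  # [1, 2] <- state leaked from the first call
    print(safe_append(1))   # [1]
    print(safe_append(2))   # [2]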
@@ -573,12 +577,12 @@ def test_ray_image_with_fill_strategy_edge_cases(tmpdir, settings, ray_cluster_2
     output_features = [
         binary_feature(),
     ]
-    run_test_with_features(
+    run_preprocessing(
         tmpdir,
+        "dask",
         input_features,
         output_features,
-        df_engine="dask",
-        skip_save_processed_input=False,
+        dataset_type="pandas+numpy_images",
         first_row_none=first_row_none,
         last_row_none=last_row_none,
         nan_cols=[input_features[0][NAME]],
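Two substitutions account for the speed-up: `run_preprocessing` exercises only Ludwig's preprocessing path rather than the full train/predict cycle run by `run_test_with_features`, and the `pandas+numpy_images` dataset type presumably keeps the synthetic images as in-memory numpy arrays instead of encoding them to image files on disk. A rough sketch of that in-memory layout, with illustrative column names not taken from the commit:

    import numpy as np
    import pandas as pd

    # Image column holds raw numpy arrays directly, so preprocessing can
    # consume them without a PNG encode/decode or filesystem round trip.
    df = pd.DataFrame(
        {
            "image_col": [np.random.randint(0, 255, (8, 8, 3), dtype=np.uint8) for _ in range(4)],
            "binary_col": [True, False, True, False],
        }
    )
    print(df["image_col"].iloc[0].shape)  # (8, 8, 3)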
tests/integration_tests/utils.py (6 changes: 4 additions & 2 deletions)

@@ -25,7 +25,7 @@
 import unittest
 import uuid
 from distutils.util import strtobool
-from typing import List, Union
+from typing import List, Optional, Union

 import cloudpickle
 import numpy as np
@@ -824,7 +824,7 @@ def to_fwf(df, fname):


 def augment_dataset_with_none(
-    df: pd.DataFrame, first_row_none: bool = False, last_row_none: bool = False, nan_cols: List = []
+    df: pd.DataFrame, first_row_none: bool = False, last_row_none: bool = False, nan_cols: Optional[List] = None
 ) -> pd.DataFrame:
     """Optionally sets the first and last rows of nan_cols of the given dataframe to nan.
@@ -837,6 +837,8 @@
     :param nan_cols: a list of columns in the dataframe to explicitly set the first or last rows to np.nan
     :type nan_cols: list
     """
+    nan_cols = nan_cols if nan_cols is not None else []
+
     if first_row_none:
         for col in nan_cols:
             df.iloc[0, df.columns.get_loc(col)] = np.nan
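Taken together, the utils.py changes make the helper normalize its `nan_cols` argument before iterating. A condensed standalone rendering with a usage example; the `last_row_none` branch and the final `return` are inferred from the docstring and type annotation, since the diff is truncated below this point:

    from typing import List, Optional

    import numpy as np
    import pandas as pd

    def augment_dataset_with_none(
        df: pd.DataFrame, first_row_none: bool = False, last_row_none: bool = False, nan_cols: Optional[List] = None
    ) -> pd.DataFrame:
        # Normalize the None sentinel to an empty list before iterating.
        nan_cols = nan_cols if nan_cols is not None else []
        if first_row_none:
            for col in nan_cols:
                df.iloc[0, df.columns.get_loc(col)] = np.nan
        if last_row_none:  # inferred: mirrors the first_row_none branch
            for col in nan_cols:
                df.iloc[-1, df.columns.get_loc(col)] = np.nan
        return df

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "z"]})
    df = augment_dataset_with_none(df, first_row_none=True, last_row_none=True, nan_cols=["a"])
    print(df["a"].tolist())  # [nan, 2.0, nan]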