Skip to content

Commit

Permalink
Add support for pandas 2 (#4216)
Browse files Browse the repository at this point in the history
* Squashed changes

* Ignored index

* Disabled column checking

* Reverted deleted code

* Updated pyproject.toml

* Replaced version check code
  • Loading branch information
christopherbunn authored Jul 27, 2023
1 parent b398501 commit 5b80a8e
Show file tree
Hide file tree
Showing 28 changed files with 147 additions and 58 deletions.
2 changes: 1 addition & 1 deletion .github/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ outputs:
- setuptools ==58.0.4
run:
- numpy >=1.21.0
- pandas >=1.5.0, <2.0.0
- pandas >=1.5.0
- dask >=2022.2.0, !=2022.10.1
- scipy >=1.5.0
- scikit-learn >=1.3.0
Expand Down
2 changes: 1 addition & 1 deletion core-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy>=1.21.0
pandas>=1.5.0, <2.0.0
pandas>=1.5.0
scipy>=1.5.0
scikit-learn>=1.3.0
scikit-optimize>=0.9.0
Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Release Notes
* Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233`
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Changes
* Unpinned sktime version :pr:`4214`
* Bumped minimum lightgbm version to 4.0.0 for nullable type handling :pr:`4237`
Expand Down
4 changes: 2 additions & 2 deletions docs/source/user_guide/timeseries.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -996,8 +996,8 @@
" ),\n",
" # Plot prediction intervals\n",
" go.Scatter(\n",
" x=X_forecast_dates[\"Date\"].append(X_forecast_dates[\"Date\"][::-1]),\n",
" y=y_upper.append(y_lower[::-1]),\n",
" x=pd.concat([X_forecast_dates[\"Date\"], X_forecast_dates[\"Date\"][::-1]]),\n",
" y=pd.concat([y_upper, y_lower[::-1]]),\n",
" fill=\"toself\",\n",
" fillcolor=\"rgba(255,0,0,0.2)\",\n",
" line=dict(color=\"rgba(255,0,0,0.2)\"),\n",
Expand Down
4 changes: 2 additions & 2 deletions evalml/model_understanding/visualizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,8 @@ def get_linear_coefficients(estimator, features=None):
coef_.name = "Coefficients"
coef_.index = features
coef_ = coef_.sort_values()
coef_ = pd.Series(estimator._component_obj.intercept_, index=["Intercept"]).append(
coef_,
coef_ = pd.concat(
[pd.Series(estimator._component_obj.intercept_, index=["Intercept"]), coef_],
)

return coef_
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def fit(self, X, y=None):
random_state=self._initial_state,
)
value_counts = value_counts.sort_values(
[col],
value_counts.iloc[:, 0].name,
ascending=False,
kind="mergesort",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ def determine_periodicity(
period is detected, returns None.
"""
X, y = cls._handle_nullable_types(cls, X, y)
# Only need to handle nullable types on pandas < 2. Kept for backwards compatibility with pandas 1.x.
if int(pd.__version__.split(".")[0]) < 2:
X, y = cls._handle_nullable_types(cls, X, y)

def _get_rel_max_from_acf(y):
"""Determines the relative maxima of the target's autocorrelation."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
index=truncated_y_t.index,
),
)
y = y_in_sample.append(y_out_of_sample)
y = pd.concat([y_in_sample, y_out_of_sample])
y.index = original_index
return y

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def transform(
index=truncated_y.index,
),
)
y_t = y_in_sample.append(y_out_of_sample)
y_t = pd.concat([y_in_sample, y_out_of_sample])
y_t.index = original_index
return X, y_t

Expand Down Expand Up @@ -317,7 +317,7 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
index=truncated_y_t.index,
),
)
y = y_in_sample.append(y_out_of_sample)
y = pd.concat([y_in_sample, y_out_of_sample])
y.index = original_index
return y

Expand Down
6 changes: 2 additions & 4 deletions evalml/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,19 +184,17 @@ def target_distribution(targets):
Examples:
>>> y = pd.Series([1, 2, 4, 1, 3, 3, 1, 2])
>>> target_distribution(y)
>>> print(target_distribution(y).to_string())
Targets
1 37.50%
2 25.00%
3 25.00%
4 12.50%
dtype: object
>>> y = pd.Series([True, False, False, False, True])
>>> target_distribution(y)
>>> print(target_distribution(y).to_string())
Targets
False 60.00%
True 40.00%
dtype: object
"""
distribution = targets.value_counts() / len(targets)
return distribution.mul(100).apply("{:.2f}%".format).rename_axis("Targets")
18 changes: 15 additions & 3 deletions evalml/tests/component_tests/decomposer_tests/test_decomposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,8 @@ def test_decomposer_build_seasonal_signal(
X, _, y = ts_data()

# Change the date time index to start at the same time but have different frequency
y.set_axis(
y = y.set_axis(
pd.date_range(start="2021-01-01", periods=len(y), freq=frequency),
inplace=True,
)

decomposer = decomposer_child_class(degree=2)
Expand Down Expand Up @@ -497,7 +496,12 @@ def test_decomposer_determine_periodicity(
True,
pytest.param(
False,
marks=pytest.mark.xfail(strict=True, raises=AssertionError),
marks=pytest.mark.xfail(
condition=int(pd.__version__.split(".")[0]) < 2,
strict=True,
raises=AssertionError,
reason="pandas 1.x does not recognize np.Nan in Float64 subtracted_floats.",
),
),
],
)
Expand Down Expand Up @@ -749,12 +753,20 @@ def test_decomposer_inverse_transform(
output_inverse_y = decomposer.inverse_transform(y_t_new)
else:
output_inverse_y = decomposer.inverse_transform(y_t_new)
# Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
# we need to test index equivalence separately.
pd.testing.assert_series_equal(
y[y_t_new.index],
output_inverse_y,
check_exact=False,
check_index=False,
rtol=1.0e-1,
)
pd.testing.assert_index_equal(
y[y_t_new.index].index,
output_inverse_y.index,
exact=False,
)


@pytest.mark.parametrize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,20 @@ def test_stl_decomposer_inverse_transform(
):
output_inverse_y = decomposer.inverse_transform(y_t_new)
else:
# Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 on Windows,
# we need to test index equivalence separately.
output_inverse_y = decomposer.inverse_transform(y_t_new)
pd.testing.assert_series_equal(
y[y_t_new.index],
output_inverse_y,
check_exact=False,
check_index=False,
rtol=1.0e-2,
)
pd.testing.assert_index_equal(
y[y_t_new.index].index,
output_inverse_y.index,
exact=False,
)


@pytest.mark.parametrize(
Expand Down
11 changes: 7 additions & 4 deletions evalml/tests/component_tests/test_datetime_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ def test_datetime_featurizer_encodes_as_ints():
# Test that changing encode_as_categories to True only changes the dtypes but not the values
dt_with_cats = DateTimeFeaturizer(encode_as_categories=True)
X_transformed_df = dt_with_cats.fit_transform(X)
expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0])
expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5])
expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0]).astype("category")
expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5]).astype("category")

assert_frame_equal(expected, X_transformed_df)
assert_frame_equal(expected, X_transformed_df, check_categorical=False)
assert dt_with_cats.get_feature_names() == feature_names

# Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined
Expand Down Expand Up @@ -250,7 +250,10 @@ def test_datetime_featurizer_no_datetime_cols():

def test_datetime_featurizer_numpy_array_input():
datetime_transformer = DateTimeFeaturizer()
X = np.array([["2007-02-03"], ["2016-06-07"], ["2020-05-19"]], dtype="datetime64")
X = np.array(
[["2007-02-03"], ["2016-06-07"], ["2020-05-19"]],
dtype="datetime64[ns]",
)
datetime_transformer.fit(X)
assert list(datetime_transformer.transform(X).columns) == [
"0_year",
Expand Down
32 changes: 27 additions & 5 deletions evalml/tests/component_tests/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,21 @@ def test_drop_all_columns(imputer_test_data):
imputer.fit(X, y)
transformed = imputer.transform(X, y)
expected = X.drop(["all nan cat", "all nan"], axis=1)
assert_frame_equal(transformed, expected, check_dtype=False)
assert_frame_equal(
transformed,
expected,
check_column_type=False,
check_index_type=False,
)

imputer = Imputer()
transformed = imputer.fit_transform(X, y)
assert_frame_equal(transformed, expected, check_dtype=False)
assert_frame_equal(
transformed,
expected,
check_column_type=False,
check_index_type=False,
)


def test_typed_imputer_numpy_input():
Expand Down Expand Up @@ -271,11 +281,21 @@ def test_imputer_empty_data(data_type, make_data_type):
imputer = Imputer()
imputer.fit(X, y)
transformed = imputer.transform(X, y)
assert_frame_equal(transformed, expected, check_dtype=False)
assert_frame_equal(
transformed,
expected,
check_column_type=False,
check_index_type=False,
)

imputer = Imputer()
transformed = imputer.fit_transform(X, y)
assert_frame_equal(transformed, expected, check_dtype=False)
assert_frame_equal(
transformed,
expected,
check_column_type=False,
check_index_type=False,
)


def test_imputer_does_not_reset_index():
Expand Down Expand Up @@ -508,7 +528,9 @@ def test_imputer_with_none_separated(
for col in set(columns_dict["categoricals_only"]).intersection(
set(X_test.columns),
):
expected_df[col].cat.add_categories(categorical_fill_value, inplace=True)
expected_df[col] = expected_df[col].cat.add_categories(
categorical_fill_value,
)
expected_df[col].iloc[-1:] = categorical_fill_value
if boolean_impute_strategy == "constant":
for col in set(columns_dict["booleans_only"]).intersection(set(X_test.columns)):
Expand Down
18 changes: 16 additions & 2 deletions evalml/tests/component_tests/test_lgbm_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,25 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary):

clf.predict(X)
arg_X = mock_predict.call_args[0][0]
assert_frame_equal(X_expected, arg_X)
# Index type checking ignored so the test can pass on Windows
# X_expected is int32, arg_X is int64
assert_frame_equal(
X_expected,
arg_X,
check_index_type=False,
check_column_type=False,
)

clf.predict_proba(X)
arg_X = mock_predict_proba.call_args[0][0]
assert_frame_equal(X_expected, arg_X)
# Index type checking ignored so the test can pass on Windows
# X_expected is int32, arg_X is int64
assert_frame_equal(
X_expected,
arg_X,
check_index_type=False,
check_column_type=False,
)


@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict_proba")
Expand Down
9 changes: 8 additions & 1 deletion evalml/tests/component_tests/test_lgbm_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,14 @@ def test_correct_args(mock_predict, X_y_regression):

clf.predict(X)
arg_X = mock_predict.call_args[0][0]
assert_frame_equal(X_expected, arg_X)
# Index type checking ignored so the test can pass on Windows
# X_expected is int32, arg_X is int64
assert_frame_equal(
X_expected,
arg_X,
check_index_type=False,
check_column_type=False,
)


@patch("evalml.pipelines.components.estimators.estimator.Estimator.predict")
Expand Down
6 changes: 3 additions & 3 deletions evalml/tests/component_tests/test_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def test_more_top_n_unique_values():
col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
col_1_counts = col_1_counts.sort_values(
["col_1"],
col_1_counts.iloc[:, 0].name,
ascending=False,
kind="mergesort",
)
Expand All @@ -429,7 +429,7 @@ def test_more_top_n_unique_values():
col_2_counts = X["col_2"].value_counts(dropna=False).to_frame()
col_2_counts = col_2_counts.sample(frac=1, random_state=random_seed)
col_2_counts = col_2_counts.sort_values(
["col_2"],
col_2_counts.iloc[:, 0].name,
ascending=False,
kind="mergesort",
)
Expand Down Expand Up @@ -466,7 +466,7 @@ def test_more_top_n_unique_values_large():
col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
col_1_counts = col_1_counts.sort_values(
["col_1"],
col_1_counts.iloc[:, 0].name,
ascending=False,
kind="mergesort",
)
Expand Down
1 change: 1 addition & 0 deletions evalml/tests/component_tests/test_oversampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def test_oversample_imbalanced_binary(data_type, oversampler_type, make_data_typ
value_counts,
pd.Series([850, 850]),
check_dtype=False,
check_names=False,
)

oversampler = Oversampler(sampling_ratio=1)
Expand Down
4 changes: 2 additions & 2 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,14 +557,14 @@ def test_simple_imputer_ignores_natural_language(

if df_composition == "full_df":
if numeric_impute_strategy == "mean" and has_nan == "has_nan":
ans = X_df.mean()
ans = X_df.mean(numeric_only=True)
ans["natural language col"] = pd.NA
X_df = X_df.astype(
{"int col": float},
)
X_df.iloc[-1, :] = ans
elif numeric_impute_strategy == "median" and has_nan == "has_nan":
ans = X_df.median()
ans = X_df.median(numeric_only=True)
ans["natural language col"] = pd.NA
X_df = X_df.astype(
{"int col": float},
Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def test_cols():
),
},
)
assert_frame_equal(X_expected, X_t, check_less_precise=True)
assert_frame_equal(X_expected, X_t, rtol=1e-3)

encoder = TargetEncoder(cols=["col_3"])
encoder.fit(X, y)
Expand Down
Loading

0 comments on commit 5b80a8e

Please sign in to comment.