From ded13c35b999f13cc111ea49d7c5e94afc401100 Mon Sep 17 00:00:00 2001 From: Andrey Pavlenko Date: Wed, 9 Aug 2023 15:11:45 +0200 Subject: [PATCH] FEAT-#6398: Improved performance of list-like objects insertion into DataFrames Wrap a list-like object into a single-column query compiler before the insertion. Signed-off-by: Andrey Pavlenko --- .../storage_formats/pandas/query_compiler.py | 21 ++++++++++++ modin/pandas/dataframe.py | 2 +- modin/pandas/test/conftest.py | 33 +++++++++++++++++++ modin/pandas/test/dataframe/test_default.py | 17 ++++++++-- modin/pandas/test/test_groupby.py | 6 +++- 5 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 modin/pandas/test/conftest.py diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 9a5280844c4..52dc1372806 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2757,6 +2757,7 @@ def getitem_row_array(self, key): ) def setitem(self, axis, key, value): + value = self._wrap_column_data(value) return self._setitem(axis=axis, key=key, value=value, how=None) def _setitem(self, axis, key, value, how="inner"): @@ -2922,6 +2923,7 @@ def _compute_duplicated(df): # pragma: no cover # return a new one from here and let the front end handle the inplace # update. def insert(self, loc, column, value): + value = self._wrap_column_data(value) if isinstance(value, type(self)): value.columns = [column] return self.insert_item(axis=1, loc=loc, value=value, how=None) @@ -2954,6 +2956,25 @@ def insert(df, internal_indices=[]): # pragma: no cover ) return self.__constructor__(new_modin_frame) + def _wrap_column_data(self, data): + """ + If the data is list-like, create a single column query compiler. + + Parameters + ---------- + data : any + + Returns + ------- + data or PandasQueryCompiler + """ + if is_list_like(data): + return self.from_pandas( + pandas.DataFrame(pandas.Series(data, index=self.index)), + data_cls=type(self._modin_frame), + ) + return data + # END Insert def explode(self, column): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index e3a307773ec..9bc16281d70 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2511,7 +2511,7 @@ def setitem_unhashable_key(df, value): value = value.T.reshape(-1) if len(self) > 0: value = value[: len(self)] - if not isinstance(value, (Series, Categorical, np.ndarray)): + if not isinstance(value, (Series, Categorical, np.ndarray, list, range)): value = list(value) if not self._query_compiler.lazy_execution and len(self.index) == 0: diff --git a/modin/pandas/test/conftest.py b/modin/pandas/test/conftest.py new file mode 100644 index 00000000000..f23efdd23f0 --- /dev/null +++ b/modin/pandas/test/conftest.py @@ -0,0 +1,33 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest + +from modin.config import Engine, StorageFormat + + +def pytest_collection_modifyitems(items): + if ( + Engine.get() in ("Ray", "Unidist", "Dask", "Python") + and StorageFormat.get() != "Base" + ): + for item in items: + if item.name in ( + "test_dataframe_dt_index[3s-both-DateCol-0]", + "test_dataframe_dt_index[3s-right-DateCol-0]", + ): + item.add_marker( + pytest.mark.xfail( + reason="https://github.com/modin-project/modin/issues/6399" + ) + ) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index d7db58b5e32..c80d6e7a0d3 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -46,7 +46,7 @@ test_data_large_categorical_dataframe, default_to_pandas_ignore_string, ) -from modin.config import NPartitions, StorageFormat +from modin.config import NPartitions, StorageFormat, Engine from modin.test.test_utils import warns_that_defaulting_to_pandas NPartitions.put(4) @@ -850,7 +850,20 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): @pytest.mark.parametrize("rule", ["5T"]) @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("label", ["right", "left"]) -@pytest.mark.parametrize("on", [None, "DateColumn"]) +@pytest.mark.parametrize( + "on", + [ + None, + pytest.param( + "DateColumn", + marks=pytest.mark.xfail( + condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") + and StorageFormat.get() != "Base", + reason="https://github.com/modin-project/modin/issues/6399", + ), + ), + ], +) @pytest.mark.parametrize("level", [None, 1]) def test_resample_specific(rule, closed, label, on, level): data, index = ( diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index be3eb4866d8..8f3a04ea8b1 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2748,7 +2748,11 @@ def test_rolling_timedelta_window(center, closed, as_index, on): pd_df = md_df._to_pandas() if StorageFormat.get() == "Pandas": - assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2 + assert ( + md_df._query_compiler._modin_frame._partitions.shape[1] == 2 + if on is None + else 3 + ) md_window = md_df.groupby("by", as_index=as_index).rolling( datetime.timedelta(days=3), center=center, closed=closed, on=on