diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 8595e7ec0e67f..7f0c31201c84a 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -796,6 +796,8 @@ def __hash__(self): "pyspark.pandas.series", # unittests "pyspark.pandas.tests.indexes.test_base", + "pyspark.pandas.tests.indexes.test_conversion", + "pyspark.pandas.tests.indexes.test_drop", "pyspark.pandas.tests.indexes.test_asof", "pyspark.pandas.tests.indexes.test_astype", "pyspark.pandas.tests.indexes.test_delete", @@ -1086,6 +1088,8 @@ def __hash__(self): "pyspark.pandas.tests.connect.test_parity_typedef", "pyspark.pandas.tests.connect.test_parity_utils", "pyspark.pandas.tests.connect.indexes.test_parity_base", + "pyspark.pandas.tests.connect.indexes.test_parity_conversion", + "pyspark.pandas.tests.connect.indexes.test_parity_drop", "pyspark.pandas.tests.connect.indexes.test_parity_asof", "pyspark.pandas.tests.connect.indexes.test_parity_astype", "pyspark.pandas.tests.connect.indexes.test_parity_delete", diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_conversion.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_conversion.py new file mode 100644 index 0000000000000..ecb7aa2e7bc63 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_conversion.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_conversion import ConversionMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class ConversionParityTests( + ConversionMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_conversion import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_drop.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_drop.py new file mode 100644 index 0000000000000..abfe366447c17 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_drop.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_drop import IndexesDropMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class IndexesDropParityTests( + IndexesDropMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_drop import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index baa3f01c2ca55..788319a38c2d1 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -69,41 +69,6 @@ def test_index_basic(self): with self.assertRaisesRegex(TypeError, "Index.name must be a hashable type"): ps.Index([1.0, 2.0, 3.0], name=[(1, 2, 3)]) - def test_index_from_series(self): - pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30]) - psser = ps.from_pandas(pser) - - self.assert_eq(ps.Index(psser), pd.Index(pser)) - self.assert_eq(ps.Index(psser, dtype="float"), pd.Index(pser, dtype="float")) - self.assert_eq(ps.Index(psser, name="x"), pd.Index(pser, name="x")) - - self.assert_eq(ps.Index(psser, dtype="int64"), pd.Index(pser, dtype="int64")) - self.assert_eq(ps.Index(psser, dtype="float64"), 
pd.Index(pser, dtype="float64")) - - pser = pd.Series([datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20]) - psser = ps.from_pandas(pser) - - self.assert_eq(ps.Index(psser), pd.Index(pser)) - self.assert_eq(ps.DatetimeIndex(psser), pd.DatetimeIndex(pser)) - - def test_index_from_index(self): - pidx = pd.Index([1, 2, 3], name="a") - psidx = ps.from_pandas(pidx) - - self.assert_eq(ps.Index(psidx), pd.Index(pidx)) - self.assert_eq(ps.Index(psidx, dtype="float"), pd.Index(pidx, dtype="float")) - self.assert_eq(ps.Index(psidx, name="x"), pd.Index(pidx, name="x")) - self.assert_eq(ps.Index(psidx, copy=True), pd.Index(pidx, copy=True)) - - self.assert_eq(ps.Index(psidx, dtype="int64"), pd.Index(pidx, dtype="int64")) - self.assert_eq(ps.Index(psidx, dtype="float64"), pd.Index(pidx, dtype="float64")) - - pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(ps.Index(psidx), pd.Index(pidx)) - self.assert_eq(ps.DatetimeIndex(psidx), pd.DatetimeIndex(pidx)) - def test_index_getattr(self): psidx = self.psdf.index item = "databricks" @@ -126,110 +91,6 @@ def test_multi_index_getattr(self): with self.assertRaisesRegex(AttributeError, expected_error_message): psidx.__getattr__(item) - def test_to_series(self): - pidx = self.pdf.index - psidx = self.psdf.index - - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) - - # With name - pidx.name = "Koalas" - psidx.name = "Koalas" - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) - - # With tupled name - pidx.name = ("x", "a") - psidx.name = ("x", "a") - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) - - self.assert_eq((psidx + 1).to_series(), (pidx + 1).to_series()) - - pidx = self.pdf.set_index("b", append=True).index - psidx = 
self.psdf.set_index("b", append=True).index - - with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): - self.assert_eq(psidx.to_series(), pidx.to_series()) - self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) - - expected_error_message = "Series.name must be a hashable type" - with self.assertRaisesRegex(TypeError, expected_error_message): - psidx.to_series(name=["x", "a"]) - - def test_to_frame(self): - pidx = self.pdf.index - psidx = self.psdf.index - - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) - - pidx.name = "a" - psidx.name = "a" - - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) - - self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) - self.assert_eq(psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x")) - - self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"])) - - # non-string name - self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) - - pidx = self.pdf.set_index("b", append=True).index - psidx = self.psdf.set_index("b", append=True).index - - self.assert_eq(psidx.to_frame(), pidx.to_frame()) - self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) - - self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) - self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) - self.assert_eq( - psidx.to_frame(index=False, name=["x", "y"]), - pidx.to_frame(index=False, name=["x", "y"]), - ) - - self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) - self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) - - # non-string names - self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 
10))) - if LooseVersion(pd.__version__) < LooseVersion("1.5.0"): - self.assert_eq( - psidx.to_frame(name=[("x", 10), ("y", 20)]), - pidx.to_frame(name=[("x", 10), ("y", 20)]), - ) - else: - # Since pandas 1.5.0, the result is changed as below: - # (x, 10) (y, 20) - # b - # 0 4 0 4 - # 1 5 1 5 - # 3 6 3 6 - # 5 3 5 3 - # 6 2 6 2 - # 8 1 8 1 - # 9 0 9 0 - # 0 9 0 - # 0 9 0 - # - # The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`, - # but pandas API on Spark doesn't support such a way for creating Index. - # So, we currently cannot follow the behavior of pandas. - expected_result = ps.DataFrame( - {("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]}, - index=ps.MultiIndex.from_tuples( - [(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)], - names=[None, "b"], - ), - ) - self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result) - def test_index_names(self): psdf = self.psdf self.assertIsNone(psdf.index.name) @@ -400,47 +261,6 @@ def test_multi_index_copy(self): self.assert_eq(psdf.index.copy(), pdf.index.copy()) - def test_drop_duplicates(self): - pidx = pd.Index([4, 2, 4, 1, 4, 3]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(psidx.drop_duplicates(), pidx.drop_duplicates()) - self.assert_eq((psidx + 1).drop_duplicates(), (pidx + 1).drop_duplicates()) - - self.assert_eq(psidx.drop_duplicates(keep="first"), pidx.drop_duplicates(keep="first")) - self.assert_eq(psidx.drop_duplicates(keep="last"), pidx.drop_duplicates(keep="last")) - self.assert_eq(psidx.drop_duplicates(keep=False), pidx.drop_duplicates(keep=False)) - - arrays = [[1, 2, 3, 1, 2], ["red", "blue", "black", "red", "blue"]] - pmidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.drop_duplicates(), pmidx.drop_duplicates()) - self.assert_eq(psmidx.drop_duplicates(keep="first"), pmidx.drop_duplicates(keep="first")) - 
self.assert_eq(psmidx.drop_duplicates(keep="last"), pmidx.drop_duplicates(keep="last")) - self.assert_eq(psmidx.drop_duplicates(keep=False), pmidx.drop_duplicates(keep=False)) - - def test_dropna(self): - pidx = pd.Index([np.nan, 2, 4, 1, None, 3]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(psidx.dropna(), pidx.dropna()) - self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna()) - - self.assert_eq(psidx.dropna(how="any"), pidx.dropna(how="any")) - self.assert_eq(psidx.dropna(how="all"), pidx.dropna(how="all")) - - pmidx = pd.MultiIndex.from_tuples( - [(np.nan, 1.0), (2.0, 2.0), (np.nan, None), (3.0, np.nan)] - ) - psmidx = ps.from_pandas(pmidx) - self.assert_eq(psmidx.dropna(), pmidx.dropna()) - self.assert_eq(psmidx.dropna(how="any"), pmidx.dropna(how="any")) - self.assert_eq(psmidx.dropna(how="all"), pmidx.dropna(how="all")) - - invalid_how = "none" - with self.assertRaisesRegex(ValueError, "invalid how option: %s" % invalid_how): - psmidx.dropna(invalid_how) - def test_index_symmetric_difference(self): pidx1 = pd.Index([1, 2, 3, 4]) pidx2 = pd.Index([2, 3, 4, 5]) @@ -748,13 +568,6 @@ def test_multiindex_nlevel(self): self.assertEqual(psdf.index.nlevels, 2) - def test_multiindex_from_arrays(self): - arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]] - pidx = pd.MultiIndex.from_arrays(arrays) - psidx = ps.MultiIndex.from_arrays(arrays) - - self.assert_eq(pidx, psidx) - def test_multiindex_swaplevel(self): pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) psidx = ps.from_pandas(pidx) @@ -779,46 +592,6 @@ def test_multiindex_swaplevel(self): with self.assertRaisesRegex(KeyError, "Level work not found"): psidx.swaplevel(0, "work") - def test_multiindex_droplevel(self): - pidx = pd.MultiIndex.from_tuples( - [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"] - ) - psidx = ps.from_pandas(pidx) - with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 5"): - psidx.droplevel(4) - - with 
self.assertRaisesRegex(KeyError, "Level level4 not found"): - psidx.droplevel("level4") - - with self.assertRaisesRegex(KeyError, "Level.*level3.*level4.*not found"): - psidx.droplevel([("level3", "level4")]) - - with self.assertRaisesRegex( - ValueError, - "Cannot remove 4 levels from an index with 3 levels: at least one " - "level must be left.", - ): - psidx.droplevel([0, 0, 1, 2]) - - with self.assertRaisesRegex( - ValueError, - "Cannot remove 3 levels from an index with 3 levels: at least one " - "level must be left.", - ): - psidx.droplevel([0, 1, 2]) - - self.assert_eq(pidx.droplevel(0), psidx.droplevel(0)) - self.assert_eq(pidx.droplevel([0, 1]), psidx.droplevel([0, 1])) - self.assert_eq(pidx.droplevel((0, 1)), psidx.droplevel((0, 1))) - self.assert_eq(pidx.droplevel([0, "level2"]), psidx.droplevel([0, "level2"])) - self.assert_eq(pidx.droplevel((0, "level2")), psidx.droplevel((0, "level2"))) - - # non-string names - pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.droplevel(1.0), psidx.droplevel(1.0)) - self.assert_eq(pidx.droplevel([0, 2.0]), psidx.droplevel([0, 2.0])) - def test_index_fillna(self): pidx = pd.Index([1, 2, None]) psidx = ps.from_pandas(pidx) @@ -829,36 +602,6 @@ def test_index_fillna(self): with self.assertRaisesRegex(TypeError, "Unsupported type list"): psidx.fillna([1, 2]) - def test_index_drop(self): - pidx = pd.Index([1, 2, 3]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.drop(1), psidx.drop(1)) - self.assert_eq(pidx.drop([1, 2]), psidx.drop([1, 2])) - self.assert_eq((pidx + 1).drop([2, 3]), (psidx + 1).drop([2, 3])) - - def test_multiindex_drop(self): - pidx = pd.MultiIndex.from_tuples( - [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"] - ) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop("a"), psidx.drop("a")) - self.assert_eq(pidx.drop(["a", "b"]), psidx.drop(["a", "b"])) - self.assert_eq(pidx.drop(["x", 
"y"], level=1), psidx.drop(["x", "y"], level=1)) - self.assert_eq( - pidx.drop(["x", "y"], level="level2"), psidx.drop(["x", "y"], level="level2") - ) - - pidx.names = ["lv1", "lv2"] - psidx.names = ["lv1", "lv2"] - self.assert_eq(pidx.drop(["x", "y"], level="lv2"), psidx.drop(["x", "y"], level="lv2")) - - self.assertRaises(IndexError, lambda: psidx.drop(["a", "b"], level=2)) - self.assertRaises(KeyError, lambda: psidx.drop(["a", "b"], level="level")) - - psidx.names = ["lv", "lv"] - self.assertRaises(ValueError, lambda: psidx.drop(["x", "y"], level="lv")) - def _test_sort_values(self, pidx, psidx): self.assert_eq(pidx.sort_values(), psidx.sort_values()) # Parameter ascending @@ -900,15 +643,6 @@ def test_sort_values(self): self._test_sort_values(pidx, psidx) - def test_index_drop_duplicates(self): - pidx = pd.Index([1, 1, 2]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) - - pidx = pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.drop_duplicates().sort_values(), psidx.drop_duplicates().sort_values()) - def test_index_sort(self): idx = ps.Index([1, 2, 3, 4, 5]) midx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) @@ -997,20 +731,6 @@ def test_multiindex_set_names(self): psidx.set_names("third", level=2, inplace=True) self.assert_eq(pidx, psidx) - def test_multiindex_from_tuples(self): - tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] - pidx = pd.MultiIndex.from_tuples(tuples) - psidx = ps.MultiIndex.from_tuples(tuples) - - self.assert_eq(pidx, psidx) - - def test_multiindex_from_product(self): - iterables = [[0, 1, 2], ["green", "purple"]] - pidx = pd.MultiIndex.from_product(iterables) - psidx = ps.MultiIndex.from_product(iterables) - - self.assert_eq(pidx, psidx) - def test_multiindex_tuple_column_name(self): column_labels = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", 
"z")]) pdf = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=column_labels) @@ -1336,83 +1056,6 @@ def test_inferred_type(self): psmidx = ps.from_pandas(pmidx) self.assert_eq(pmidx.inferred_type, psmidx.inferred_type) - def test_multi_index_from_index(self): - tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] - pmidx = pd.Index(tuples) - psmidx = ps.Index(tuples) - - self.assertTrue(isinstance(psmidx, ps.MultiIndex)) - self.assert_eq(pmidx, psmidx) - - # Specify the `names` - # Specify the `names` while Index creating is no longer supported from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - pmidx = pd.Index(tuples) - pmidx.names = ["Hello", "Koalas"] - psmidx = ps.Index(tuples) - psmidx.names = ["Hello", "Koalas"] - else: - pmidx = pd.Index(tuples, names=["Hello", "Koalas"]) - psmidx = ps.Index(tuples, names=["Hello", "Koalas"]) - - self.assertTrue(isinstance(psmidx, ps.MultiIndex)) - self.assert_eq(pmidx, psmidx) - - def test_multiindex_from_frame(self): - pdf = pd.DataFrame( - [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"] - ) - psdf = ps.from_pandas(pdf) - pidx = pd.MultiIndex.from_frame(pdf) - psidx = ps.MultiIndex.from_frame(psdf) - - self.assert_eq(pidx, psidx) - - # Specify `names` - pidx = pd.MultiIndex.from_frame(pdf, names=["state", "observation"]) - psidx = ps.MultiIndex.from_frame(psdf, names=["state", "observation"]) - self.assert_eq(pidx, psidx) - - pidx = pd.MultiIndex.from_frame(pdf, names=("state", "observation")) - psidx = ps.MultiIndex.from_frame(psdf, names=("state", "observation")) - self.assert_eq(pidx, psidx) - - # MultiIndex columns - pidx = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")]) - pdf.columns = pidx - psdf = ps.from_pandas(pdf) - - pidx = pd.MultiIndex.from_frame(pdf) - psidx = ps.MultiIndex.from_frame(psdf) - - self.assert_eq(pidx, psidx) - - # tuples for names - pidx = pd.MultiIndex.from_frame(pdf, names=[("a", "w"), ("b", "x")]) - psidx 
= ps.MultiIndex.from_frame(psdf, names=[("a", "w"), ("b", "x")]) - - self.assert_eq(pidx, psidx) - - err_msg = "Input must be a DataFrame" - with self.assertRaisesRegex(TypeError, err_msg): - ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) - - self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(psdf, names="ab")) - - # non-string names - self.assert_eq( - ps.MultiIndex.from_frame(psdf, names=[0, 1]), - pd.MultiIndex.from_frame(pdf, names=[0, 1]), - ) - self.assert_eq( - ps.MultiIndex.from_frame(psdf, names=[("x", 0), ("y", 1)]), - pd.MultiIndex.from_frame(pdf, names=[("x", 0), ("y", 1)]), - ) - - pdf = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]]) - psdf = ps.from_pandas(pdf) - self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf)) - def test_index_is_unique(self): indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)] names = [None, "ks", "ks", None] @@ -1451,18 +1094,6 @@ def test_view(self): self.assert_eq(pmidx.view(), psmidx.view()) - def test_to_list(self): - # Index - pidx = pd.Index([1, 2, 3, 4, 5]) - psidx = ps.from_pandas(pidx) - # MultiIndex - tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] - pmidx = pd.MultiIndex.from_tuples(tuples) - psmidx = ps.from_pandas(pmidx) - - self.assert_eq(psidx.tolist(), pidx.tolist()) - self.assert_eq(psmidx.tolist(), pmidx.tolist()) - def test_index_ops(self): pidx = pd.Index([1, 2, 3, 4, 5]) psidx = ps.from_pandas(pidx) @@ -1534,22 +1165,6 @@ def test_multiindex_equal_levels(self): psmidx2 = ps.from_pandas(pmidx2) self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - def test_to_numpy(self): - pidx = pd.Index([1, 2, 3, 4]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.to_numpy(copy=True), psidx.to_numpy(copy=True)) - - def test_drop_level(self): - tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] - pmidx = pd.MultiIndex.from_tuples(tuples) - psmidx = ps.from_pandas(pmidx) - - 
with self.assertRaisesRegex( - IndexError, "Too many levels: Index has only 2 levels, -3 is not a valid level number" - ): - psmidx.droplevel(-3) - def test_multi_index_nunique(self): tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] pmidx = pd.MultiIndex.from_tuples(tuples) diff --git a/python/pyspark/pandas/tests/indexes/test_conversion.py b/python/pyspark/pandas/tests/indexes/test_conversion.py new file mode 100644 index 0000000000000..5790fb66ecf9b --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_conversion.py @@ -0,0 +1,313 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest +from datetime import datetime + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.loose_version import LooseVersion +from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED +from pyspark.testing.sqlutils import SQLTestUtils + + +class ConversionMixin: + @property + def pdf(self): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + def test_index_from_series(self): + pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30]) + psser = ps.from_pandas(pser) + + self.assert_eq(ps.Index(psser), pd.Index(pser)) + self.assert_eq(ps.Index(psser, dtype="float"), pd.Index(pser, dtype="float")) + self.assert_eq(ps.Index(psser, name="x"), pd.Index(pser, name="x")) + + self.assert_eq(ps.Index(psser, dtype="int64"), pd.Index(pser, dtype="int64")) + self.assert_eq(ps.Index(psser, dtype="float64"), pd.Index(pser, dtype="float64")) + + pser = pd.Series([datetime(2021, 3, 1), datetime(2021, 3, 2)], name="x", index=[10, 20]) + psser = ps.from_pandas(pser) + + self.assert_eq(ps.Index(psser), pd.Index(pser)) + self.assert_eq(ps.DatetimeIndex(psser), pd.DatetimeIndex(pser)) + + def test_index_from_index(self): + pidx = pd.Index([1, 2, 3], name="a") + psidx = ps.from_pandas(pidx) + + self.assert_eq(ps.Index(psidx), pd.Index(pidx)) + self.assert_eq(ps.Index(psidx, dtype="float"), pd.Index(pidx, dtype="float")) + self.assert_eq(ps.Index(psidx, name="x"), pd.Index(pidx, name="x")) + self.assert_eq(ps.Index(psidx, copy=True), pd.Index(pidx, copy=True)) + + self.assert_eq(ps.Index(psidx, dtype="int64"), pd.Index(pidx, dtype="int64")) + self.assert_eq(ps.Index(psidx, dtype="float64"), pd.Index(pidx, dtype="float64")) + + pidx = pd.DatetimeIndex(["2021-03-01", "2021-03-02"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(ps.Index(psidx), pd.Index(pidx)) + 
self.assert_eq(ps.DatetimeIndex(psidx), pd.DatetimeIndex(pidx)) + + def test_multiindex_from_arrays(self): + arrays = [["a", "a", "b", "b"], ["red", "blue", "red", "blue"]] + pidx = pd.MultiIndex.from_arrays(arrays) + psidx = ps.MultiIndex.from_arrays(arrays) + + self.assert_eq(pidx, psidx) + + def test_multiindex_from_tuples(self): + tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + pidx = pd.MultiIndex.from_tuples(tuples) + psidx = ps.MultiIndex.from_tuples(tuples) + + self.assert_eq(pidx, psidx) + + def test_multiindex_from_product(self): + iterables = [[0, 1, 2], ["green", "purple"]] + pidx = pd.MultiIndex.from_product(iterables) + psidx = ps.MultiIndex.from_product(iterables) + + self.assert_eq(pidx, psidx) + + def test_multi_index_from_index(self): + tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + pmidx = pd.Index(tuples) + psmidx = ps.Index(tuples) + + self.assertTrue(isinstance(psmidx, ps.MultiIndex)) + self.assert_eq(pmidx, psmidx) + + # Specify the `names` + # Specify the `names` while Index creating is no longer supported from pandas 2.0.0. 
+ if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): + pmidx = pd.Index(tuples) + pmidx.names = ["Hello", "Koalas"] + psmidx = ps.Index(tuples) + psmidx.names = ["Hello", "Koalas"] + else: + pmidx = pd.Index(tuples, names=["Hello", "Koalas"]) + psmidx = ps.Index(tuples, names=["Hello", "Koalas"]) + + self.assertTrue(isinstance(psmidx, ps.MultiIndex)) + self.assert_eq(pmidx, psmidx) + + def test_multiindex_from_frame(self): + pdf = pd.DataFrame( + [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"] + ) + psdf = ps.from_pandas(pdf) + pidx = pd.MultiIndex.from_frame(pdf) + psidx = ps.MultiIndex.from_frame(psdf) + + self.assert_eq(pidx, psidx) + + # Specify `names` + pidx = pd.MultiIndex.from_frame(pdf, names=["state", "observation"]) + psidx = ps.MultiIndex.from_frame(psdf, names=["state", "observation"]) + self.assert_eq(pidx, psidx) + + pidx = pd.MultiIndex.from_frame(pdf, names=("state", "observation")) + psidx = ps.MultiIndex.from_frame(psdf, names=("state", "observation")) + self.assert_eq(pidx, psidx) + + # MultiIndex columns + pidx = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x")]) + pdf.columns = pidx + psdf = ps.from_pandas(pdf) + + pidx = pd.MultiIndex.from_frame(pdf) + psidx = ps.MultiIndex.from_frame(psdf) + + self.assert_eq(pidx, psidx) + + # tuples for names + pidx = pd.MultiIndex.from_frame(pdf, names=[("a", "w"), ("b", "x")]) + psidx = ps.MultiIndex.from_frame(psdf, names=[("a", "w"), ("b", "x")]) + + self.assert_eq(pidx, psidx) + + err_msg = "Input must be a DataFrame" + with self.assertRaisesRegex(TypeError, err_msg): + ps.MultiIndex.from_frame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + self.assertRaises(TypeError, lambda: ps.MultiIndex.from_frame(psdf, names="ab")) + + # non-string names + self.assert_eq( + ps.MultiIndex.from_frame(psdf, names=[0, 1]), + pd.MultiIndex.from_frame(pdf, names=[0, 1]), + ) + self.assert_eq( + ps.MultiIndex.from_frame(psdf, names=[("x", 0), ("y", 1)]), + 
pd.MultiIndex.from_frame(pdf, names=[("x", 0), ("y", 1)]), + ) + + pdf = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]]) + psdf = ps.from_pandas(pdf) + self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf)) + + def test_to_series(self): + pidx = self.pdf.index + psidx = self.psdf.index + + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + + # With name + pidx.name = "Koalas" + psidx.name = "Koalas" + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name=("x", "a")), pidx.to_series(name=("x", "a"))) + + # With tupled name + pidx.name = ("x", "a") + psidx.name = ("x", "a") + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + + self.assert_eq((psidx + 1).to_series(), (pidx + 1).to_series()) + + pidx = self.pdf.set_index("b", append=True).index + psidx = self.psdf.set_index("b", append=True).index + + with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): + self.assert_eq(psidx.to_series(), pidx.to_series()) + self.assert_eq(psidx.to_series(name="a"), pidx.to_series(name="a")) + + expected_error_message = "Series.name must be a hashable type" + with self.assertRaisesRegex(TypeError, expected_error_message): + psidx.to_series(name=["x", "a"]) + + def test_to_frame(self): + pidx = self.pdf.index + psidx = self.psdf.index + + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + + pidx.name = "a" + psidx.name = "a" + + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + + self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) + self.assert_eq(psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x")) + + self.assertRaises(TypeError, lambda: 
psidx.to_frame(name=["x"])) + + # non-string name + self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + + pidx = self.pdf.set_index("b", append=True).index + psidx = self.psdf.set_index("b", append=True).index + + self.assert_eq(psidx.to_frame(), pidx.to_frame()) + self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) + + self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) + self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) + self.assert_eq( + psidx.to_frame(index=False, name=["x", "y"]), + pidx.to_frame(index=False, name=["x", "y"]), + ) + + self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) + self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) + + # non-string names + self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + if LooseVersion(pd.__version__) < LooseVersion("1.5.0"): + self.assert_eq( + psidx.to_frame(name=[("x", 10), ("y", 20)]), + pidx.to_frame(name=[("x", 10), ("y", 20)]), + ) + else: + # Since pandas 1.5.0, the result is changed as below: + # (x, 10) (y, 20) + # b + # 0 4 0 4 + # 1 5 1 5 + # 3 6 3 6 + # 5 3 5 3 + # 6 2 6 2 + # 8 1 8 1 + # 9 0 9 0 + # 0 9 0 + # 0 9 0 + # + # The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`, + # but pandas API on Spark doesn't support such a way for creating Index. + # So, we currently cannot follow the behavior of pandas. 
+ expected_result = ps.DataFrame( + {("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=ps.MultiIndex.from_tuples( + [(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)], + names=[None, "b"], + ), + ) + self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result) + + def test_to_list(self): + # Index + pidx = pd.Index([1, 2, 3, 4, 5]) + psidx = ps.from_pandas(pidx) + # MultiIndex + tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")] + pmidx = pd.MultiIndex.from_tuples(tuples) + psmidx = ps.from_pandas(pmidx) + + self.assert_eq(psidx.tolist(), pidx.tolist()) + self.assert_eq(psmidx.tolist(), pmidx.tolist()) + + def test_to_numpy(self): + pidx = pd.Index([1, 2, 3, 4]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.to_numpy(copy=True), psidx.to_numpy(copy=True)) + + +class ConversionTests( + ConversionMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_conversion import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_drop.py b/python/pyspark/pandas/tests/indexes/test_drop.py new file mode 100644 index 0000000000000..7f5a3b480e1ce --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_drop.py @@ -0,0 +1,176 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
class IndexesDropMixin:
    """Test cases for ``drop``, ``droplevel``, ``dropna`` and ``drop_duplicates`` on indexes.

    Every case builds the same index with pandas and with pandas-on-Spark and
    compares the results through ``self.assert_eq``; the ``assert*`` helpers are
    expected to come from the test-case classes this mixin is combined with.
    """

    def test_drop_duplicates(self):
        keep_options = ("first", "last", False)

        pandas_index = pd.Index([4, 2, 4, 1, 4, 3])
        spark_index = ps.from_pandas(pandas_index)

        self.assert_eq(spark_index.drop_duplicates(), pandas_index.drop_duplicates())
        self.assert_eq(
            (spark_index + 1).drop_duplicates(), (pandas_index + 1).drop_duplicates()
        )
        for keep in keep_options:
            self.assert_eq(
                spark_index.drop_duplicates(keep=keep),
                pandas_index.drop_duplicates(keep=keep),
            )

        # Same checks on a two-level MultiIndex.
        numbers = [1, 2, 3, 1, 2]
        colors = ["red", "blue", "black", "red", "blue"]
        pandas_midx = pd.MultiIndex.from_arrays([numbers, colors], names=("number", "color"))
        spark_midx = ps.from_pandas(pandas_midx)

        self.assert_eq(spark_midx.drop_duplicates(), pandas_midx.drop_duplicates())
        for keep in keep_options:
            self.assert_eq(
                spark_midx.drop_duplicates(keep=keep),
                pandas_midx.drop_duplicates(keep=keep),
            )

    def test_dropna(self):
        how_options = ("any", "all")

        pandas_index = pd.Index([np.nan, 2, 4, 1, None, 3])
        spark_index = ps.from_pandas(pandas_index)

        self.assert_eq(spark_index.dropna(), pandas_index.dropna())
        self.assert_eq((spark_index + 1).dropna(), (pandas_index + 1).dropna())
        for how in how_options:
            self.assert_eq(spark_index.dropna(how=how), pandas_index.dropna(how=how))

        # MultiIndex whose entries miss one or both components.
        entries = [(np.nan, 1.0), (2.0, 2.0), (np.nan, None), (3.0, np.nan)]
        pandas_midx = pd.MultiIndex.from_tuples(entries)
        spark_midx = ps.from_pandas(pandas_midx)

        self.assert_eq(spark_midx.dropna(), pandas_midx.dropna())
        for how in how_options:
            self.assert_eq(spark_midx.dropna(how=how), pandas_midx.dropna(how=how))

        # An unknown ``how`` value, passed positionally, must be rejected.
        invalid_how = "none"
        with self.assertRaisesRegex(ValueError, "invalid how option: %s" % invalid_how):
            spark_midx.dropna(invalid_how)

    def test_drop_level(self):
        entries = [(1, "red"), (1, "blue"), (2, "red"), (2, "green")]
        spark_midx = ps.from_pandas(pd.MultiIndex.from_tuples(entries))

        # A negative level beyond the number of levels is an error.
        expected_message = (
            "Too many levels: Index has only 2 levels, -3 is not a valid level number"
        )
        with self.assertRaisesRegex(IndexError, expected_message):
            spark_midx.droplevel(-3)

    def test_multiindex_droplevel(self):
        pandas_midx = pd.MultiIndex.from_tuples(
            [("a", "x", 1), ("b", "y", 2)], names=["level1", "level2", "level3"]
        )
        spark_midx = ps.from_pandas(pandas_midx)

        # (expected exception, message pattern, ``level`` argument) for each invalid call.
        failing_calls = [
            (IndexError, "Too many levels: Index has only 3 levels, not 5", 4),
            (KeyError, "Level level4 not found", "level4"),
            (KeyError, "Level.*level3.*level4.*not found", [("level3", "level4")]),
            (
                ValueError,
                "Cannot remove 4 levels from an index with 3 levels: at least one "
                "level must be left.",
                [0, 0, 1, 2],
            ),
            (
                ValueError,
                "Cannot remove 3 levels from an index with 3 levels: at least one "
                "level must be left.",
                [0, 1, 2],
            ),
        ]
        for error_type, message_pattern, level in failing_calls:
            with self.assertRaisesRegex(error_type, message_pattern):
                spark_midx.droplevel(level)

        # Valid calls: a single position, and lists/tuples mixing positions and names.
        for level in (0, [0, 1], (0, 1), [0, "level2"], (0, "level2")):
            self.assert_eq(pandas_midx.droplevel(level), spark_midx.droplevel(level))

        # non-string names
        pandas_midx = pd.MultiIndex.from_tuples(
            [("a", "x", 1), ("b", "y", 2)], names=[1.0, 2.0, 3.0]
        )
        spark_midx = ps.from_pandas(pandas_midx)
        for level in (1.0, [0, 2.0]):
            self.assert_eq(pandas_midx.droplevel(level), spark_midx.droplevel(level))

    def test_index_drop(self):
        pandas_index = pd.Index([1, 2, 3])
        spark_index = ps.from_pandas(pandas_index)

        # Drop a scalar label, then a list of labels.
        for labels in (1, [1, 2]):
            self.assert_eq(pandas_index.drop(labels), spark_index.drop(labels))

        # Dropping also has to work on an index derived through arithmetic.
        shifted_pandas = pandas_index + 1
        shifted_spark = spark_index + 1
        self.assert_eq(shifted_pandas.drop([2, 3]), shifted_spark.drop([2, 3]))

    def test_multiindex_drop(self):
        pandas_midx = pd.MultiIndex.from_tuples(
            [("a", "x"), ("b", "y"), ("c", "z")], names=["level1", "level2"]
        )
        spark_midx = ps.from_pandas(pandas_midx)

        # No ``level`` argument: a scalar label, then a list of labels.
        for labels in ("a", ["a", "b"]):
            self.assert_eq(pandas_midx.drop(labels), spark_midx.drop(labels))

        # Explicit level, addressed by position and by name.
        for level in (1, "level2"):
            self.assert_eq(
                pandas_midx.drop(["x", "y"], level=level),
                spark_midx.drop(["x", "y"], level=level),
            )

        # After renaming the levels, the new name must be usable as ``level``.
        renamed = ["lv1", "lv2"]
        pandas_midx.names = renamed
        spark_midx.names = renamed
        self.assert_eq(
            pandas_midx.drop(["x", "y"], level="lv2"),
            spark_midx.drop(["x", "y"], level="lv2"),
        )

        # Out-of-range position and unknown level name.
        with self.assertRaises(IndexError):
            spark_midx.drop(["a", "b"], level=2)
        with self.assertRaises(KeyError):
            spark_midx.drop(["a", "b"], level="level")

        # A level name shared by two levels.
        spark_midx.names = ["lv", "lv"]
        with self.assertRaises(ValueError):
            spark_midx.drop(["x", "y"], level="lv")

    def test_index_drop_duplicates(self):
        pandas_indexes = (
            pd.Index([1, 1, 2]),
            pd.MultiIndex.from_tuples([(1, 1), (1, 1), (2, 2)], names=["level1", "level2"]),
        )
        for pandas_index in pandas_indexes:
            spark_index = ps.from_pandas(pandas_index)
            # Both sides are sorted before the comparison.
            expected = pandas_index.drop_duplicates().sort_values()
            actual = spark_index.drop_duplicates().sort_values()
            self.assert_eq(expected, actual)


class IndexesDropTests(IndexesDropMixin, ComparisonTestBase, TestUtils):
    """Concrete test case combining ``IndexesDropMixin`` with its test-case bases."""


if __name__ == "__main__":
    # Star-import this module so unittest discovers the cases under its package name.
    from pyspark.pandas.tests.indexes.test_drop import *  # noqa: F401

    try:
        # xmlrunner is optional; when present, XML reports go to target/test-reports.
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # Passing None lets unittest.main pick its own runner.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)