diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 900329d07c00a..d204fcf829534 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -815,11 +815,18 @@ def __hash__(self): "pyspark.pandas.tests.groupby.test_cumulative", "pyspark.pandas.tests.groupby.test_describe", "pyspark.pandas.tests.groupby.test_groupby", + "pyspark.pandas.tests.groupby.test_grouping", "pyspark.pandas.tests.groupby.test_head_tail", "pyspark.pandas.tests.groupby.test_index", + "pyspark.pandas.tests.groupby.test_missing", "pyspark.pandas.tests.groupby.test_missing_data", + "pyspark.pandas.tests.groupby.test_nlargest_nsmallest", + "pyspark.pandas.tests.groupby.test_raises", + "pyspark.pandas.tests.groupby.test_rank", + "pyspark.pandas.tests.groupby.test_size", "pyspark.pandas.tests.groupby.test_split_apply", "pyspark.pandas.tests.groupby.test_stat", + "pyspark.pandas.tests.groupby.test_value_counts", "pyspark.pandas.tests.test_indexing", "pyspark.pandas.tests.test_ops_on_diff_frames", "pyspark.pandas.tests.test_ops_on_diff_frames_groupby", @@ -1113,6 +1120,13 @@ def __hash__(self): "pyspark.pandas.tests.connect.groupby.test_parity_describe", "pyspark.pandas.tests.connect.groupby.test_parity_head_tail", "pyspark.pandas.tests.connect.groupby.test_parity_groupby", + "pyspark.pandas.tests.connect.groupby.test_parity_grouping", + "pyspark.pandas.tests.connect.groupby.test_parity_missing", + "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest", + "pyspark.pandas.tests.connect.groupby.test_parity_raises", + "pyspark.pandas.tests.connect.groupby.test_parity_rank", + "pyspark.pandas.tests.connect.groupby.test_parity_size", + "pyspark.pandas.tests.connect.groupby.test_parity_value_counts", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_grouping.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_grouping.py new file mode 100644 index 0000000000000..8b3f9927c0f2b --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_grouping.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_grouping import GroupingTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class GroupingParityTests(GroupingTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_grouping import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_missing.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_missing.py new file mode 100644 index 0000000000000..f6776d9bac608 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_missing.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_missing import MissingTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class MissingParityTests(MissingTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_missing import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_nlargest_nsmallest.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_nlargest_nsmallest.py new file mode 100644 index 0000000000000..71c388a1d2981 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_nlargest_nsmallest.py @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_nlargest_nsmallest import NlargestNsmallestTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class NlargestNsmallestParityTests( + NlargestNsmallestTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_raises.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_raises.py new file mode 100644 index 0000000000000..db122a81ebdd1 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_raises.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_raises import RaisesTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class RaisesParityTests(RaisesTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_raises import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_rank.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_rank.py new file mode 100644 index 0000000000000..2ad5cf07cfcaa --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_rank.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_rank import RankTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class RankParityTests(RankTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_rank import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_size.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_size.py new file mode 100644 index 0000000000000..2904f0cded276 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_size.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_size import SizeTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class SizeParityTests(SizeTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_size import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_value_counts.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_value_counts.py new file mode 100644 index 0000000000000..a9c84822006df --- /dev/null +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_value_counts.py @@ -0,0 +1,38 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.pandas.tests.groupby.test_value_counts import ValueCountsTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class ValueCountsParityTests(ValueCountsTestsMixin, PandasOnSparkTestCase, ReusedConnectTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.groupby.test_parity_value_counts import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py index 4ef2d1acd811d..6ad74cdf81777 100644 --- a/python/pyspark/pandas/tests/groupby/test_groupby.py +++ b/python/pyspark/pandas/tests/groupby/test_groupby.py @@ -186,84 +186,6 @@ def sort(df): sort(pdf.groupby(10, as_index=as_index)[[20, 30]].sum()), ) - def test_nsmallest(self): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - }, - index=np.random.rand(9 * 3), - ) - psdf = ps.from_pandas(pdf) - - self.assert_eq( - psdf.groupby(["a"])["b"].nsmallest(1).sort_values(), - pdf.groupby(["a"])["b"].nsmallest(1).sort_values(), - ) - self.assert_eq( - psdf.groupby(["a"])["b"].nsmallest(2).sort_index(), - pdf.groupby(["a"])["b"].nsmallest(2).sort_index(), - ) - self.assert_eq( - (psdf.b * 10).groupby(psdf.a).nsmallest(2).sort_index(), - (pdf.b * 10).groupby(pdf.a).nsmallest(2).sort_index(), - ) - self.assert_eq( - psdf.b.rename().groupby(psdf.a).nsmallest(2).sort_index(), - pdf.b.rename().groupby(pdf.a).nsmallest(2).sort_index(), - ) - self.assert_eq( - psdf.b.groupby(psdf.a.rename()).nsmallest(2).sort_index(), - pdf.b.groupby(pdf.a.rename()).nsmallest(2).sort_index(), - ) - self.assert_eq( - psdf.b.rename().groupby(psdf.a.rename()).nsmallest(2).sort_index(), - pdf.b.rename().groupby(pdf.a.rename()).nsmallest(2).sort_index(), - ) - with self.assertRaisesRegex(ValueError, "nsmallest do not support multi-index now"): - psdf.set_index(["a", "b"]).groupby(["c"])["d"].nsmallest(1) - - def test_nlargest(self): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, - }, - index=np.random.rand(9 * 3), - ) - psdf = ps.from_pandas(pdf) - - self.assert_eq( - psdf.groupby(["a"])["b"].nlargest(1).sort_values(), - pdf.groupby(["a"])["b"].nlargest(1).sort_values(), - ) - self.assert_eq( - psdf.groupby(["a"])["b"].nlargest(2).sort_index(), - pdf.groupby(["a"])["b"].nlargest(2).sort_index(), - ) - self.assert_eq( - (psdf.b * 10).groupby(psdf.a).nlargest(2).sort_index(), - (pdf.b * 10).groupby(pdf.a).nlargest(2).sort_index(), - ) - self.assert_eq( - psdf.b.rename().groupby(psdf.a).nlargest(2).sort_index(), - pdf.b.rename().groupby(pdf.a).nlargest(2).sort_index(), - ) - self.assert_eq( - psdf.b.groupby(psdf.a.rename()).nlargest(2).sort_index(), - pdf.b.groupby(pdf.a.rename()).nlargest(2).sort_index(), - ) - self.assert_eq( - psdf.b.rename().groupby(psdf.a.rename()).nlargest(2).sort_index(), - pdf.b.rename().groupby(pdf.a.rename()).nlargest(2).sort_index(), - ) - with self.assertRaisesRegex(ValueError, "nlargest do not support multi-index now"): - psdf.set_index(["a", "b"]).groupby(["c"])["d"].nlargest(1) - def test_shift(self): pdf = pd.DataFrame( { @@ -331,239 +253,11 @@ def test_shift(self): # pdf.groupby([('x', 'a'), ('x', 'b')]).shift(periods=-1, # fill_value=0).sort_index()) - def test_missing(self): - psdf = ps.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - - # DataFrameGroupBy functions - missing_functions = inspect.getmembers( - MissingPandasLikeDataFrameGroupBy, inspect.isfunction - ) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.groupby("a"), name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) - ): - getattr(psdf.groupby("a"), name)() - - # SeriesGroupBy functions - missing_functions = inspect.getmembers(MissingPandasLikeSeriesGroupBy, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.a.groupby(psdf.a), name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) - ): - getattr(psdf.a.groupby(psdf.a), name)() - - # DataFrameGroupBy properties - missing_properties = inspect.getmembers( - MissingPandasLikeDataFrameGroupBy, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.groupby("a"), name) - deprecated_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "deprecated_property" - ] - for name in deprecated_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) - ): - getattr(psdf.groupby("a"), name) - - # SeriesGroupBy properties - missing_properties = inspect.getmembers( - MissingPandasLikeSeriesGroupBy, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.a.groupby(psdf.a), name) - deprecated_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "deprecated_property" - ] - for name in deprecated_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) - ): - getattr(psdf.a.groupby(psdf.a), name) - @staticmethod def test_is_multi_agg_with_relabel(): assert is_multi_agg_with_relabel(a="max") is False assert is_multi_agg_with_relabel(a_min=("a", "max"), a_max=("a", "min")) is True - def test_get_group(self): - pdf = pd.DataFrame( - [ - ("falcon", "bird", 389.0), - ("parrot", "bird", 24.0), - ("lion", "mammal", 80.5), - ("monkey", "mammal", np.nan), - ], - columns=["name", "class", "max_speed"], - index=[0, 2, 3, 1], - ) - pdf.columns.name = "Koalas" - psdf = ps.from_pandas(pdf) - - self.assert_eq( - psdf.groupby("class").get_group("bird"), - pdf.groupby("class").get_group("bird"), - ) - self.assert_eq( - psdf.groupby("class")["name"].get_group("mammal"), - pdf.groupby("class")["name"].get_group("mammal"), - ) - self.assert_eq( - psdf.groupby("class")[["name"]].get_group("mammal"), - pdf.groupby("class")[["name"]].get_group("mammal"), - ) - self.assert_eq( - psdf.groupby(["class", "name"]).get_group(("mammal", "lion")), - pdf.groupby(["class", "name"]).get_group(("mammal", "lion")), - ) - self.assert_eq( - psdf.groupby(["class", "name"])["max_speed"].get_group(("mammal", "lion")), - pdf.groupby(["class", "name"])["max_speed"].get_group(("mammal", "lion")), - ) - self.assert_eq( - psdf.groupby(["class", "name"])[["max_speed"]].get_group(("mammal", "lion")), - pdf.groupby(["class", "name"])[["max_speed"]].get_group(("mammal", "lion")), - ) - self.assert_eq( - (psdf.max_speed + 1).groupby(psdf["class"]).get_group("mammal"), - (pdf.max_speed + 1).groupby(pdf["class"]).get_group("mammal"), - ) - self.assert_eq( - psdf.groupby("max_speed").get_group(80.5), - pdf.groupby("max_speed").get_group(80.5), - ) - - self.assertRaises(KeyError, lambda: psdf.groupby("class").get_group("fish")) - self.assertRaises(TypeError, lambda: psdf.groupby("class").get_group(["bird", "mammal"])) - self.assertRaises(KeyError, lambda: psdf.groupby("class")["name"].get_group("fish")) - self.assertRaises( - TypeError, lambda: psdf.groupby("class")["name"].get_group(["bird", "mammal"]) - ) - self.assertRaises( - KeyError, lambda: psdf.groupby(["class", "name"]).get_group(("lion", "mammal")) - ) - self.assertRaises(ValueError, lambda: psdf.groupby(["class", "name"]).get_group(("lion",))) - self.assertRaises( - ValueError, lambda: psdf.groupby(["class", "name"]).get_group(("mammal",)) - ) - self.assertRaises(ValueError, lambda: psdf.groupby(["class", "name"]).get_group("mammal")) - - # MultiIndex columns - pdf.columns = pd.MultiIndex.from_tuples([("A", "name"), ("B", "class"), ("C", "max_speed")]) - pdf.columns.names = ["Hello", "Koalas"] - psdf = ps.from_pandas(pdf) - self.assert_eq( - psdf.groupby(("B", "class")).get_group("bird"), - pdf.groupby(("B", "class")).get_group("bird"), - ) - self.assert_eq( - psdf.groupby(("B", "class"))[[("A", "name")]].get_group("mammal"), - pdf.groupby(("B", "class"))[[("A", "name")]].get_group("mammal"), - ) - self.assert_eq( - psdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal", "lion")), - pdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal", "lion")), - ) - self.assert_eq( - psdf.groupby([("B", "class"), ("A", "name")])[[("C", "max_speed")]].get_group( - ("mammal", "lion") - ), - pdf.groupby([("B", "class"), ("A", "name")])[[("C", "max_speed")]].get_group( - ("mammal", "lion") - ), - ) - self.assert_eq( - (psdf[("C", "max_speed")] + 1).groupby(psdf[("B", "class")]).get_group("mammal"), - (pdf[("C", "max_speed")] + 1).groupby(pdf[("B", "class")]).get_group("mammal"), - ) - self.assert_eq( - psdf.groupby(("C", "max_speed")).get_group(80.5), - pdf.groupby(("C", "max_speed")).get_group(80.5), - ) - - self.assertRaises(KeyError, lambda: psdf.groupby(("B", "class")).get_group("fish")) - self.assertRaises( - TypeError, lambda: psdf.groupby(("B", "class")).get_group(["bird", "mammal"]) - ) - self.assertRaises( - KeyError, lambda: psdf.groupby(("B", "class"))[("A", "name")].get_group("fish") - ) - self.assertRaises( - KeyError, - lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion", "mammal")), - ) - self.assertRaises( - ValueError, - lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion",)), - ) - self.assertRaises( - ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal",)) - ) - self.assertRaises( - ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group("mammal") - ) - - def test_getitem(self): - psdf = ps.DataFrame( - { - "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, - "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, - "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3, - }, - index=np.random.rand(10 * 3), - ) - - self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy)) - def test_all_any(self): pdf = pd.DataFrame( { @@ -647,19 +341,6 @@ def sort(df): pdf.groupby("A").all(skipna=True).sort_index(), ) - def test_raises(self): - psdf = ps.DataFrame( - {"a": [1, 2, 6, 4, 4, 6, 4, 3, 7], "b": [4, 2, 7, 3, 3, 1, 1, 1, 2]}, - index=[0, 1, 3, 5, 6, 8, 9, 9, 9], - ) - # test raises with incorrect key - self.assertRaises(ValueError, lambda: psdf.groupby([])) - self.assertRaises(KeyError, lambda: psdf.groupby("x")) - self.assertRaises(KeyError, lambda: psdf.groupby(["a", "x"])) - self.assertRaises(KeyError, lambda: psdf.groupby("a")["x"]) - self.assertRaises(KeyError, lambda: psdf.groupby("a")["b", "x"]) - self.assertRaises(KeyError, lambda: psdf.groupby("a")[["b", "x"]]) - def test_nunique(self): pdf = pd.DataFrame( {"a": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0], "b": [2, 2, 2, 3, 3, 4, 4, 5, 5, 5]} @@ -727,99 +408,6 @@ def test_unique(self): for act, exp in zip(actual, expect): self.assertTrue(sorted(act) == sorted(exp)) - def test_value_counts(self): - pdf = pd.DataFrame( - {"A": [np.nan, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, np.nan]}, columns=["A", "B"] - ) - psdf = ps.from_pandas(pdf) - self.assert_eq( - psdf.groupby("A")["B"].value_counts().sort_index(), - pdf.groupby("A")["B"].value_counts().sort_index(), - ) - self.assert_eq( - psdf.groupby("A")["B"].value_counts(dropna=False).sort_index(), - pdf.groupby("A")["B"].value_counts(dropna=False).sort_index(), - almost=True, - ) - self.assert_eq( - psdf.groupby("A", dropna=False)["B"].value_counts(dropna=False).sort_index(), - pdf.groupby("A", dropna=False)["B"].value_counts(dropna=False).sort_index(), - # Returns are the same considering values and types, - # disable check_exact to pass the assert_eq - check_exact=False, - ) - self.assert_eq( - psdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index(), - pdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index(), - ) - self.assert_eq( - psdf.groupby("A")["B"] - .value_counts(sort=True, ascending=False, dropna=False) - .sort_index(), - pdf.groupby("A")["B"] - .value_counts(sort=True, ascending=False, dropna=False) - .sort_index(), - almost=True, - ) - self.assert_eq( - psdf.groupby("A")["B"] - .value_counts(sort=True, ascending=True, dropna=False) - .sort_index(), - pdf.groupby("A")["B"] - .value_counts(sort=True, ascending=True, dropna=False) - .sort_index(), - almost=True, - ) - self.assert_eq( - psdf.B.rename().groupby(psdf.A).value_counts().sort_index(), - pdf.B.rename().groupby(pdf.A).value_counts().sort_index(), - ) - self.assert_eq( - psdf.B.rename().groupby(psdf.A, dropna=False).value_counts().sort_index(), - pdf.B.rename().groupby(pdf.A, dropna=False).value_counts().sort_index(), - # Returns are the same considering values and types, - # disable check_exact to pass the assert_eq - check_exact=False, - ) - self.assert_eq( - psdf.B.groupby(psdf.A.rename()).value_counts().sort_index(), - pdf.B.groupby(pdf.A.rename()).value_counts().sort_index(), - ) - self.assert_eq( - psdf.B.rename().groupby(psdf.A.rename()).value_counts().sort_index(), - pdf.B.rename().groupby(pdf.A.rename()).value_counts().sort_index(), - ) - - def test_size(self): - pdf = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, 3]}) - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.groupby("A").size().sort_index(), pdf.groupby("A").size().sort_index()) - self.assert_eq( - psdf.groupby("A")["B"].size().sort_index(), pdf.groupby("A")["B"].size().sort_index() - ) - self.assert_eq( - psdf.groupby("A")[["B"]].size().sort_index(), - pdf.groupby("A")[["B"]].size().sort_index(), - ) - self.assert_eq( - psdf.groupby(["A", "B"]).size().sort_index(), - pdf.groupby(["A", "B"]).size().sort_index(), - ) - - # multi-index columns - columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")]) - pdf.columns = columns - psdf.columns = columns - - self.assert_eq( - psdf.groupby(("X", "A")).size().sort_index(), - pdf.groupby(("X", "A")).size().sort_index(), - ) - self.assert_eq( - psdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), - pdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), - ) - def test_diff(self): pdf = pd.DataFrame( { @@ -869,56 +457,6 @@ def test_diff(self): pdf.groupby([("x", "a"), ("x", "b")]).diff().sort_index(), ) - def test_rank(self): - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6] * 3, - "b": [1, 1, 2, 3, 5, 8] * 3, - "c": [1, 4, 9, 16, 25, 36] * 3, - }, - index=np.random.rand(6 * 3), - ) - psdf = ps.from_pandas(pdf) - - self.assert_eq(psdf.groupby("b").rank().sort_index(), pdf.groupby("b").rank().sort_index()) - self.assert_eq( - psdf.groupby(["a", "b"]).rank().sort_index(), - pdf.groupby(["a", "b"]).rank().sort_index(), - ) - self.assert_eq( - psdf.groupby(["b"])["a"].rank().sort_index(), - pdf.groupby(["b"])["a"].rank().sort_index(), - ) - self.assert_eq( - psdf.groupby(["b"])[["a", "c"]].rank().sort_index(), - pdf.groupby(["b"])[["a", "c"]].rank().sort_index(), - ) - self.assert_eq( - psdf.groupby(psdf.b // 5).rank().sort_index(), - pdf.groupby(pdf.b // 5).rank().sort_index(), - ) - self.assert_eq( - psdf.groupby(psdf.b // 5)["a"].rank().sort_index(), - pdf.groupby(pdf.b // 5)["a"].rank().sort_index(), - ) - - self.assert_eq(psdf.groupby("b").rank().sum(), pdf.groupby("b").rank().sum()) - self.assert_eq(psdf.groupby(["b"])["a"].rank().sum(), pdf.groupby(["b"])["a"].rank().sum()) - - # multi-index columns - columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - pdf.columns = columns - psdf.columns = columns - - self.assert_eq( - psdf.groupby(("x", "b")).rank().sort_index(), - pdf.groupby(("x", "b")).rank().sort_index(), - ) - self.assert_eq( - psdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), - pdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), - ) - class GroupByTests(GroupByTestsMixin, PandasOnSparkTestCase, TestUtils): pass diff --git a/python/pyspark/pandas/tests/groupby/test_grouping.py b/python/pyspark/pandas/tests/groupby/test_grouping.py new file mode 100644 index 0000000000000..13aa86ad9b4e2 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_grouping.py @@ -0,0 +1,171 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import numpy as np +import pyspark.pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.pandas.groupby import SeriesGroupBy + + +class GroupingTestsMixin: + def test_get_group(self): + pdf = pd.DataFrame( + [ + ("falcon", "bird", 389.0), + ("parrot", "bird", 24.0), + ("lion", "mammal", 80.5), + ("monkey", "mammal", np.nan), + ], + columns=["name", "class", "max_speed"], + index=[0, 2, 3, 1], + ) + pdf.columns.name = "Koalas" + psdf = ps.from_pandas(pdf) + + self.assert_eq( + psdf.groupby("class").get_group("bird"), + pdf.groupby("class").get_group("bird"), + ) + self.assert_eq( + psdf.groupby("class")["name"].get_group("mammal"), + pdf.groupby("class")["name"].get_group("mammal"), + ) + self.assert_eq( + psdf.groupby("class")[["name"]].get_group("mammal"), + pdf.groupby("class")[["name"]].get_group("mammal"), + ) + self.assert_eq( + psdf.groupby(["class", "name"]).get_group(("mammal", "lion")), + pdf.groupby(["class", "name"]).get_group(("mammal", "lion")), + ) + self.assert_eq( + psdf.groupby(["class", "name"])["max_speed"].get_group(("mammal", "lion")), + pdf.groupby(["class", "name"])["max_speed"].get_group(("mammal", "lion")), + ) + self.assert_eq( + psdf.groupby(["class", "name"])[["max_speed"]].get_group(("mammal", "lion")), + pdf.groupby(["class", "name"])[["max_speed"]].get_group(("mammal", "lion")), + ) + self.assert_eq( + (psdf.max_speed + 1).groupby(psdf["class"]).get_group("mammal"), + (pdf.max_speed + 1).groupby(pdf["class"]).get_group("mammal"), + ) + self.assert_eq( + psdf.groupby("max_speed").get_group(80.5), + pdf.groupby("max_speed").get_group(80.5), + ) + + self.assertRaises(KeyError, lambda: psdf.groupby("class").get_group("fish")) + self.assertRaises(TypeError, lambda: psdf.groupby("class").get_group(["bird", "mammal"])) + self.assertRaises(KeyError, lambda: psdf.groupby("class")["name"].get_group("fish")) + self.assertRaises( + TypeError, lambda: psdf.groupby("class")["name"].get_group(["bird", "mammal"]) + ) + self.assertRaises( + KeyError, lambda: psdf.groupby(["class", "name"]).get_group(("lion", "mammal")) + ) + self.assertRaises(ValueError, lambda: psdf.groupby(["class", "name"]).get_group(("lion",))) + self.assertRaises( + ValueError, lambda: psdf.groupby(["class", "name"]).get_group(("mammal",)) + ) + self.assertRaises(ValueError, lambda: psdf.groupby(["class", "name"]).get_group("mammal")) + + # MultiIndex columns + pdf.columns = pd.MultiIndex.from_tuples([("A", "name"), ("B", "class"), ("C", "max_speed")]) + pdf.columns.names = ["Hello", "Koalas"] + psdf = ps.from_pandas(pdf) + self.assert_eq( + psdf.groupby(("B", "class")).get_group("bird"), + pdf.groupby(("B", "class")).get_group("bird"), + ) + self.assert_eq( + psdf.groupby(("B", "class"))[[("A", "name")]].get_group("mammal"), + pdf.groupby(("B", "class"))[[("A", "name")]].get_group("mammal"), + ) + self.assert_eq( + psdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal", "lion")), + pdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal", "lion")), + ) + self.assert_eq( + psdf.groupby([("B", "class"), ("A", "name")])[[("C", "max_speed")]].get_group( + ("mammal", "lion") + ), + pdf.groupby([("B", "class"), ("A", "name")])[[("C", "max_speed")]].get_group( + ("mammal", "lion") + ), + ) + self.assert_eq( + (psdf[("C", "max_speed")] + 1).groupby(psdf[("B", "class")]).get_group("mammal"), + (pdf[("C", "max_speed")] + 1).groupby(pdf[("B", "class")]).get_group("mammal"), + ) + self.assert_eq( + psdf.groupby(("C", "max_speed")).get_group(80.5), + pdf.groupby(("C", "max_speed")).get_group(80.5), + ) + + self.assertRaises(KeyError, lambda: psdf.groupby(("B", "class")).get_group("fish")) + self.assertRaises( + TypeError, lambda: psdf.groupby(("B", "class")).get_group(["bird", "mammal"]) + ) + self.assertRaises( + KeyError, lambda: psdf.groupby(("B", "class"))[("A", "name")].get_group("fish") + ) + self.assertRaises( + KeyError, + lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion", "mammal")), + ) + self.assertRaises( + ValueError, + lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("lion",)), + ) + self.assertRaises( + ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group(("mammal",)) + ) + self.assertRaises( + ValueError, lambda: psdf.groupby([("B", "class"), ("A", "name")]).get_group("mammal") + ) + + def test_getitem(self): + psdf = ps.DataFrame( + { + "a": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [2, 3, 1, 4, 6, 9, 8, 10, 7, 5] * 3, + "c": [3, 5, 2, 5, 1, 2, 6, 4, 3, 6] * 3, + }, + index=np.random.rand(10 * 3), + ) + + self.assertTrue(isinstance(psdf.groupby("a")["b"], SeriesGroupBy)) + + +class GroupingTests(GroupingTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_grouping import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_missing.py b/python/pyspark/pandas/tests/groupby/test_missing.py new file mode 100644 index 0000000000000..d3c70aa1a9852 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_missing.py @@ -0,0 +1,144 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest +import inspect + +import pyspark.pandas as ps +from pyspark.pandas.exceptions import PandasNotImplementedError +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.pandas.missing.groupby import ( + MissingPandasLikeDataFrameGroupBy, + MissingPandasLikeSeriesGroupBy, +) + + +class MissingTestsMixin: + def test_missing(self): + psdf = ps.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + + # DataFrameGroupBy functions + missing_functions = inspect.getmembers( + MissingPandasLikeDataFrameGroupBy, inspect.isfunction + ) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.groupby("a"), name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(psdf.groupby("a"), name)() + + # SeriesGroupBy functions + missing_functions = inspect.getmembers(MissingPandasLikeSeriesGroupBy, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.a.groupby(psdf.a), name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(psdf.a.groupby(psdf.a), name)() + + # DataFrameGroupBy properties + missing_properties = inspect.getmembers( + MissingPandasLikeDataFrameGroupBy, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.groupby("a"), name) + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] + for name in deprecated_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(psdf.groupby("a"), name) + + # SeriesGroupBy properties + missing_properties = inspect.getmembers( + MissingPandasLikeSeriesGroupBy, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.a.groupby(psdf.a), name) + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] + for name in deprecated_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*GroupBy.*{}.*is deprecated".format(name) + ): + getattr(psdf.a.groupby(psdf.a), name) + + +class MissingTests(MissingTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_missing import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_nlargest_nsmallest.py b/python/pyspark/pandas/tests/groupby/test_nlargest_nsmallest.py new file mode 100644 index 0000000000000..e74f880bce8d9 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_nlargest_nsmallest.py @@ -0,0 +1,119 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import numpy as np +import pyspark.pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class NlargestNsmallestTestsMixin: + def test_nlargest(self): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + }, + index=np.random.rand(9 * 3), + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq( + psdf.groupby(["a"])["b"].nlargest(1).sort_values(), + pdf.groupby(["a"])["b"].nlargest(1).sort_values(), + ) + self.assert_eq( + psdf.groupby(["a"])["b"].nlargest(2).sort_index(), + pdf.groupby(["a"])["b"].nlargest(2).sort_index(), + ) + self.assert_eq( + (psdf.b * 10).groupby(psdf.a).nlargest(2).sort_index(), + (pdf.b * 10).groupby(pdf.a).nlargest(2).sort_index(), + ) + self.assert_eq( + psdf.b.rename().groupby(psdf.a).nlargest(2).sort_index(), + pdf.b.rename().groupby(pdf.a).nlargest(2).sort_index(), + ) + self.assert_eq( + psdf.b.groupby(psdf.a.rename()).nlargest(2).sort_index(), + pdf.b.groupby(pdf.a.rename()).nlargest(2).sort_index(), + ) + self.assert_eq( + psdf.b.rename().groupby(psdf.a.rename()).nlargest(2).sort_index(), + pdf.b.rename().groupby(pdf.a.rename()).nlargest(2).sort_index(), + ) + with self.assertRaisesRegex(ValueError, "nlargest do not support multi-index now"): + psdf.set_index(["a", "b"]).groupby(["c"])["d"].nlargest(1) + + def test_nsmallest(self): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3] * 3, + "b": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "c": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + "d": [1, 2, 2, 2, 3, 3, 3, 4, 4] * 3, + }, + index=np.random.rand(9 * 3), + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq( + psdf.groupby(["a"])["b"].nsmallest(1).sort_values(), + pdf.groupby(["a"])["b"].nsmallest(1).sort_values(), + ) + self.assert_eq( + psdf.groupby(["a"])["b"].nsmallest(2).sort_index(), + pdf.groupby(["a"])["b"].nsmallest(2).sort_index(), + ) + self.assert_eq( + (psdf.b * 10).groupby(psdf.a).nsmallest(2).sort_index(), + (pdf.b * 10).groupby(pdf.a).nsmallest(2).sort_index(), + ) + self.assert_eq( + psdf.b.rename().groupby(psdf.a).nsmallest(2).sort_index(), + pdf.b.rename().groupby(pdf.a).nsmallest(2).sort_index(), + ) + self.assert_eq( + psdf.b.groupby(psdf.a.rename()).nsmallest(2).sort_index(), + pdf.b.groupby(pdf.a.rename()).nsmallest(2).sort_index(), + ) + self.assert_eq( + psdf.b.rename().groupby(psdf.a.rename()).nsmallest(2).sort_index(), + pdf.b.rename().groupby(pdf.a.rename()).nsmallest(2).sort_index(), + ) + with self.assertRaisesRegex(ValueError, "nsmallest do not support multi-index now"): + psdf.set_index(["a", "b"]).groupby(["c"])["d"].nsmallest(1) + + +class NlargestNsmallestTests(NlargestNsmallestTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_nlargest_nsmallest import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_raises.py b/python/pyspark/pandas/tests/groupby/test_raises.py new file mode 100644 index 0000000000000..4fc134f5e06b1 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_raises.py @@ -0,0 +1,52 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class RaisesTestsMixin: + def test_raises(self): + psdf = ps.DataFrame( + {"a": [1, 2, 6, 4, 4, 6, 4, 3, 7], "b": [4, 2, 7, 3, 3, 1, 1, 1, 2]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + # test raises with incorrect key + self.assertRaises(ValueError, lambda: psdf.groupby([])) + self.assertRaises(KeyError, lambda: psdf.groupby("x")) + self.assertRaises(KeyError, lambda: psdf.groupby(["a", "x"])) + self.assertRaises(KeyError, lambda: psdf.groupby("a")["x"]) + self.assertRaises(KeyError, lambda: psdf.groupby("a")["b", "x"]) + self.assertRaises(KeyError, lambda: psdf.groupby("a")[["b", "x"]]) + + +class RaisesTests(RaisesTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_raises import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_rank.py b/python/pyspark/pandas/tests/groupby/test_rank.py new file mode 100644 index 0000000000000..f1efbd3f8a4a5 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_rank.py @@ -0,0 +1,91 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import numpy as np +import pyspark.pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class RankTestsMixin: + def test_rank(self): + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 3, + "b": [1, 1, 2, 3, 5, 8] * 3, + "c": [1, 4, 9, 16, 25, 36] * 3, + }, + index=np.random.rand(6 * 3), + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.groupby("b").rank().sort_index(), pdf.groupby("b").rank().sort_index()) + self.assert_eq( + psdf.groupby(["a", "b"]).rank().sort_index(), + pdf.groupby(["a", "b"]).rank().sort_index(), + ) + self.assert_eq( + psdf.groupby(["b"])["a"].rank().sort_index(), + pdf.groupby(["b"])["a"].rank().sort_index(), + ) + self.assert_eq( + psdf.groupby(["b"])[["a", "c"]].rank().sort_index(), + pdf.groupby(["b"])[["a", "c"]].rank().sort_index(), + ) + self.assert_eq( + psdf.groupby(psdf.b // 5).rank().sort_index(), + pdf.groupby(pdf.b // 5).rank().sort_index(), + ) + self.assert_eq( + psdf.groupby(psdf.b // 5)["a"].rank().sort_index(), + pdf.groupby(pdf.b // 5)["a"].rank().sort_index(), + ) + + self.assert_eq(psdf.groupby("b").rank().sum(), pdf.groupby("b").rank().sum()) + self.assert_eq(psdf.groupby(["b"])["a"].rank().sum(), pdf.groupby(["b"])["a"].rank().sum()) + + # multi-index columns + columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) + pdf.columns = columns + psdf.columns = columns + + self.assert_eq( + psdf.groupby(("x", "b")).rank().sort_index(), + pdf.groupby(("x", "b")).rank().sort_index(), + ) + self.assert_eq( + psdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), + pdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(), + ) + + +class RankTests(RankTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_rank import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_size.py b/python/pyspark/pandas/tests/groupby/test_size.py new file mode 100644 index 0000000000000..8d187ba377a37 --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_size.py @@ -0,0 +1,70 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import pyspark.pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class SizeTestsMixin: + def test_size(self): + pdf = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, 3]}) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.groupby("A").size().sort_index(), pdf.groupby("A").size().sort_index()) + self.assert_eq( + psdf.groupby("A")["B"].size().sort_index(), pdf.groupby("A")["B"].size().sort_index() + ) + self.assert_eq( + psdf.groupby("A")[["B"]].size().sort_index(), + pdf.groupby("A")[["B"]].size().sort_index(), + ) + self.assert_eq( + psdf.groupby(["A", "B"]).size().sort_index(), + pdf.groupby(["A", "B"]).size().sort_index(), + ) + + # multi-index columns + columns = pd.MultiIndex.from_tuples([("X", "A"), ("Y", "B")]) + pdf.columns = columns + psdf.columns = columns + + self.assert_eq( + psdf.groupby(("X", "A")).size().sort_index(), + pdf.groupby(("X", "A")).size().sort_index(), + ) + self.assert_eq( + psdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), + pdf.groupby([("X", "A"), ("Y", "B")]).size().sort_index(), + ) + + +class SizeTests(SizeTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_size import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/groupby/test_value_counts.py b/python/pyspark/pandas/tests/groupby/test_value_counts.py new file mode 100644 index 0000000000000..5ed2d1b0583dc --- /dev/null +++ b/python/pyspark/pandas/tests/groupby/test_value_counts.py @@ -0,0 +1,104 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import numpy as np +import pyspark.pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class ValueCountsTestsMixin: + def test_value_counts(self): + pdf = pd.DataFrame( + {"A": [np.nan, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, np.nan]}, columns=["A", "B"] + ) + psdf = ps.from_pandas(pdf) + self.assert_eq( + psdf.groupby("A")["B"].value_counts().sort_index(), + pdf.groupby("A")["B"].value_counts().sort_index(), + ) + self.assert_eq( + psdf.groupby("A")["B"].value_counts(dropna=False).sort_index(), + pdf.groupby("A")["B"].value_counts(dropna=False).sort_index(), + almost=True, + ) + self.assert_eq( + psdf.groupby("A", dropna=False)["B"].value_counts(dropna=False).sort_index(), + pdf.groupby("A", dropna=False)["B"].value_counts(dropna=False).sort_index(), + # Returns are the same considering values and types, + # disable check_exact to pass the assert_eq + check_exact=False, + ) + self.assert_eq( + psdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index(), + pdf.groupby("A")["B"].value_counts(sort=True, ascending=False).sort_index(), + ) + self.assert_eq( + psdf.groupby("A")["B"] + .value_counts(sort=True, ascending=False, dropna=False) + .sort_index(), + pdf.groupby("A")["B"] + .value_counts(sort=True, ascending=False, dropna=False) + .sort_index(), + almost=True, + ) + self.assert_eq( + psdf.groupby("A")["B"] + .value_counts(sort=True, ascending=True, dropna=False) + .sort_index(), + pdf.groupby("A")["B"] + .value_counts(sort=True, ascending=True, dropna=False) + .sort_index(), + almost=True, + ) + self.assert_eq( + psdf.B.rename().groupby(psdf.A).value_counts().sort_index(), + pdf.B.rename().groupby(pdf.A).value_counts().sort_index(), + ) + self.assert_eq( + psdf.B.rename().groupby(psdf.A, dropna=False).value_counts().sort_index(), + pdf.B.rename().groupby(pdf.A, dropna=False).value_counts().sort_index(), + # Returns are the same considering values and types, + # disable check_exact to pass the assert_eq + check_exact=False, + ) + self.assert_eq( + psdf.B.groupby(psdf.A.rename()).value_counts().sort_index(), + pdf.B.groupby(pdf.A.rename()).value_counts().sort_index(), + ) + self.assert_eq( + psdf.B.rename().groupby(psdf.A.rename()).value_counts().sort_index(), + pdf.B.rename().groupby(pdf.A.rename()).value_counts().sort_index(), + ) + + +class ValueCountsTests(ValueCountsTestsMixin, PandasOnSparkTestCase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.groupby.test_value_counts import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2)