5 changes: 5 additions & 0 deletions python/pyspark/errors/error_classes.py
@@ -658,6 +658,11 @@
       "Expected <expected> values for `<item>`, got <actual>."
     ]
   },
+  "UDF_RETURN_TYPE" : {
+    "message" : [
+      "Return type of the user-defined function should be <expected>, but is <actual>."
+    ]
+  },
   "UDTF_EXEC_ERROR" : {
     "message" : [
       "User defined table function encountered an error in the '<method_name>' method: <error>"
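For context, a minimal sketch of how an error-class entry like this is typically surfaced: PySpark's error framework fills the <expected> and <actual> placeholders from message_parameters. The actual call site is not part of this diff; PySparkRuntimeError and the parameter values below are illustrative only.

from pyspark.errors import PySparkRuntimeError

try:
    raise PySparkRuntimeError(
        error_class="UDF_RETURN_TYPE",
        message_parameters={"expected": "pandas.DataFrame", "actual": "tuple"},
    )
except PySparkRuntimeError as e:
    # Rendered from the template in error_classes.py, roughly:
    # [UDF_RETURN_TYPE] Return type of the user-defined function should be pandas.DataFrame, but is tuple.
    print(e)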
2 changes: 1 addition & 1 deletion python/pyspark/pandas/frame.py
@@ -399,7 +399,7 @@ class DataFrame(Frame, Generic[T]):
        `compute.ops_on_diff_frames` should be turned on;
     2, when `data` is a local dataset (Pandas DataFrame/numpy ndarray/list/etc),
        it will first collect the `index` to driver if necessary, and then apply
-       the `Pandas.DataFrame(...)` creation internally;
+       the `pandas.DataFrame(...)` creation internally;

     Examples
     --------
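The corrected docstring describes the local-data construction path. A small sketch of that path, assuming a running SparkSession; the data values are made up:

import pandas as pd
import pyspark.pandas as ps

# Local pandas data: pandas-on-Spark handles the index on the driver and builds
# the frame via pandas.DataFrame(...) internally before distributing it.
pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
psdf = ps.DataFrame(pdf)
print(psdf.sort_index().to_pandas())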
2 changes: 1 addition & 1 deletion python/pyspark/sql/pandas/serializers.py
@@ -155,7 +155,7 @@ def wrap_and_init_stream():

 class ArrowStreamPandasSerializer(ArrowStreamSerializer):
     """
-    Serializes Pandas.Series as Arrow data with Arrow streaming format.
+    Serializes pandas.Series as Arrow data with Arrow streaming format.

     Parameters
     ----------
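As a rough illustration of what "pandas.Series as Arrow data" means, here is plain pyarrow doing the same conversion; this is not the serializer's actual code path, just the underlying idea:

import pandas as pd
import pyarrow as pa

s = pd.Series([1, 2, 3], name="v")
arr = pa.Array.from_pandas(s)                     # pandas.Series -> pyarrow.Array
batch = pa.RecordBatch.from_arrays([arr], ["v"])  # one-column batch, streamable
print(batch.num_rows, batch.schema)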
3 changes: 2 additions & 1 deletion python/pyspark/sql/tests/connect/test_parity_arrow_map.py
@@ -22,7 +22,8 @@


 class ArrowMapParityTests(MapInArrowTestsMixin, ReusedConnectTestCase):
-    pass
+    def test_other_than_recordbatch_iter(self):
+        self.check_other_than_recordbatch_iter()


 if __name__ == "__main__":
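The new parity test re-enables the mixin's check that mapInArrow rejects functions that do not yield pyarrow.RecordBatch objects. A hedged sketch of that misuse (the exact error text comes from the new error class and may differ slightly):

import pyarrow as pa
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

def not_record_batches(iterator):
    for batch in iterator:
        yield batch.to_pydict()  # a dict, not a pyarrow.RecordBatch

# Expected to fail at runtime with the improved return-type error:
# df.mapInArrow(not_record_batches, df.schema).collect()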
23 changes: 21 additions & 2 deletions python/pyspark/sql/tests/connect/test_parity_pandas_map.py
@@ -14,16 +14,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import unittest


 from pyspark.sql.tests.pandas.test_pandas_map import MapInPandasTestsMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase


 class MapInPandasParityTests(MapInPandasTestsMixin, ReusedConnectTestCase):
+    def test_other_than_dataframe_iter(self):
+        self.check_other_than_dataframe_iter()
+
+    def test_dataframes_with_other_column_names(self):
+        self.check_dataframes_with_other_column_names()
+
+    def test_dataframes_with_duplicate_column_names(self):
+        self.check_dataframes_with_duplicate_column_names()
+
+    def test_dataframes_with_less_columns(self):
+        self.check_dataframes_with_less_columns()
+
+    @unittest.skip("Fails in Spark Connect, should enable.")
+    def test_dataframes_with_incompatible_types(self):
+        self.check_dataframes_with_incompatible_types()
+
+    def test_empty_dataframes_with_less_columns(self):
+        self.check_empty_dataframes_with_less_columns()
+
-    def test_other_than_dataframe(self):
-        self.check_other_than_dataframe()
+    def test_empty_dataframes_with_other_columns(self):
+        self.check_empty_dataframes_with_other_columns()


 if __name__ == "__main__":
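These overrides simply run the mixin's checks under Spark Connect. For reference, a sketch of the kind of misuse that check_other_than_dataframe_iter covers; the DataFrame and schema below are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(3)

def not_dataframes(iterator):
    for pdf in iterator:
        yield pdf.to_numpy()  # ndarray, not a pandas.DataFrame

# Expected to raise PythonException with the new, clearer return-type message:
# df.mapInPandas(not_dataframes, "id long").collect()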
4 changes: 1 addition & 3 deletions python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
@@ -56,7 +56,6 @@ class CogroupedApplyInPandasTestsMixin:
     def data1(self):
         return (
             self.spark.range(10)
-            .toDF("id")
             .withColumn("ks", array([lit(i) for i in range(20, 30)]))
             .withColumn("k", explode(col("ks")))
             .withColumn("v", col("k") * 10)
@@ -67,7 +66,6 @@ def data1(self):
     def data2(self):
         return (
             self.spark.range(10)
-            .toDF("id")
             .withColumn("ks", array([lit(i) for i in range(20, 30)]))
             .withColumn("k", explode(col("ks")))
             .withColumn("v2", col("k") * 100)
@@ -168,7 +166,7 @@ def check_apply_in_pandas_not_returning_pandas_dataframe(self):
             fn=lambda lft, rgt: lft.size + rgt.size,
             error_class=PythonException,
             error_message_regex="Return type of the user-defined function "
-            "should be pandas.DataFrame, but is <class 'numpy.int64'>",
+            "should be pandas.DataFrame, but is int64.",
         )

     def test_apply_in_pandas_returning_column_names(self):
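The updated regex matches the shorter message now produced when a cogrouped applyInPandas UDF returns something other than a pandas.DataFrame. A sketch of that failure mode; the data and output schema below are made up:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, 1.0), (2, 2.0)], ("id", "v"))
right = spark.createDataFrame([(1, "x"), (2, "y")], ("id", "w"))

def bad_merge(lft: pd.DataFrame, rgt: pd.DataFrame):
    return lft.size + rgt.size  # numpy.int64, not a pandas.DataFrame

cogrouped = left.groupby("id").cogroup(right.groupby("id"))
# Expected to fail with "... should be pandas.DataFrame, but is int64.":
# cogrouped.applyInPandas(bad_merge, "id long, v double, w string").collect()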
4 changes: 1 addition & 3 deletions python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py
@@ -79,7 +79,6 @@ class GroupedApplyInPandasTestsMixin:
     def data(self):
         return (
             self.spark.range(10)
-            .toDF("id")
             .withColumn("vs", array([lit(i) for i in range(20, 30)]))
             .withColumn("v", explode(col("vs")))
             .drop("vs")
@@ -287,8 +286,7 @@ def test_apply_in_pandas_not_returning_pandas_dataframe(self):
     def check_apply_in_pandas_not_returning_pandas_dataframe(self):
         with self.assertRaisesRegex(
             PythonException,
-            "Return type of the user-defined function should be pandas.DataFrame, "
-            "but is <class 'tuple'>",
+            "Return type of the user-defined function should be pandas.DataFrame, but is tuple.",
         ):
             self._test_apply_in_pandas(lambda key, pdf: key)

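For contrast with the failing case above, a sketch of the contract the grouped-map API expects, i.e. the UDF returning a pandas.DataFrame; the example data and schema are illustrative:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0)], ("id", "v"))

def normalize(pdf: pd.DataFrame) -> pd.DataFrame:
    v = pdf.v
    return pdf.assign(v=(v - v.mean()) / v.std())  # returns a pandas.DataFrame

df.groupby("id").applyInPandas(normalize, schema="id long, v double").show()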