747 changes: 376 additions & 371 deletions python/pyspark/errors/error_classes.py

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions python/pyspark/errors/exceptions/__init__.py
@@ -14,3 +14,43 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#


def _write_self() -> None:
    import json
    from pyspark.errors import error_classes

    with open("python/pyspark/errors/error_classes.py", "w") as f:
        error_class_py_file = """#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: Automatically sort this file via
[Review comment on this line]

Contributor:
If error_classes.py is not meant to be edited manually, I would add a clear warning here so people don't mistakenly edit the file, similar to #44847.

Member (Author):
Oh, actually this case is slightly different: the file has to be manually edited first, and then reformatted via that code :-).

# - cd $SPARK_HOME
# - bin/pyspark
# - from pyspark.errors.exceptions import _write_self; _write_self()
import json
ERROR_CLASSES_JSON = '''
%s
'''
ERROR_CLASSES_MAP = json.loads(ERROR_CLASSES_JSON)
""" % json.dumps(
            error_classes.ERROR_CLASSES_MAP, sort_keys=True, indent=2
        )
        f.write(error_class_py_file)
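
For context, the regeneration flow described in the NOTE (and in the author's reply above) is: hand-edit ERROR_CLASSES_JSON in error_classes.py, then run _write_self() to rewrite the file in canonical form. A minimal sketch of that round trip, assuming a PySpark checkout with $SPARK_HOME as the working directory:

    # Run from $SPARK_HOME, e.g. inside bin/pyspark, after hand-editing
    # ERROR_CLASSES_JSON in python/pyspark/errors/error_classes.py.
    from pyspark.errors.exceptions import _write_self

    # Rewrites python/pyspark/errors/error_classes.py in canonical form:
    # keys sorted alphabetically, JSON re-indented with two spaces.
    _write_self()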
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/dataframe.py
@@ -1663,7 +1663,7 @@ def sampleBy(
                         "arg_name": "fractions",
                         "arg_type": type(fractions).__name__,
                         "allowed_types": "float, int, str",
-                        "return_type": type(k).__name__,
+                        "item_type": type(k).__name__,
                     },
                 )
             fractions[k] = float(v)
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/session.py
@@ -347,7 +347,7 @@ def createDataFrame(
         if isinstance(data, DataFrame):
             raise PySparkTypeError(
                 error_class="INVALID_TYPE",
-                message_parameters={"arg_name": "data", "data_type": "DataFrame"},
+                message_parameters={"arg_name": "data", "arg_type": "DataFrame"},
             )

         _schema: Optional[Union[AtomicType, StructType]] = None
6 changes: 4 additions & 2 deletions python/pyspark/sql/dataframe.py
@@ -1319,6 +1319,7 @@ def hint(
                     error_class="DISALLOWED_TYPE_FOR_CONTAINER",
                     message_parameters={
                         "arg_name": "parameters",
+                        "arg_type": type(parameters).__name__,
[Review comment on this line]

Contributor:
Hmm... this also seems to be problematic. It seems that an error should have occurred if a parameter defined in the template was actually missing. Let me investigate this too.

"allowed_types": allowed_types_repr,
"item_type": type(p).__name__,
},
Expand All @@ -1329,6 +1330,7 @@ def hint(
error_class="DISALLOWED_TYPE_FOR_CONTAINER",
message_parameters={
"arg_name": "parameters",
"arg_type": type(parameters).__name__,
"allowed_types": allowed_types_repr,
"item_type": type(p).__name__ + "[" + type(p[0]).__name__ + "]",
},
Expand Down Expand Up @@ -2385,7 +2387,7 @@ def sampleBy(
"arg_name": "fractions",
"arg_type": type(fractions).__name__,
"allowed_types": "float, int, str",
"return_type": type(k).__name__,
"item_type": type(k).__name__,
},
)
fractions[k] = float(v)
Expand Down Expand Up @@ -5839,7 +5841,7 @@ def approxQuantile(
"arg_name": "col",
"arg_type": type(col).__name__,
"allowed_types": "str",
"return_type": type(c).__name__,
"item_type": type(c).__name__,
},
)
col = _to_list(self._sc, cast(List["ColumnOrName"], col))
Expand Down
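
The two hint hunks above add the arg_type entry that the DISALLOWED_TYPE_FOR_CONTAINER template already references; as the reviewer's comment notes, its absence had gone undetected. A minimal repro sketch of that error path, assuming a local SparkSession bound to spark:

    from pyspark.errors import PySparkTypeError

    df = spark.range(10)
    try:
        # dict is not an allowed hint parameter type (str, float, int, Column, list)
        df.hint("broadcast", {"bad": "param"})
    except PySparkTypeError as e:
        print(e.getErrorClass())         # DISALLOWED_TYPE_FOR_CONTAINER
        print(e.getMessageParameters())  # now includes "arg_type": "tuple",
                                         # since *parameters arrives as a tuple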
2 changes: 1 addition & 1 deletion python/pyspark/sql/session.py
@@ -1421,7 +1421,7 @@ def createDataFrame(  # type: ignore[misc]
         if isinstance(data, DataFrame):
             raise PySparkTypeError(
                 error_class="INVALID_TYPE",
-                message_parameters={"arg_name": "data", "data_type": "DataFrame"},
+                message_parameters={"arg_name": "data", "arg_type": "DataFrame"},
             )

         if isinstance(schema, str):
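
Both the Spark Connect session (above) and the classic session here guard the same misuse: passing an existing DataFrame as the data argument. A quick repro sketch, assuming a running session spark:

    from pyspark.errors import PySparkTypeError

    df = spark.createDataFrame([(1, "a")], ["id", "value"])
    try:
        spark.createDataFrame(df)  # a DataFrame is not valid input data
    except PySparkTypeError as e:
        print(e.getErrorClass())         # INVALID_TYPE
        print(e.getMessageParameters())  # {'arg_name': 'data', 'arg_type': 'DataFrame'}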
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -2282,7 +2282,7 @@ def test_stat_sample_by(self):
                 "arg_name": "fractions",
                 "arg_type": "dict",
                 "allowed_types": "float, int, str",
-                "return_type": "NoneType",
+                "item_type": "NoneType",
             },
         )
4 changes: 2 additions & 2 deletions python/pyspark/sql/tests/test_dataframe.py
@@ -1101,7 +1101,7 @@ def test_observe(self):
         # observation requires name (if given) to be non empty string
         with self.assertRaisesRegex(TypeError, "`name` should be a str, got int"):
             Observation(123)
-        with self.assertRaisesRegex(ValueError, "`name` must be a non empty string, got ''."):
+        with self.assertRaisesRegex(ValueError, "`name` must be a non-empty string, got ''."):
             Observation("")

         # dataframe.observe requires at least one expr
@@ -2034,7 +2034,7 @@ def test_invalid_argument_create_dataframe(self):
         self.check_error(
             exception=pe.exception,
             error_class="INVALID_TYPE",
-            message_parameters={"arg_name": "data", "data_type": "DataFrame"},
+            message_parameters={"arg_name": "data", "arg_type": "DataFrame"},
         )
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/test_functions.py
@@ -190,7 +190,7 @@ def test_sampleby(self):
                 "arg_name": "fractions",
                 "arg_type": "dict",
                 "allowed_types": "float, int, str",
-                "return_type": "NoneType",
+                "item_type": "NoneType",
             },
         )
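
Both test updates assert the parameters users now see in the message. The failure they cover can be reproduced directly; a sketch, assuming a session spark:

    from pyspark.errors import PySparkTypeError

    df = spark.range(10)
    try:
        df.sampleBy("id", fractions={None: 0.5})  # a None key is not float/int/str
    except PySparkTypeError as e:
        print(e.getMessageParameters()["item_type"])  # NoneType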
2 changes: 1 addition & 1 deletion python/pyspark/sql/types.py
@@ -2236,7 +2236,7 @@ def verify_udf(obj: Any) -> None:
                     error_class="NOT_INSTANCE_OF",
                     message_parameters={
                         "value": str(obj),
-                        "data_type": str(dataType),
+                        "type": str(dataType),
                     },
                 )
             verifier(dataType.toInternal(obj))
2 changes: 1 addition & 1 deletion python/pyspark/worker.py
@@ -1175,7 +1175,7 @@ def verify_result(result):
             raise PySparkTypeError(
                 error_class="INVALID_ARROW_UDTF_RETURN_TYPE",
                 message_parameters={
-                    "type_name": type(result).__name__,
+                    "return_type": type(result).__name__,
                     "value": str(result),
                     "func": f.__name__,
                 },
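
Stepping back, each error_class names a message template in ERROR_CLASSES_MAP, and the renames in this PR make the call-site message_parameters keys match the placeholder names those templates actually use. A hypothetical consistency check (not part of this PR) that could have flagged such mismatches, assuming the templates use <name>-style placeholders:

    import re

    from pyspark.errors.error_classes import ERROR_CLASSES_MAP

    def template_params(error_class: str) -> set:
        # Collect <placeholder> names from the error class's message template.
        template = " ".join(ERROR_CLASSES_MAP[error_class]["message"])
        return set(re.findall(r"<(\w+)>", template))

    # The old sampleBy/approxQuantile call sites passed "return_type", but the
    # template expects "item_type", so that placeholder went unfilled:
    expected = template_params("DISALLOWED_TYPE_FOR_CONTAINER")
    provided = {"arg_name", "arg_type", "allowed_types", "return_type"}
    print(expected - provided)  # placeholders left unfilled, e.g. {'item_type'}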