apache · allisonwang-db · Jan 16, 2025 · Jan 17, 2025 · Jan 23, 2025 · Jan 23, 2025
diff --git a/python/pyspark/sql/streaming/python_streaming_source_runner.py b/python/pyspark/sql/streaming/python_streaming_source_runner.py
@@ -21,7 +21,7 @@
 from typing import IO, Iterator, Tuple
 
 from pyspark.accumulators import _accumulatorRegistry
-from pyspark.errors import IllegalArgumentException, PySparkAssertionError, PySparkRuntimeError
+from pyspark.errors import IllegalArgumentException, PySparkAssertionError
 from pyspark.serializers import (
     read_int,
     write_int,
@@ -78,6 +78,7 @@ def partitions_func(
     start_offset = json.loads(utf8_deserializer.loads(infile))
     end_offset = json.loads(utf8_deserializer.loads(infile))
     partitions = reader.partitions(start_offset, end_offset)
+
     # Return the serialized partition values.
     write_int(len(partitions), outfile)
     for partition in partitions:
@@ -183,12 +184,6 @@ def main(infile: IO, outfile: IO) -> None:
                         },
                     )
                 outfile.flush()
-        except Exception as e:
-            error_msg = "data source {} throw exception: {}".format(data_source.name, e)
-            raise PySparkRuntimeError(
-                errorClass="PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR",
-                messageParameters={"msg": error_msg},
-            )
         finally:
             reader.stop()
     except BaseException as e:

diff --git a/python/pyspark/sql/worker/python_streaming_sink_runner.py b/python/pyspark/sql/worker/python_streaming_sink_runner.py
@@ -21,7 +21,7 @@
 from typing import IO
 
 from pyspark.accumulators import _accumulatorRegistry
-from pyspark.errors import PySparkAssertionError, PySparkRuntimeError
+from pyspark.errors import PySparkAssertionError
 from pyspark.serializers import (
     read_bool,
     read_int,
@@ -96,44 +96,36 @@ def main(infile: IO, outfile: IO) -> None:
             )
         # Receive the `overwrite` flag.
         overwrite = read_bool(infile)
-        # Instantiate data source reader.
-        try:
-            # Create the data source writer instance.
-            writer = data_source.streamWriter(schema=schema, overwrite=overwrite)
-
-            # Receive the commit messages.
-            num_messages = read_int(infile)
-            commit_messages = []
-            for _ in range(num_messages):
-                message = pickleSer._read_with_length(infile)
-                if message is not None and not isinstance(message, WriterCommitMessage):
-                    raise PySparkAssertionError(
-                        errorClass="DATA_SOURCE_TYPE_MISMATCH",
-                        messageParameters={
-                            "expected": "an instance of WriterCommitMessage",
-                            "actual": f"'{type(message).__name__}'",
-                        },
-                    )
-                commit_messages.append(message)
-
-            batch_id = read_long(infile)
-            abort = read_bool(infile)
-
-            # Commit or abort the Python data source write.
-            # Note the commit messages can be None if there are failed tasks.
-            if abort:
-                writer.abort(commit_messages, batch_id)
-            else:
-                writer.commit(commit_messages, batch_id)
-            # Send a status code back to JVM.
-            write_int(0, outfile)
-            outfile.flush()
-        except Exception as e:
-            error_msg = "data source {} throw exception: {}".format(data_source.name, e)
-            raise PySparkRuntimeError(
-                errorClass="PYTHON_STREAMING_DATA_SOURCE_RUNTIME_ERROR",
-                messageParameters={"action": "commitOrAbort", "error": error_msg},
-            )
+        # Create the data source writer instance.
+        writer = data_source.streamWriter(schema=schema, overwrite=overwrite)
+        # Receive the commit messages.
+        num_messages = read_int(infile)
+
+        commit_messages = []
+        for _ in range(num_messages):
+            message = pickleSer._read_with_length(infile)
+            if message is not None and not isinstance(message, WriterCommitMessage):
+                raise PySparkAssertionError(
+                    errorClass="DATA_SOURCE_TYPE_MISMATCH",
+                    messageParameters={
+                        "expected": "an instance of WriterCommitMessage",
+                        "actual": f"'{type(message).__name__}'",
+                    },
+                )
+            commit_messages.append(message)
+
+        batch_id = read_long(infile)
+        abort = read_bool(infile)
+
+        # Commit or abort the Python data source write.
+        # Note that individual commit messages can be None if there are failed tasks.
+        if abort:
+            writer.abort(commit_messages, batch_id)
+        else:
+            writer.commit(commit_messages, batch_id)
+        # Send a status code back to JVM.
+        write_int(0, outfile)
+        outfile.flush()
     except BaseException as e:
         handle_worker_exception(e, outfile)
         sys.exit(-1)

diff --git a/...cala/org/apache/spark/sql/execution/python/streaming/PythonStreamingDataSourceSuite.scala b/...cala/org/apache/spark/sql/execution/python/streaming/PythonStreamingDataSourceSuite.scala
@@ -266,7 +266,6 @@ class PythonStreamingDataSourceSimpleSuite extends PythonDataSourceSuiteBase {
         )
       )
       assert(err.getMessage.contains(msg))
-      assert(err.getMessage.contains("ErrorDataSource"))
       stream.stop()
     }
 
@@ -332,7 +331,6 @@ class PythonStreamingDataSourceSimpleSuite extends PythonDataSourceSuiteBase {
         )
       )
       assert(err.getMessage.contains(msg))
-      assert(err.getMessage.contains("ErrorDataSource"))
       stream.stop()
     }
 
@@ -669,7 +667,6 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase {
         )
       )
       assert(err.getMessage.contains(msg))
-      assert(err.getMessage.contains("ErrorDataSource"))
       stream.stop()
     }
 
@@ -731,7 +728,6 @@ class PythonStreamingDataSourceSuite extends PythonDataSourceSuiteBase {
         )
       )
       assert(err.getMessage.contains(msg))
-      assert(err.getMessage.contains("ErrorDataSource"))
       stream.stop()
     }