Closed
Changes from all commits
Commits
32 commits
5775a29
[SPARK-52518][TEST] Fix package path
xu20160924 Jun 24, 2025
ad17c2b
[SPARK-52518][TEST] Fix package path
xu20160924 Jun 24, 2025
c035246
Merge branch 'apache:master' into master
xu20160924 Jul 3, 2025
d447ab2
Merge branch 'apache:master' into master
xu20160924 Jul 6, 2025
28cc475
Merge branch 'apache:master' into master
xu20160924 Jul 7, 2025
f747d58
Merge branch 'apache:master' into master
xu20160924 Jul 14, 2025
d138b19
Merge branch 'apache:master' into master
xu20160924 Jul 19, 2025
f000cf1
Merge branch 'apache:master' into master
xu20160924 Aug 7, 2025
3531c3c
Merge branch 'apache:master' into master
xu20160924 Oct 22, 2025
59bcb65
Merge branch 'apache:master' into master
xu20160924 Oct 22, 2025
e4a3fac
Merge branch 'apache:master' into master
xu20160924 Oct 22, 2025
34bcf3f
Merge branch 'apache:master' into master
xu20160924 Oct 24, 2025
98e7be7
Merge branch 'apache:master' into master
xu20160924 Dec 7, 2025
f692772
[SPARK-54629][CONNECT][TEST] Supplement test coverage for getString w…
xu20160924 Dec 7, 2025
0f75c8c
[SPARK-54629][CONNECT][TEST] Supplement test coverage for getString w…
xu20160924 Dec 7, 2025
e7d0e89
[SPARK-54629][CONNECT][TEST] Supplement test coverage for getString w…
xu20160924 Dec 7, 2025
0b37ffe
[SPARK-54622][SQL] Promote `RequiresDistributionAndOrdering` and its …
dongjoon-hyun Dec 7, 2025
00c09db
[SPARK-54631][PYTHON] Add profiler support for Arrow Grouped Iter Agg…
Yicong-Huang Dec 7, 2025
58ba7db
[SPARK-53615][FOLLOWUP][PYTHON][TEST] Fix test case for arrow grouped…
Yicong-Huang Dec 7, 2025
ff86856
[SPARK-54628][PYTHON] Remove all unnecessary explicit super() arguments
gaogaotiantian Dec 7, 2025
9f89584
[SPARK-54627][PYTHON] Remove redundant initializations in serializers
Yicong-Huang Dec 7, 2025
2ad4040
[SPARK-54623][PYTHON][TEST] Add coverage test for UDTF null checker
gaogaotiantian Dec 7, 2025
22a8d1e
[SPARK-54615][PYTHON] Always pass runner_conf to python worker
gaogaotiantian Dec 7, 2025
ec27785
[SPARK-54068][PYTHON] Fix `to_feather` to support PyArrow 22.0.0
ashrithb Dec 8, 2025
dd289aa
[SPARK-54635][BUILD] Remove commons-lang3 from network-shuffle
pan3793 Dec 8, 2025
c4c2949
[SPARK-54087][CORE][FOLLOWUP] Spark Executor launch task failed retur…
AngersZhuuuu Dec 8, 2025
42159fe
[SPARK-54638][SQL][TEST] Reuse statement instances if possible in `Sp…
cloud-fan Dec 8, 2025
3e6baf6
[SPARK-54640][PYTHON] Replace select.select with select.poll on UNIX
gaogaotiantian Dec 8, 2025
5c2676d
[SPARK-54641][BUILD] Upgrade `ammonite` to 3.0.5
dongjoon-hyun Dec 8, 2025
cbce202
[SPARK-54595][SQL] Keep existing behavior of MERGE INTO without SCHEM…
szehon-ho Dec 8, 2025
0ecd39e
[SPARK-54642][BUILD] Upgrade AWS SDK v2 to 2.35.4
dongjoon-hyun Dec 9, 2025
c1d1625
[SPARK-54581][SQL] Making fetchsize option case-insensitive for Postg…
marko-sisovic-db Dec 9, 2025
5 changes: 0 additions & 5 deletions common/network-shuffle/pom.xml
@@ -42,11 +42,6 @@
<version>${project.version}</version>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>

<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
14 changes: 14 additions & 0 deletions core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
@@ -212,6 +212,8 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
protected val hideTraceback: Boolean = false
protected val simplifiedTraceback: Boolean = false

protected val runnerConf: Map[String, String] = Map.empty

// All the Python functions should have the same exec, version and envvars.
protected val envVars: java.util.Map[String, String] = funcs.head.funcs.head.envVars
protected val pythonExec: String = funcs.head.funcs.head.pythonExec
@@ -403,6 +405,17 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
*/
protected def writeCommand(dataOut: DataOutputStream): Unit

/**
* Writes worker configuration to the stream connected to the Python worker.
*/
protected def writeRunnerConf(dataOut: DataOutputStream): Unit = {
dataOut.writeInt(runnerConf.size)
for ((k, v) <- runnerConf) {
PythonWorkerUtils.writeUTF(k, dataOut)
PythonWorkerUtils.writeUTF(v, dataOut)
}
}

/**
* Writes input data to the stream connected to the Python worker.
* Returns true if any data was written to the stream, false if the input is exhausted.
@@ -532,6 +545,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT](
PythonWorkerUtils.writeBroadcasts(broadcastVars, worker, env, dataOut)

dataOut.writeInt(evalType)
writeRunnerConf(dataOut)
writeCommand(dataOut)

dataOut.flush()
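For context, a minimal sketch of how a Python worker could consume the framing written by writeRunnerConf above (an int count followed by UTF-8 key/value pairs). read_int and UTF8Deserializer are existing helpers in pyspark.serializers; the function name and surrounding wiring are illustrative, not part of this diff.

# Illustrative only: reads the runner conf framing produced by writeRunnerConf.
from pyspark.serializers import read_int, UTF8Deserializer

utf8_deserializer = UTF8Deserializer()

def read_runner_conf(infile):
    runner_conf = {}
    num_conf = read_int(infile)  # matches dataOut.writeInt(runnerConf.size)
    for _ in range(num_conf):
        key = utf8_deserializer.loads(infile)    # PythonWorkerUtils.writeUTF(k, dataOut)
        value = utf8_deserializer.loads(infile)  # PythonWorkerUtils.writeUTF(v, dataOut)
        runner_conf[key] = value
    return runner_conf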
4 changes: 0 additions & 4 deletions core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -407,10 +407,6 @@ private[spark] class Executor(
TaskState.FAILED,
env.closureSerializer.newInstance().serialize(new ExceptionFailure(t, Seq.empty)))
} catch {
case oom: OutOfMemoryError =>
logError(log"Executor update launching task ${MDC(TASK_NAME, taskDescription.name)} " +
log"failed status failed, reason: ${MDC(REASON, oom.getMessage)}")
System.exit(SparkExitCode.OOM)
case t: Throwable =>
logError(log"Executor update launching task ${MDC(TASK_NAME, taskDescription.name)} " +
log"failed status failed, reason: ${MDC(REASON, t.getMessage)}")
2 changes: 1 addition & 1 deletion dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -30,7 +30,7 @@ azure-storage/7.0.1//azure-storage-7.0.1.jar
blas/3.0.4//blas-3.0.4.jar
breeze-macros_2.13/2.1.0//breeze-macros_2.13-2.1.0.jar
breeze_2.13/2.1.0//breeze_2.13-2.1.0.jar
bundle/2.29.52//bundle-2.29.52.jar
bundle/2.35.4//bundle-2.35.4.jar
cats-kernel_2.13/2.8.0//cats-kernel_2.13-2.8.0.jar
chill-java/0.10.0//chill-java-0.10.0.jar
chill_2.13/0.10.0//chill_2.13-0.10.0.jar
4 changes: 2 additions & 2 deletions pom.xml
@@ -162,7 +162,7 @@
<aws.kinesis.client.version>1.15.3</aws.kinesis.client.version>
<!-- Should be consistent with Kinesis client dependency -->
<aws.java.sdk.version>1.12.681</aws.java.sdk.version>
<aws.java.sdk.v2.version>2.29.52</aws.java.sdk.v2.version>
<aws.java.sdk.v2.version>2.35.4</aws.java.sdk.v2.version>
<!-- the producer is used in tests -->
<aws.kinesis.producer.version>1.0.5</aws.kinesis.producer.version>
<!-- Do not use 3.0.0: https://github.com/GoogleCloudDataproc/hadoop-connectors/issues/1114 -->
@@ -233,7 +233,7 @@
./python/packaging/client/setup.py, and ./python/packaging/connect/setup.py too.
-->
<arrow.version>18.3.0</arrow.version>
<ammonite.version>3.0.4</ammonite.version>
<ammonite.version>3.0.5</ammonite.version>
<jjwt.version>0.12.6</jjwt.version>

<!-- org.fusesource.leveldbjni will be used except on arm64 platform. -->
32 changes: 25 additions & 7 deletions python/pyspark/daemon.py
@@ -163,14 +163,26 @@ def handle_sigterm(*args):

# Initialization complete
try:
poller = None
if os.name == "posix":
# select.select has a known limit on the number of file descriptors
# it can handle. We use select.poll instead to avoid this limit.
poller = select.poll()
fd_reverse_map = {0: 0, listen_sock.fileno(): listen_sock}
poller.register(0, select.POLLIN)
poller.register(listen_sock, select.POLLIN)

while True:
try:
ready_fds = select.select([0, listen_sock], [], [], 1)[0]
except select.error as ex:
if ex[0] == EINTR:
continue
else:
raise
if poller is not None:
ready_fds = [fd_reverse_map[fd] for fd, _ in poller.poll(1000)]
else:
try:
ready_fds = select.select([0, listen_sock], [], [], 1)[0]
except select.error as ex:
if ex[0] == EINTR:
continue
else:
raise

if 0 in ready_fds:
try:
@@ -208,6 +220,9 @@ def handle_sigterm(*args):

if pid == 0:
# in child process
if poller is not None:
poller.unregister(0)
poller.unregister(listen_sock)
listen_sock.close()

# It should close the standard input in the child process so that
@@ -256,6 +271,9 @@ def handle_sigterm(*args):
sock.close()

finally:
if poller is not None:
poller.unregister(0)
poller.unregister(listen_sock)
shutdown(1)


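One note on the change above: select.select takes its timeout in seconds while poll.poll takes milliseconds, hence the 1 becoming 1000. A self-contained sketch of the same pattern, with a throwaway socket standing in for listen_sock (an assumption for illustration, not this daemon's actual setup):

# Illustrative only: select.poll avoids select.select's FD_SETSIZE limit
# (commonly 1024 descriptors) and takes its timeout in milliseconds.
import select
import socket

listen_sock = socket.socket()
listen_sock.bind(("127.0.0.1", 0))
listen_sock.listen()

poller = select.poll()
fd_reverse_map = {0: 0, listen_sock.fileno(): listen_sock}  # fd -> original object
poller.register(0, select.POLLIN)            # stdin
poller.register(listen_sock, select.POLLIN)  # register() accepts objects with fileno()

# poll(1000) waits up to 1000 ms, matching select.select(..., 1)'s one-second wait
ready_fds = [fd_reverse_map[fd] for fd, _ in poller.poll(1000)]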
42 changes: 21 additions & 21 deletions python/pyspark/ml/classification.py
@@ -622,7 +622,7 @@ class _LinearSVCParams(
)

def __init__(self, *args: Any) -> None:
super(_LinearSVCParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(
maxIter=100,
regParam=0.0,
@@ -743,7 +743,7 @@ def __init__(
fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
aggregationDepth=2, maxBlockSizeInMB=0.0):
"""
super(LinearSVC, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LinearSVC", self.uid
)
@@ -1019,7 +1019,7 @@ class _LogisticRegressionParams(
)

def __init__(self, *args: Any):
super(_LogisticRegressionParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(
maxIter=100, regParam=0.0, tol=1e-6, threshold=0.5, family="auto", maxBlockSizeInMB=0.0
)
@@ -1328,7 +1328,7 @@ def __init__(
maxBlockSizeInMB=0.0):
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid
)
@@ -1676,7 +1676,7 @@ class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams):
"""

def __init__(self, *args: Any):
super(_DecisionTreeClassifierParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(
maxDepth=5,
maxBins=32,
@@ -1809,7 +1809,7 @@ def __init__(
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
seed=None, weightCol=None, leafCol="", minWeightFractionPerNode=0.0)
"""
super(DecisionTreeClassifier, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid
)
@@ -1970,7 +1970,7 @@ class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams):
"""

def __init__(self, *args: Any):
super(_RandomForestClassifierParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(
maxDepth=5,
maxBins=32,
@@ -2106,7 +2106,7 @@ def __init__(
numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0, \
leafCol="", minWeightFractionPerNode=0.0, weightCol=None, bootstrap=True)
"""
super(RandomForestClassifier, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid
)
@@ -2400,7 +2400,7 @@ class _GBTClassifierParams(_GBTParams, _HasVarianceImpurity):
)

def __init__(self, *args: Any):
super(_GBTClassifierParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(
maxDepth=5,
maxBins=32,
@@ -2577,7 +2577,7 @@ def __init__(
validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, \
weightCol=None)
"""
super(GBTClassifier, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.GBTClassifier", self.uid
)
@@ -2823,7 +2823,7 @@ class _NaiveBayesParams(_PredictorParams, HasWeightCol):
)

def __init__(self, *args: Any):
super(_NaiveBayesParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(smoothing=1.0, modelType="multinomial")

@since("1.5.0")
@@ -2964,7 +2964,7 @@ def __init__(
probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \
modelType="multinomial", thresholds=None, weightCol=None)
"""
super(NaiveBayes, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.NaiveBayes", self.uid
)
@@ -3093,7 +3093,7 @@ class _MultilayerPerceptronParams(
)

def __init__(self, *args: Any):
super(_MultilayerPerceptronParams, self).__init__(*args)
super().__init__(*args)
self._setDefault(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs")

@since("1.6.0")
@@ -3219,7 +3219,7 @@ def __init__(
solver="l-bfgs", initialWeights=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction")
"""
super(MultilayerPerceptronClassifier, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid
)
@@ -3484,7 +3484,7 @@ def __init__(
__init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
"""
super(OneVsRest, self).__init__()
super().__init__()
self._setDefault(parallelism=1)
kwargs = self._input_kwargs
self._set(**kwargs)
@@ -3749,7 @@ def validateParams(instance: Union[OneVsRest, "OneVsRestModel"]) -> None:
@inherit_doc
class OneVsRestReader(MLReader[OneVsRest]):
def __init__(self, cls: Type[OneVsRest]) -> None:
super(OneVsRestReader, self).__init__()
super().__init__()
self.cls = cls

def load(self, path: str) -> OneVsRest:
@@ -3765,7 +3765,7 @@ def load(self, path: str) -> OneVsRest:
@inherit_doc
class OneVsRestWriter(MLWriter):
def __init__(self, instance: OneVsRest):
super(OneVsRestWriter, self).__init__()
super().__init__()
self.instance = instance

def saveImpl(self, path: str) -> None:
@@ -3807,7 +3807,7 @@ def setRawPredictionCol(self, value: str) -> "OneVsRestModel":
return self._set(rawPredictionCol=value)

def __init__(self, models: List[ClassificationModel]):
super(OneVsRestModel, self).__init__()
super().__init__()
self.models = models
if is_remote() or not isinstance(models[0], JavaMLWritable):
return
@@ -3980,7 +3980,7 @@ def write(self) -> MLWriter:
@inherit_doc
class OneVsRestModelReader(MLReader[OneVsRestModel]):
def __init__(self, cls: Type[OneVsRestModel]):
super(OneVsRestModelReader, self).__init__()
super().__init__()
self.cls = cls

def load(self, path: str) -> OneVsRestModel:
@@ -4002,7 +4002,7 @@ def load(self, path: str) -> OneVsRestModel:
@inherit_doc
class OneVsRestModelWriter(MLWriter):
def __init__(self, instance: OneVsRestModel):
super(OneVsRestModelWriter, self).__init__()
super().__init__()
self.instance = instance

def saveImpl(self, path: str) -> None:
@@ -4119,7 +4119,7 @@ def __init__(
miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, \
tol=1e-6, solver="adamW", thresholds=None, seed=None)
"""
super(FMClassifier, self).__init__()
super().__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.FMClassifier", self.uid
)
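The classification.py changes are mechanical: every super(SomeClass, self).__init__(...) call becomes the zero-argument super().__init__(...), which in Python 3 resolves to the same MRO entry. A trivial illustration (Base and Child are made-up names, not classes from this file):

# Illustrative only: zero-argument super() behaves like the explicit form in Python 3.
class Base:
    def __init__(self):
        self.ready = True

class Child(Base):
    def __init__(self):
        super().__init__()  # same behavior as super(Child, self).__init__()

assert Child().ready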