Skip to content

Commit 3d6066e

Browse files
parente authored and holdenk committed
[SPARK-21094][PYTHON] Add popen_kwargs to launch_gateway
## What changes were proposed in this pull request?

Allow the caller to customize the py4j JVM subprocess pipes and buffers for programmatic capturing of its output.

https://issues.apache.org/jira/browse/SPARK-21094 has more detail about the use case.

## How was this patch tested?

Tested by running the pyspark unit tests locally.

Closes #18339 from parente/feature/SPARK-21094-popen-args.

Lead-authored-by: Peter Parente <parente@cs.unc.edu>
Co-authored-by: Peter Parente <peter.parente@maxpoint.com>
Signed-off-by: Holden Karau <holden@pigscanfly.ca>
1 parent 28ced38 commit 3d6066e

File tree

1 file changed

+16
-3
lines changed

1 file changed

+16
-3
lines changed

python/pyspark/java_gateway.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,15 +36,21 @@
3636
from pyspark.util import _exception_message
3737

3838

39-
def launch_gateway(conf=None):
39+
def launch_gateway(conf=None, popen_kwargs=None):
4040
"""
4141
launch jvm gateway
4242
:param conf: spark configuration passed to spark-submit
43+
:param popen_kwargs: Dictionary of kwargs to pass to Popen when spawning
44+
the py4j JVM. This is a developer feature intended for use in
45+
customizing how pyspark interacts with the py4j JVM (e.g., capturing
46+
stdout/stderr).
4347
:return:
4448
"""
4549
if "PYSPARK_GATEWAY_PORT" in os.environ:
4650
gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
4751
gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
52+
# Process already exists
53+
proc = None
4854
else:
4955
SPARK_HOME = _find_spark_home()
5056
# Launch the Py4j gateway using Spark's run command so that we pick up the
@@ -75,15 +81,20 @@ def launch_gateway(conf=None):
7581
env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file
7682

7783
# Launch the Java gateway.
84+
popen_kwargs = {} if popen_kwargs is None else popen_kwargs
7885
# We open a pipe to stdin so that the Java gateway can die when the pipe is broken
86+
popen_kwargs['stdin'] = PIPE
87+
# We always set the necessary environment variables.
88+
popen_kwargs['env'] = env
7989
if not on_windows:
8090
# Don't send ctrl-c / SIGINT to the Java gateway:
8191
def preexec_func():
8292
signal.signal(signal.SIGINT, signal.SIG_IGN)
83-
proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
93+
popen_kwargs['preexec_fn'] = preexec_func
94+
proc = Popen(command, **popen_kwargs)
8495
else:
8596
# preexec_fn not supported on Windows
86-
proc = Popen(command, stdin=PIPE, env=env)
97+
proc = Popen(command, **popen_kwargs)
8798

8899
# Wait for the file to appear, or for the process to exit, whichever happens first.
89100
while not proc.poll() and not os.path.isfile(conn_info_file):
@@ -118,6 +129,8 @@ def killChild():
118129
gateway = JavaGateway(
119130
gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
120131
auto_convert=True))
132+
# Store a reference to the Popen object for use by the caller (e.g., in reading stdout/stderr)
133+
gateway.proc = proc
121134

122135
# Import the classes used by PySpark
123136
java_import(gateway.jvm, "org.apache.spark.SparkConf")

0 commit comments

Comments
 (0)