From af4cef40fa48feee32389e2a7fa8f96c1f82fd91 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 25 Jun 2015 19:54:33 -0700
Subject: [PATCH 01/15] Initial attempt at parallelizing Python test execution

---
 python/run-tests.py | 85 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 7d485b500ee3..c1463f4ec606 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -18,12 +18,19 @@
 #
 
 from __future__ import print_function
+import logging
 from optparse import OptionParser
 import os
 import re
 import subprocess
 import sys
+import tempfile
+from threading import Thread, Lock
 import time
+if sys.version < '3':
+    import Queue
+else:
+    import queue as Queue
 
 
 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
@@ -43,34 +50,43 @@ def print_red(text):
 
 
 LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log")
+LOG_FILE_LOCK = Lock()
+LOGGER = logging.getLogger()
 
 
 def run_individual_python_test(test_name, pyspark_python):
     env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
-    print("    Running test: %s ..." % test_name, end='')
+    LOGGER.info("Starting test(%s): %s" % (pyspark_python, test_name))
     start_time = time.time()
-    with open(LOG_FILE, 'a') as log_file:
-        retcode = subprocess.call(
-            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
-            stderr=log_file, stdout=log_file, env=env)
+    per_test_output = tempfile.TemporaryFile()
+    retcode = subprocess.Popen(
+        [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
+        stderr=per_test_output, stdout=per_test_output, env=env).wait()
     duration = time.time() - start_time
+    with LOG_FILE_LOCK:
+        with open(LOG_FILE, 'ab') as log_file:
+            per_test_output.seek(0)
+            log_file.writelines(per_test_output.readlines())
+    per_test_output.close()
     # Exit on the first failure.
     if retcode != 0:
         with open(LOG_FILE, 'r') as log_file:
             for line in log_file:
                 if not re.match('[0-9]+', line):
                     print(line, end='')
-        print_red("\nHad test failures in %s; see logs." % test_name)
-        exit(-1)
+        print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
+        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
+        # this code is invoked from a thread other than the main thread.
+        os._exit(-1)
     else:
-        print("ok (%is)" % duration)
+        LOGGER.info("Finished test(%s): %s (%is)" % (pyspark_python, test_name, duration))
 
 
 def get_default_python_executables():
     python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)]
     if "python2.6" not in python_execs:
-        print("WARNING: Not testing against `python2.6` because it could not be found; falling"
-              " back to `python` instead")
+        LOGGER.warning("Not testing against `python2.6` because it could not be found; falling"
+                       " back to `python` instead")
         python_execs.insert(0, "python")
     return python_execs
 
@@ -88,6 +104,10 @@ def parse_opts():
         default=",".join(sorted(python_modules.keys())),
         help="A comma-separated list of Python modules to test (default: %default)"
     )
+    parser.add_option(
+        "-p", "--parallelism", type="int", default=4,
+        help="The number of suites to test in parallel (default %default)"
+    )
 
     (opts, args) = parser.parse_args()
     if args:
@@ -96,8 +116,9 @@ def parse_opts():
 
 
 def main():
+    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")
     opts = parse_opts()
-    print("Running PySpark tests. Output is in python/%s" % LOG_FILE)
+    LOGGER.info("Running PySpark tests. Output is in python/%s" % LOG_FILE)
     if os.path.exists(LOG_FILE):
         os.remove(LOG_FILE)
     python_execs = opts.python_executables.split(',')
@@ -108,24 +129,38 @@ def main():
         else:
             print("Error: unrecognized module %s" % module_name)
             sys.exit(-1)
-    print("Will test against the following Python executables: %s" % python_execs)
-    print("Will test the following Python modules: %s" % [x.name for x in modules_to_test])
+    LOGGER.info("Will test against the following Python executables: %s" % python_execs)
+    LOGGER.info("Will test the following Python modules: %s" % [x.name for x in modules_to_test])
 
-    start_time = time.time()
+    task_queue = Queue.Queue()
     for python_exec in python_execs:
-        python_implementation = subprocess.check_output(
-            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
-            universal_newlines=True).strip()
-        print("Testing with `%s`: " % python_exec, end='')
-        subprocess.call([python_exec, "--version"])
-
         for module in modules_to_test:
-            if python_implementation not in module.blacklisted_python_implementations:
-                print("Running %s tests ..." % module.name)
-                for test_goal in module.python_test_goals:
-                    run_individual_python_test(test_goal, python_exec)
+            for test_goal in module.python_test_goals:
+                task_queue.put((python_exec, test_goal))
+
+    def process_queue(task_queue):
+        while True:
+            try:
+                (python_exec, test_goal) = task_queue.get_nowait()
+            except Queue.Empty:
+                break
+            try:
+                run_individual_python_test(test_goal, python_exec)
+            finally:
+                task_queue.task_done()
+
+    start_time = time.time()
+    for _ in range(opts.parallelism):
+        worker = Thread(target=process_queue, args=(task_queue,))
+        worker.daemon = True
+        worker.start()
+    try:
+        task_queue.join()
+    except (KeyboardInterrupt, SystemExit):
+        print_red("Exiting due to interrupt")
+        sys.exit(-1)
     total_duration = time.time() - start_time
-    print("Tests passed in %i seconds" % total_duration)
+    LOGGER.info("Tests passed in %i seconds" % total_duration)
 
 
 if __name__ == "__main__":

From 037b6860a1a3caa8c5bab14447bad62b92a486a2 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 25 Jun 2015 19:55:23 -0700
Subject: [PATCH 02/15] Temporarily disable JVM tests so we can test Python
 speedup in Jenkins.

---
 dev/run-tests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index c51b0d3010a0..ce4f28b7c957 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -454,16 +454,16 @@ def main():
     build_apache_spark(build_tool, hadoop_version)
 
     # backwards compatibility checks
-    detect_binary_inop_with_mima()
+    # detect_binary_inop_with_mima()
 
     # run the test suites
-    run_scala_tests(build_tool, hadoop_version, test_modules)
+    # run_scala_tests(build_tool, hadoop_version, test_modules)
 
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
         run_python_tests(modules_with_python_tests)
-    if any(m.should_run_r_tests for m in test_modules):
-        run_sparkr_tests()
+    # if any(m.should_run_r_tests for m in test_modules):
+        # run_sparkr_tests()
 
 
 def _test():

From 9129027ab0b0e21f87152ca2dcff31010df1f9cd Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 25 Jun 2015 22:21:05 -0700
Subject: [PATCH 03/15] Disable Spark UI in Python tests

---
 python/pyspark/java_gateway.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py
index 3cee4ea6e3a3..90cd342a6cf7 100644
--- a/python/pyspark/java_gateway.py
+++ b/python/pyspark/java_gateway.py
@@ -51,6 +51,8 @@ def launch_gateway():
         on_windows = platform.system() == "Windows"
         script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
         submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
+        if os.environ.get("SPARK_TESTING"):
+            submit_args = "--conf spark.ui.enabled=false " + submit_args
         command = [os.path.join(SPARK_HOME, script)] + shlex.split(submit_args)
 
         # Start a socket that will be used by PythonGatewayServer to communicate its port to us

From 8309bfe46c0ed395f9288c772e2eedbfa28a3c14 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 00:18:40 -0700
Subject: [PATCH 04/15] Temporarily disable parallelism to debug a failure

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index c1463f4ec606..9e156be3dcfb 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -105,7 +105,7 @@ def parse_opts():
         help="A comma-separated list of Python modules to test (default: %default)"
     )
     parser.add_option(
-        "-p", "--parallelism", type="int", default=4,
+        "-p", "--parallelism", type="int", default=1,
         help="The number of suites to test in parallel (default %default)"
     )
 

From 87cb988b2544a0a39dd6cf463609298feca8a2b0 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 09:47:33 -0700
Subject: [PATCH 05/15] Skip MLLib tests for PyPy

---
 python/run-tests.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 9e156be3dcfb..9090f494f76f 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -134,9 +134,12 @@ def main():
 
     task_queue = Queue.Queue()
     for python_exec in python_execs:
+        python_implementation = subprocess.check_output(
+            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
         for module in modules_to_test:
-            for test_goal in module.python_test_goals:
-                task_queue.put((python_exec, test_goal))
+            if python_implementation not in module.blacklisted_python_implementations:
+                for test_goal in module.python_test_goals:
+                    task_queue.put((python_exec, test_goal))
 
     def process_queue(task_queue):
         while True:

From 866b5b9948309b48928f0b1c1d74a043c34119f2 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 09:50:45 -0700
Subject: [PATCH 06/15] Fix lazy logging warnings in Prospector checks

---
 python/run-tests.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 9090f494f76f..ea6eb5e086b5 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -56,7 +56,7 @@ def print_red(text):
 
 def run_individual_python_test(test_name, pyspark_python):
     env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
-    LOGGER.info("Starting test(%s): %s" % (pyspark_python, test_name))
+    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
     start_time = time.time()
     per_test_output = tempfile.TemporaryFile()
     retcode = subprocess.Popen(
@@ -79,7 +79,7 @@ def run_individual_python_test(test_name, pyspark_python):
         # this code is invoked from a thread other than the main thread.
         os._exit(-1)
     else:
-        LOGGER.info("Finished test(%s): %s (%is)" % (pyspark_python, test_name, duration))
+        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
 
 
 def get_default_python_executables():
@@ -118,7 +118,7 @@ def parse_opts():
 def main():
     logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")
     opts = parse_opts()
-    LOGGER.info("Running PySpark tests. Output is in python/%s" % LOG_FILE)
+    LOGGER.info("Running PySpark tests. Output is in python/%s", LOG_FILE)
     if os.path.exists(LOG_FILE):
         os.remove(LOG_FILE)
     python_execs = opts.python_executables.split(',')
@@ -129,8 +129,8 @@ def main():
         else:
             print("Error: unrecognized module %s" % module_name)
             sys.exit(-1)
-    LOGGER.info("Will test against the following Python executables: %s" % python_execs)
-    LOGGER.info("Will test the following Python modules: %s" % [x.name for x in modules_to_test])
+    LOGGER.info("Will test against the following Python executables: %s", python_execs)
+    LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test])
 
     task_queue = Queue.Queue()
     for python_exec in python_execs:
@@ -163,7 +163,7 @@ def process_queue(task_queue):
         print_red("Exiting due to interrupt")
         sys.exit(-1)
     total_duration = time.time() - start_time
-    LOGGER.info("Tests passed in %i seconds" % total_duration)
+    LOGGER.info("Tests passed in %i seconds", total_duration)
 
 
 if __name__ == "__main__":

From 5552380cd904b1d25575ea60ff88ff7333d718a4 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 10:02:33 -0700
Subject: [PATCH 07/15] Python 3 fix

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index ea6eb5e086b5..0d273550453f 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -135,7 +135,7 @@ def main():
     task_queue = Queue.Queue()
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
-            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
+            [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
         for module in modules_to_test:
             if python_implementation not in module.blacklisted_python_implementations:
                 for test_goal in module.python_test_goals:

From a2b9094469dda569f3bb94d2ed0b0ab6ccb717f5 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 12:19:03 -0700
Subject: [PATCH 08/15] Bump up parallelism.

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 0d273550453f..b173ea7a988c 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -105,7 +105,7 @@ def parse_opts():
         help="A comma-separated list of Python modules to test (default: %default)"
     )
     parser.add_option(
-        "-p", "--parallelism", type="int", default=1,
+        "-p", "--parallelism", type="int", default=4,
         help="The number of suites to test in parallel (default %default)"
     )
 

From 9e31127b3253c66869e3354c73b51c6b9d3ccdd6 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 13:41:59 -0700
Subject: [PATCH 09/15] Log Python --version output for each executable.

---
 python/run-tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/run-tests.py b/python/run-tests.py
index b173ea7a988c..deb1a9d19fd3 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -136,6 +136,8 @@ def main():
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
+        LOGGER.debug("`%s` version is: %s", python_exec, subprocess.check_output(
+            [python_exec, "--version"], stderr=subprocess.STDOUT).strip())
         for module in modules_to_test:
             if python_implementation not in module.blacklisted_python_implementations:
                 for test_goal in module.python_test_goals:

From cd13db84075c5ae828beeafa3043e7f9c0d150e9 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 13:43:30 -0700
Subject: [PATCH 10/15] Also log python_implementation

---
 python/run-tests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index deb1a9d19fd3..4012a448dfd4 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -136,7 +136,8 @@ def main():
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
-        LOGGER.debug("`%s` version is: %s", python_exec, subprocess.check_output(
+        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
+        LOGGER.debug("%s version is: %s", python_exec, subprocess.check_output(
             [python_exec, "--version"], stderr=subprocess.STDOUT).strip())
         for module in modules_to_test:
             if python_implementation not in module.blacklisted_python_implementations:

From 110cd9dd9877eb5b6934639b5e5adde15748cd57 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 26 Jun 2015 18:33:07 -0700
Subject: [PATCH 11/15] Fix universal_newlines for Python 3

---
 python/run-tests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 4012a448dfd4..a2b0ac4aeb8e 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -135,10 +135,11 @@ def main():
     task_queue = Queue.Queue()
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
-            [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
+            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
+            universal_newlines=True).strip()
         LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
         LOGGER.debug("%s version is: %s", python_exec, subprocess.check_output(
-            [python_exec, "--version"], stderr=subprocess.STDOUT).strip())
+            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
         for module in modules_to_test:
             if python_implementation not in module.blacklisted_python_implementations:
                 for test_goal in module.python_test_goals:

From a2717e1ab33f3a3ec82106be4646e5692535c410 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 29 Jun 2015 17:08:06 -0700
Subject: [PATCH 12/15] Make parallelism configurable via dev/run-tests

---
 dev/run-tests                      |  2 +-
 dev/run-tests.py                   | 24 ++++++++++++++++++++++--
 dev/sparktestsupport/shellutils.py |  1 +
 python/run-tests.py                |  2 ++
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index a00d9f0c2763..257d1e8d50bb 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -20,4 +20,4 @@
 FWDIR="$(cd "`dirname $0`"/..; pwd)"
 cd "$FWDIR"
 
-exec python -u ./dev/run-tests.py
+exec python -u ./dev/run-tests.py "$@"
diff --git a/dev/run-tests.py b/dev/run-tests.py
index b9058642205a..93234aad6875 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -19,6 +19,7 @@
 
 from __future__ import print_function
 import itertools
+from optparse import OptionParser
 import os
 import re
 import sys
@@ -360,12 +361,13 @@ def run_scala_tests(build_tool, hadoop_version, test_modules):
         run_scala_tests_sbt(test_modules, test_profiles)
 
 
-def run_python_tests(test_modules):
+def run_python_tests(test_modules, parallelism):
     set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
 
     command = [os.path.join(SPARK_HOME, "python", "run-tests")]
     if test_modules != [modules.root]:
         command.append("--modules=%s" % ','.join(m.name for m in test_modules))
+    command.append("--parallelism=%i" % parallelism)
     run_cmd(command)
 
 
@@ -379,7 +381,25 @@ def run_sparkr_tests():
         print("Ignoring SparkR tests as R was not found in PATH")
 
 
+def parse_opts():
+    parser = OptionParser(
+        prog="run-tests"
+    )
+    parser.add_option(
+        "-p", "--parallelism", type="int", default=4,
+        help="The number of suites to test in parallel (default %default)"
+    )
+
+    (opts, args) = parser.parse_args()
+    if args:
+        parser.error("Unsupported arguments: %s" % ' '.join(args))
+    if opts.parallelism < 1:
+        parser.error("Parallelism cannot be less than 1")
+    return opts
+
+
 def main():
+    opts = parse_opts()
     # Ensure the user home directory (HOME) is valid and is an absolute directory
     if not USER_HOME or not os.path.isabs(USER_HOME):
         print("[error] Cannot determine your home directory as an absolute path;",
@@ -461,7 +481,7 @@ def main():
 
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
-        run_python_tests(modules_with_python_tests)
+        run_python_tests(modules_with_python_tests, opts.parallelism)
     # if any(m.should_run_r_tests for m in test_modules):
         # run_sparkr_tests()
 
diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index ad9b0cc89e4a..12bd0bf3a4fe 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 
+from __future__ import print_function
 import os
 import shutil
 import subprocess
diff --git a/python/run-tests.py b/python/run-tests.py
index a2b0ac4aeb8e..24681d427c56 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -112,6 +112,8 @@ def parse_opts():
     (opts, args) = parser.parse_args()
     if args:
         parser.error("Unsupported arguments: %s" % ' '.join(args))
+    if opts.parallelism < 1:
+        parser.error("Parallelism cannot be less than 1")
     return opts
 
 

From d4ded7368f6c3d9c7e7c958ee2925f3867116aa9 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 29 Jun 2015 17:23:09 -0700
Subject: [PATCH 13/15] Logging improvements

---
 python/run-tests.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 24681d427c56..78bf2a8b7502 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -56,7 +56,7 @@ def print_red(text):
 
 def run_individual_python_test(test_name, pyspark_python):
     env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
-    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
+    LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
     start_time = time.time()
     per_test_output = tempfile.TemporaryFile()
     retcode = subprocess.Popen(
@@ -108,6 +108,10 @@ def parse_opts():
         "-p", "--parallelism", type="int", default=4,
         help="The number of suites to test in parallel (default %default)"
     )
+    parser.add_option(
+        "--verbose", action="store_true",
+        help="Enable additional debug logging"
+    )
 
     (opts, args) = parser.parse_args()
     if args:
@@ -118,8 +122,12 @@ def parse_opts():
 
 
 def main():
-    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")
     opts = parse_opts()
+    if (opts.verbose):
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
+    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
     LOGGER.info("Running PySpark tests. Output is in python/%s", LOG_FILE)
     if os.path.exists(LOG_FILE):
         os.remove(LOG_FILE)

From f87ea8128eaf2e0285bf27f4f693e30f87b512b9 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 29 Jun 2015 17:32:07 -0700
Subject: [PATCH 14/15] Only log output from failed tests

---
 python/run-tests.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 78bf2a8b7502..aaa35e936a80 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -50,7 +50,7 @@ def print_red(text):
 
 
 LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log")
-LOG_FILE_LOCK = Lock()
+FAILURE_REPORTING_LOCK = Lock()
 LOGGER = logging.getLogger()
 
 
@@ -63,22 +63,23 @@ def run_individual_python_test(test_name, pyspark_python):
         [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
         stderr=per_test_output, stdout=per_test_output, env=env).wait()
     duration = time.time() - start_time
-    with LOG_FILE_LOCK:
-        with open(LOG_FILE, 'ab') as log_file:
-            per_test_output.seek(0)
-            log_file.writelines(per_test_output.readlines())
-    per_test_output.close()
     # Exit on the first failure.
     if retcode != 0:
-        with open(LOG_FILE, 'r') as log_file:
-            for line in log_file:
+        with FAILURE_REPORTING_LOCK:
+            with open(LOG_FILE, 'ab') as log_file:
+                per_test_output.seek(0)
+                log_file.writelines(per_test_output.readlines())
+            per_test_output.seek(0)
+            for line in per_test_output:
                 if not re.match('[0-9]+', line):
                     print(line, end='')
-        print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
-        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
-        # this code is invoked from a thread other than the main thread.
-        os._exit(-1)
+            per_test_output.close()
+            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
+            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
+            # this code is invoked from a thread other than the main thread.
+            os._exit(-1)
     else:
+        per_test_output.close()
         LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
 
 

From feb3763df3e7a25c833f4240ccf85c45dd56711f Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 29 Jun 2015 17:35:28 -0700
Subject: [PATCH 15/15] Re-enable other tests

---
 dev/run-tests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 93234aad6875..4596e0701473 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -474,16 +474,16 @@ def main():
     build_apache_spark(build_tool, hadoop_version)
 
     # backwards compatibility checks
-    # detect_binary_inop_with_mima()
+    detect_binary_inop_with_mima()
 
     # run the test suites
-    # run_scala_tests(build_tool, hadoop_version, test_modules)
+    run_scala_tests(build_tool, hadoop_version, test_modules)
 
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
         run_python_tests(modules_with_python_tests, opts.parallelism)
-    # if any(m.should_run_r_tests for m in test_modules):
-        # run_sparkr_tests()
+    if any(m.should_run_r_tests for m in test_modules):
+        run_sparkr_tests()
 
 
 def _test():