From 92f7268eac3efb2c2ff49239807eaea693d29027 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Wed, 30 Oct 2019 13:46:42 -0700 Subject: [PATCH 01/31] remove py27 support for tests --- python/run-tests.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index b1119b044d71..21d25cc8b6a9 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -160,11 +160,15 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python2.7", "python3.6", "pypy"] if which(x)] - if "python2.7" not in python_execs: - LOGGER.warning("Not testing against `python2.7` because it could not be found; falling" - " back to `python` instead") - python_execs.insert(0, "python") + python_execs = [x for x in ["python3.6", "pypy"] if which(x)] + if "python3.6" not in python_execs: + if which('python'): + LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" + " back to `python` instead") + python_execs.insert(0, "python") + else: + LOGGER.error("No python executable found! Exiting!") + os._exit(1) return python_execs From 7c0fe469175d42867e87870a95e2a930acd58e8f Mon Sep 17 00:00:00 2001 From: shane knapp Date: Wed, 30 Oct 2019 14:14:58 -0700 Subject: [PATCH 02/31] follow double quoted convention --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index 21d25cc8b6a9..c7241671efde 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -162,7 +162,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): python_execs = [x for x in ["python3.6", "pypy"] if which(x)] if "python3.6" not in python_execs: - if which('python'): + if which("python"): LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" " back to `python` instead") python_execs.insert(0, "python") From 226c1fa2d0aae8b7af64caac3ff671ec0c57de3d Mon Sep 17 00:00:00 2001 From: shane knapp Date: Wed, 30 Oct 2019 15:14:52 -0700 Subject: [PATCH 03/31] have pip packaging tests run with py36 --- dev/run-pip-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 60cf4d820941..3ddd8cdae262 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -53,7 +53,7 @@ if hash virtualenv 2>/dev/null && [ ! -n "$USE_CONDA" ]; then fi elif hash conda 2>/dev/null; then echo "Using conda virtual environments" - PYTHON_EXECS=('3.5') + PYTHON_EXECS=('3.6') USE_CONDA=1 else echo "Missing virtualenv & conda, skipping pip installability tests" From f25c4b3e10340ac713cf8b8c3cc82ee65d9bdd76 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Wed, 30 Oct 2019 15:44:52 -0700 Subject: [PATCH 04/31] remove py27 altogether --- dev/run-pip-tests | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 3ddd8cdae262..c25ee7839a80 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -39,17 +39,12 @@ PYTHON_EXECS=() # Some systems don't have pip or virtualenv - in those cases our tests won't work. if hash virtualenv 2>/dev/null && [ ! -n "$USE_CONDA" ]; then echo "virtualenv installed - using. 
Note if this is a conda virtual env you may wish to set USE_CONDA" - # Figure out which Python execs we should test pip installation with - if hash python2 2>/dev/null; then - # We do this since we are testing with virtualenv and the default virtual env python - # is in /usr/bin/python - PYTHON_EXECS+=('python2') - elif hash python 2>/dev/null; then - # If python2 isn't installed fallback to python if available - PYTHON_EXECS+=('python') - fi + # test only against python3 if hash python3 2>/dev/null; then - PYTHON_EXECS+=('python3') + PYTHON_EXECS=('python3') + else + echo "Python3 not installed on system, skipping pip installability tests" + exit 0 fi elif hash conda 2>/dev/null; then echo "Using conda virtual environments" From b57acd9b5fe0d8df7e782b12ff92c3f01dbc9bd5 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 31 Oct 2019 12:49:26 -0700 Subject: [PATCH 05/31] testing pypy3 --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index c7241671efde..ebb407a84457 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -160,7 +160,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "pypy"] if which(x)] + python_execs = [x for x in ["python3.6", "pypy3"] if which(x)] if "python3.6" not in python_execs: if which("python"): LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" From 0b37b71167c192f726600fecd4a29b33e928c245 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 31 Oct 2019 19:06:30 -0700 Subject: [PATCH 06/31] Revert "testing pypy3" This reverts commit b57acd9b5fe0d8df7e782b12ff92c3f01dbc9bd5. --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index ebb407a84457..c7241671efde 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -160,7 +160,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "pypy3"] if which(x)] + python_execs = [x for x in ["python3.6", "pypy"] if which(x)] if "python3.6" not in python_execs: if which("python"): LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" From d0e49bbae5b350aa674286fbba3816b10867ec79 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Fri, 1 Nov 2019 10:07:21 -0700 Subject: [PATCH 07/31] fallback to python3, check if version to test against is compatible --- python/run-tests.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index c7241671efde..ecd2ff646535 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -57,6 +57,10 @@ def print_red(text): FAILURE_REPORTING_LOCK = Lock() LOGGER = logging.getLogger() +PYTHON_MAJOR_VERSION=3 +PYTHON_MINOR_VERSION=6 +PYTHON_MICRO_VERSION=0 + # Find out where the assembly jars are located. 
# TODO: revisit for Scala 2.13 for scala in ["2.12"]: @@ -161,13 +165,31 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): python_execs = [x for x in ["python3.6", "pypy"] if which(x)] + if "python3.6" not in python_execs: - if which("python"): - LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" - " back to `python` instead") - python_execs.insert(0, "python") + p = which("python3") + + if p: + py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) + py_out = py_out.stderr.decode("utf-8") + python_version = tuple([int(i) for i in re.findall("\d", py_out)]) + + if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): + LOGGER.error("Python version %s.%s.%s is lower than minimum " + "required: %s.%s.%s" % (python_version[0], + python_version[1], + python_version[2], + PYTHON_MAJOR_VERSION, + PYTHON_MINOR_VERSION, + PYTHON_MICRO_VERSION)) + LOGGER.error("Exiting!") + os._exit(1) + else: + LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" + " back to %s instead" % p) + python_execs.insert(0, "python3") else: - LOGGER.error("No python executable found! Exiting!") + LOGGER.error("No python3 executable found. Exiting!") os._exit(1) return python_execs From be57b53b922f93275796a70ca5741a5f9f7f8e4b Mon Sep 17 00:00:00 2001 From: shane knapp Date: Fri, 1 Nov 2019 10:09:40 -0700 Subject: [PATCH 08/31] use the found python3 with full path --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index ecd2ff646535..ac8ab38fe43e 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -187,7 +187,7 @@ def get_default_python_executables(): else: LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" " back to %s instead" % p) - python_execs.insert(0, "python3") + python_execs.insert(0, p) else: LOGGER.error("No python3 executable found. Exiting!") os._exit(1) From 425976f7c3c83b1f8765916a2425165aec5a952f Mon Sep 17 00:00:00 2001 From: shane knapp Date: Fri, 1 Nov 2019 10:43:54 -0700 Subject: [PATCH 09/31] python style --- python/flycheck_run-tests.py | 346 +++++++++++++++++++++++++++++++++++ python/run-tests.py | 12 +- 2 files changed, 352 insertions(+), 6 deletions(-) create mode 100644 python/flycheck_run-tests.py diff --git a/python/flycheck_run-tests.py b/python/flycheck_run-tests.py new file mode 100644 index 000000000000..f45e5787b05b --- /dev/null +++ b/python/flycheck_run-tests.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import logging +from argparse import ArgumentParser +import os +import re +import shutil +import subprocess +import sys +import tempfile +from threading import Thread, Lock +import time +import uuid +if sys.version < '3': + import Queue +else: + import queue as Queue +from multiprocessing import Manager + + +# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module +sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) + + +from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) +from sparktestsupport.shellutils import which, subprocess_check_output # noqa +from sparktestsupport.modules import all_modules, pyspark_sql # noqa + + +python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root') + + +def print_red(text): + print('\033[31m' + text + '\033[0m') + + +SKIPPED_TESTS = Manager().dict() +LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") +FAILURE_REPORTING_LOCK = Lock() +LOGGER = logging.getLogger() + +PYTHON_MAJOR_VERSION = 3 +PYTHON_MINOR_VERSION = 6 +PYTHON_MICRO_VERSION = 0 + +# Find out where the assembly jars are located. +# TODO: revisit for Scala 2.13 +for scala in ["2.12"]: + build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) + if os.path.isdir(build_dir): + SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") + break +else: + raise Exception("Cannot find assembly build directory, please build Spark first.") + + +def run_individual_python_test(target_dir, test_name, pyspark_python): + env = dict(os.environ) + env.update({ + 'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH, + 'SPARK_TESTING': '1', + 'SPARK_PREPEND_CLASSES': '1', + 'PYSPARK_PYTHON': which(pyspark_python), + 'PYSPARK_DRIVER_PYTHON': which(pyspark_python) + }) + + # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is + # recognized by the tempfile module to override the default system temp directory. + tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) + while os.path.isdir(tmp_dir): + tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) + os.mkdir(tmp_dir) + env["TMPDIR"] = tmp_dir + + # Also override the JVM's temp directory by setting driver and executor options. + spark_args = [ + "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), + "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), + "pyspark-shell" + ] + env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args) + + LOGGER.info("Starting test(%s): %s", pyspark_python, test_name) + start_time = time.time() + try: + per_test_output = tempfile.TemporaryFile() + retcode = subprocess.Popen( + [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(), + stderr=per_test_output, stdout=per_test_output, env=env).wait() + shutil.rmtree(tmp_dir, ignore_errors=True) + except: + LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python) + # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if + # this code is invoked from a thread other than the main thread. + os._exit(1) + duration = time.time() - start_time + # Exit on the first failure. 
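+    # retcode is the exit status of the pyspark subprocess launched above;
+    # anything non-zero means at least one test in this goal failed, so the
+    # captured output is appended to LOG_FILE before the whole run aborts.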
+ if retcode != 0: + try: + with FAILURE_REPORTING_LOCK: + with open(LOG_FILE, 'ab') as log_file: + per_test_output.seek(0) + log_file.writelines(per_test_output) + per_test_output.seek(0) + for line in per_test_output: + decoded_line = line.decode("utf-8", "replace") + if not re.match('[0-9]+', decoded_line): + print(decoded_line, end='') + per_test_output.close() + except: + LOGGER.exception("Got an exception while trying to print failed test output") + finally: + print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python)) + # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if + # this code is invoked from a thread other than the main thread. + os._exit(-1) + else: + skipped_counts = 0 + try: + per_test_output.seek(0) + # Here expects skipped test output from unittest when verbosity level is + # 2 (or --verbose option is enabled). + decoded_lines = map(lambda line: line.decode("utf-8", "replace"), iter(per_test_output)) + skipped_tests = list(filter( + lambda line: re.search(r'test_.* \(pyspark\..*\) ... (skip|SKIP)', line), + decoded_lines)) + skipped_counts = len(skipped_tests) + if skipped_counts > 0: + key = (pyspark_python, test_name) + SKIPPED_TESTS[key] = skipped_tests + per_test_output.close() + except: + import traceback + print_red("\nGot an exception while trying to store " + "skipped test output:\n%s" % traceback.format_exc()) + # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if + # this code is invoked from a thread other than the main thread. + os._exit(-1) + if skipped_counts != 0: + LOGGER.info( + "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name, + duration, skipped_counts) + else: + LOGGER.info( + "Finished test(%s): %s (%is)", pyspark_python, test_name, duration) + + +def get_default_python_executables(): + python_execs = [x for x in ["python3.6", "pypy"] if which(x)] + + if "python3.6" not in python_execs: + p = which("python3") + + if p: + py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) + py_out = py_out.stderr.decode("utf-8") + python_version = tuple([int(i) for i in re.findall(r"\d", py_out)]) + + if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): + LOGGER.error("Python version %s.%s.%s is lower than minimum " + "required: %s.%s.%s" % (python_version[0], + python_version[1], + python_version[2], + PYTHON_MAJOR_VERSION, + PYTHON_MINOR_VERSION, + PYTHON_MICRO_VERSION)) + LOGGER.error("Exiting!") + os._exit(1) + else: + LOGGER.warning("Not testing against `python3.6` because it could not be found; " + "falling back to %s instead" % p) + python_execs.insert(0, p) + else: + LOGGER.error("No python3 executable found. 
Exiting!") + os._exit(1) + return python_execs + + +def parse_opts(): + parser = ArgumentParser( + prog="run-tests" + ) + parser.add_argument( + "--python-executables", type=str, default=','.join(get_default_python_executables()), + help="A comma-separated list of Python executables to test against (default: %(default)s)" + ) + parser.add_argument( + "--modules", type=str, + default=",".join(sorted(python_modules.keys())), + help="A comma-separated list of Python modules to test (default: %(default)s)" + ) + parser.add_argument( + "-p", "--parallelism", type=int, default=4, + help="The number of suites to test in parallel (default %(default)d)" + ) + parser.add_argument( + "--verbose", action="store_true", + help="Enable additional debug logging" + ) + + group = parser.add_argument_group("Developer Options") + group.add_argument( + "--testnames", type=str, + default=None, + help=( + "A comma-separated list of specific modules, classes and functions of doctest " + "or unittest to test. " + "For example, 'pyspark.sql.foo' to run the module as unittests or doctests, " + "'pyspark.sql.tests FooTests' to run the specific class of unittests, " + "'pyspark.sql.tests FooTests.test_foo' to run the specific unittest in the class. " + "'--modules' option is ignored if they are given.") + ) + + args, unknown = parser.parse_known_args() + if unknown: + parser.error("Unsupported arguments: %s" % ' '.join(unknown)) + if args.parallelism < 1: + parser.error("Parallelism cannot be less than 1") + return args + + +def _check_coverage(python_exec): + # Make sure if coverage is installed. + try: + subprocess_check_output( + [python_exec, "-c", "import coverage"], + stderr=open(os.devnull, 'w')) + except: + print_red("Coverage is not installed in Python executable '%s' " + "but 'COVERAGE_PROCESS_START' environment variable is set, " + "exiting." % python_exec) + sys.exit(-1) + + +def main(): + opts = parse_opts() + if opts.verbose: + log_level = logging.DEBUG + else: + log_level = logging.INFO + should_test_modules = opts.testnames is None + logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s") + LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE) + if os.path.exists(LOG_FILE): + os.remove(LOG_FILE) + python_execs = opts.python_executables.split(',') + LOGGER.info("Will test against the following Python executables: %s", python_execs) + + if should_test_modules: + modules_to_test = [] + for module_name in opts.modules.split(','): + if module_name in python_modules: + modules_to_test.append(python_modules[module_name]) + else: + print("Error: unrecognized module '%s'. Supported modules: %s" % + (module_name, ", ".join(python_modules))) + sys.exit(-1) + LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) + else: + testnames_to_test = opts.testnames.split(',') + LOGGER.info("Will test the following Python tests: %s", testnames_to_test) + + task_queue = Queue.PriorityQueue() + for python_exec in python_execs: + # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START' + # environmental variable is set. 
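+        # (_check_coverage() below exits the run if the 'coverage' module
+        # cannot be imported by that executable.)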
+ if "COVERAGE_PROCESS_START" in os.environ: + _check_coverage(python_exec) + + python_implementation = subprocess_check_output( + [python_exec, "-c", "import platform; print(platform.python_implementation())"], + universal_newlines=True).strip() + LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation) + LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output( + [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip()) + if should_test_modules: + for module in modules_to_test: + if python_implementation not in module.blacklisted_python_implementations: + for test_goal in module.python_test_goals: + heavy_tests = ['pyspark.streaming.tests', 'pyspark.mllib.tests', + 'pyspark.tests', 'pyspark.sql.tests', 'pyspark.ml.tests'] + if any(map(lambda prefix: test_goal.startswith(prefix), heavy_tests)): + priority = 0 + else: + priority = 100 + task_queue.put((priority, (python_exec, test_goal))) + else: + for test_goal in testnames_to_test: + task_queue.put((0, (python_exec, test_goal))) + + # Create the target directory before starting tasks to avoid races. + target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target')) + if not os.path.isdir(target_dir): + os.mkdir(target_dir) + + def process_queue(task_queue): + while True: + try: + (priority, (python_exec, test_goal)) = task_queue.get_nowait() + except Queue.Empty: + break + try: + run_individual_python_test(target_dir, test_goal, python_exec) + finally: + task_queue.task_done() + + start_time = time.time() + for _ in range(opts.parallelism): + worker = Thread(target=process_queue, args=(task_queue,)) + worker.daemon = True + worker.start() + try: + task_queue.join() + except (KeyboardInterrupt, SystemExit): + print_red("Exiting due to interrupt") + sys.exit(-1) + total_duration = time.time() - start_time + LOGGER.info("Tests passed in %i seconds", total_duration) + + for key, lines in sorted(SKIPPED_TESTS.items()): + pyspark_python, test_name = key + LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python)) + for line in lines: + LOGGER.info(" %s" % line.rstrip()) + + +if __name__ == "__main__": + main() diff --git a/python/run-tests.py b/python/run-tests.py index ac8ab38fe43e..f45e5787b05b 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -57,9 +57,9 @@ def print_red(text): FAILURE_REPORTING_LOCK = Lock() LOGGER = logging.getLogger() -PYTHON_MAJOR_VERSION=3 -PYTHON_MINOR_VERSION=6 -PYTHON_MICRO_VERSION=0 +PYTHON_MAJOR_VERSION = 3 +PYTHON_MINOR_VERSION = 6 +PYTHON_MICRO_VERSION = 0 # Find out where the assembly jars are located. # TODO: revisit for Scala 2.13 @@ -172,7 +172,7 @@ def get_default_python_executables(): if p: py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) py_out = py_out.stderr.decode("utf-8") - python_version = tuple([int(i) for i in re.findall("\d", py_out)]) + python_version = tuple([int(i) for i in re.findall(r"\d", py_out)]) if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): LOGGER.error("Python version %s.%s.%s is lower than minimum " @@ -185,8 +185,8 @@ def get_default_python_executables(): LOGGER.error("Exiting!") os._exit(1) else: - LOGGER.warning("Not testing against `python3.6` because it could not be found; falling" - " back to %s instead" % p) + LOGGER.warning("Not testing against `python3.6` because it could not be found; " + "falling back to %s instead" % p) python_execs.insert(0, p) else: LOGGER.error("No python3 executable found. 
Exiting!") From 074947b274fb4e7ace9a405bcd1dbc9b95ee89d4 Mon Sep 17 00:00:00 2001 From: shane Date: Fri, 1 Nov 2019 10:46:16 -0700 Subject: [PATCH 10/31] Delete flycheck_run-tests.py no clue how this got here --- python/flycheck_run-tests.py | 346 ----------------------------------- 1 file changed, 346 deletions(-) delete mode 100644 python/flycheck_run-tests.py diff --git a/python/flycheck_run-tests.py b/python/flycheck_run-tests.py deleted file mode 100644 index f45e5787b05b..000000000000 --- a/python/flycheck_run-tests.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import logging -from argparse import ArgumentParser -import os -import re -import shutil -import subprocess -import sys -import tempfile -from threading import Thread, Lock -import time -import uuid -if sys.version < '3': - import Queue -else: - import queue as Queue -from multiprocessing import Manager - - -# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module -sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) - - -from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) -from sparktestsupport.shellutils import which, subprocess_check_output # noqa -from sparktestsupport.modules import all_modules, pyspark_sql # noqa - - -python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root') - - -def print_red(text): - print('\033[31m' + text + '\033[0m') - - -SKIPPED_TESTS = Manager().dict() -LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") -FAILURE_REPORTING_LOCK = Lock() -LOGGER = logging.getLogger() - -PYTHON_MAJOR_VERSION = 3 -PYTHON_MINOR_VERSION = 6 -PYTHON_MICRO_VERSION = 0 - -# Find out where the assembly jars are located. -# TODO: revisit for Scala 2.13 -for scala in ["2.12"]: - build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) - if os.path.isdir(build_dir): - SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") - break -else: - raise Exception("Cannot find assembly build directory, please build Spark first.") - - -def run_individual_python_test(target_dir, test_name, pyspark_python): - env = dict(os.environ) - env.update({ - 'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH, - 'SPARK_TESTING': '1', - 'SPARK_PREPEND_CLASSES': '1', - 'PYSPARK_PYTHON': which(pyspark_python), - 'PYSPARK_DRIVER_PYTHON': which(pyspark_python) - }) - - # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is - # recognized by the tempfile module to override the default system temp directory. 
- tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - while os.path.isdir(tmp_dir): - tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - os.mkdir(tmp_dir) - env["TMPDIR"] = tmp_dir - - # Also override the JVM's temp directory by setting driver and executor options. - spark_args = [ - "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "pyspark-shell" - ] - env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args) - - LOGGER.info("Starting test(%s): %s", pyspark_python, test_name) - start_time = time.time() - try: - per_test_output = tempfile.TemporaryFile() - retcode = subprocess.Popen( - [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(), - stderr=per_test_output, stdout=per_test_output, env=env).wait() - shutil.rmtree(tmp_dir, ignore_errors=True) - except: - LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(1) - duration = time.time() - start_time - # Exit on the first failure. - if retcode != 0: - try: - with FAILURE_REPORTING_LOCK: - with open(LOG_FILE, 'ab') as log_file: - per_test_output.seek(0) - log_file.writelines(per_test_output) - per_test_output.seek(0) - for line in per_test_output: - decoded_line = line.decode("utf-8", "replace") - if not re.match('[0-9]+', decoded_line): - print(decoded_line, end='') - per_test_output.close() - except: - LOGGER.exception("Got an exception while trying to print failed test output") - finally: - print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python)) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - else: - skipped_counts = 0 - try: - per_test_output.seek(0) - # Here expects skipped test output from unittest when verbosity level is - # 2 (or --verbose option is enabled). - decoded_lines = map(lambda line: line.decode("utf-8", "replace"), iter(per_test_output)) - skipped_tests = list(filter( - lambda line: re.search(r'test_.* \(pyspark\..*\) ... (skip|SKIP)', line), - decoded_lines)) - skipped_counts = len(skipped_tests) - if skipped_counts > 0: - key = (pyspark_python, test_name) - SKIPPED_TESTS[key] = skipped_tests - per_test_output.close() - except: - import traceback - print_red("\nGot an exception while trying to store " - "skipped test output:\n%s" % traceback.format_exc()) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - if skipped_counts != 0: - LOGGER.info( - "Finished test(%s): %s (%is) ... 
%s tests were skipped", pyspark_python, test_name, - duration, skipped_counts) - else: - LOGGER.info( - "Finished test(%s): %s (%is)", pyspark_python, test_name, duration) - - -def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "pypy"] if which(x)] - - if "python3.6" not in python_execs: - p = which("python3") - - if p: - py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) - py_out = py_out.stderr.decode("utf-8") - python_version = tuple([int(i) for i in re.findall(r"\d", py_out)]) - - if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): - LOGGER.error("Python version %s.%s.%s is lower than minimum " - "required: %s.%s.%s" % (python_version[0], - python_version[1], - python_version[2], - PYTHON_MAJOR_VERSION, - PYTHON_MINOR_VERSION, - PYTHON_MICRO_VERSION)) - LOGGER.error("Exiting!") - os._exit(1) - else: - LOGGER.warning("Not testing against `python3.6` because it could not be found; " - "falling back to %s instead" % p) - python_execs.insert(0, p) - else: - LOGGER.error("No python3 executable found. Exiting!") - os._exit(1) - return python_execs - - -def parse_opts(): - parser = ArgumentParser( - prog="run-tests" - ) - parser.add_argument( - "--python-executables", type=str, default=','.join(get_default_python_executables()), - help="A comma-separated list of Python executables to test against (default: %(default)s)" - ) - parser.add_argument( - "--modules", type=str, - default=",".join(sorted(python_modules.keys())), - help="A comma-separated list of Python modules to test (default: %(default)s)" - ) - parser.add_argument( - "-p", "--parallelism", type=int, default=4, - help="The number of suites to test in parallel (default %(default)d)" - ) - parser.add_argument( - "--verbose", action="store_true", - help="Enable additional debug logging" - ) - - group = parser.add_argument_group("Developer Options") - group.add_argument( - "--testnames", type=str, - default=None, - help=( - "A comma-separated list of specific modules, classes and functions of doctest " - "or unittest to test. " - "For example, 'pyspark.sql.foo' to run the module as unittests or doctests, " - "'pyspark.sql.tests FooTests' to run the specific class of unittests, " - "'pyspark.sql.tests FooTests.test_foo' to run the specific unittest in the class. " - "'--modules' option is ignored if they are given.") - ) - - args, unknown = parser.parse_known_args() - if unknown: - parser.error("Unsupported arguments: %s" % ' '.join(unknown)) - if args.parallelism < 1: - parser.error("Parallelism cannot be less than 1") - return args - - -def _check_coverage(python_exec): - # Make sure if coverage is installed. - try: - subprocess_check_output( - [python_exec, "-c", "import coverage"], - stderr=open(os.devnull, 'w')) - except: - print_red("Coverage is not installed in Python executable '%s' " - "but 'COVERAGE_PROCESS_START' environment variable is set, " - "exiting." % python_exec) - sys.exit(-1) - - -def main(): - opts = parse_opts() - if opts.verbose: - log_level = logging.DEBUG - else: - log_level = logging.INFO - should_test_modules = opts.testnames is None - logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s") - LOGGER.info("Running PySpark tests. 
Output is in %s", LOG_FILE) - if os.path.exists(LOG_FILE): - os.remove(LOG_FILE) - python_execs = opts.python_executables.split(',') - LOGGER.info("Will test against the following Python executables: %s", python_execs) - - if should_test_modules: - modules_to_test = [] - for module_name in opts.modules.split(','): - if module_name in python_modules: - modules_to_test.append(python_modules[module_name]) - else: - print("Error: unrecognized module '%s'. Supported modules: %s" % - (module_name, ", ".join(python_modules))) - sys.exit(-1) - LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) - else: - testnames_to_test = opts.testnames.split(',') - LOGGER.info("Will test the following Python tests: %s", testnames_to_test) - - task_queue = Queue.PriorityQueue() - for python_exec in python_execs: - # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START' - # environmental variable is set. - if "COVERAGE_PROCESS_START" in os.environ: - _check_coverage(python_exec) - - python_implementation = subprocess_check_output( - [python_exec, "-c", "import platform; print(platform.python_implementation())"], - universal_newlines=True).strip() - LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation) - LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output( - [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip()) - if should_test_modules: - for module in modules_to_test: - if python_implementation not in module.blacklisted_python_implementations: - for test_goal in module.python_test_goals: - heavy_tests = ['pyspark.streaming.tests', 'pyspark.mllib.tests', - 'pyspark.tests', 'pyspark.sql.tests', 'pyspark.ml.tests'] - if any(map(lambda prefix: test_goal.startswith(prefix), heavy_tests)): - priority = 0 - else: - priority = 100 - task_queue.put((priority, (python_exec, test_goal))) - else: - for test_goal in testnames_to_test: - task_queue.put((0, (python_exec, test_goal))) - - # Create the target directory before starting tasks to avoid races. 
- target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target')) - if not os.path.isdir(target_dir): - os.mkdir(target_dir) - - def process_queue(task_queue): - while True: - try: - (priority, (python_exec, test_goal)) = task_queue.get_nowait() - except Queue.Empty: - break - try: - run_individual_python_test(target_dir, test_goal, python_exec) - finally: - task_queue.task_done() - - start_time = time.time() - for _ in range(opts.parallelism): - worker = Thread(target=process_queue, args=(task_queue,)) - worker.daemon = True - worker.start() - try: - task_queue.join() - except (KeyboardInterrupt, SystemExit): - print_red("Exiting due to interrupt") - sys.exit(-1) - total_duration = time.time() - start_time - LOGGER.info("Tests passed in %i seconds", total_duration) - - for key, lines in sorted(SKIPPED_TESTS.items()): - pyspark_python, test_name = key - LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python)) - for line in lines: - LOGGER.info(" %s" % line.rstrip()) - - -if __name__ == "__main__": - main() From 9b52677e178374c5e917a2f7006b55d205e10eb9 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 5 Nov 2019 10:34:51 -0800 Subject: [PATCH 11/31] move python version check to bash wrapper --- python/run-tests | 8 +++++++- python/run-tests.py | 28 ++-------------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/python/run-tests b/python/run-tests index 24949657ed7a..b8c64d8a295a 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,4 +21,10 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -exec python -u ./python/run-tests.py "$@" +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') +if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then + echo "Python versions prior to 3.6 are not supported." + exit -1 +fi + +exec python3 -u ./python/run-tests.py "$@" diff --git a/python/run-tests.py b/python/run-tests.py index f45e5787b05b..467469cbf6b2 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -57,10 +57,6 @@ def print_red(text): FAILURE_REPORTING_LOCK = Lock() LOGGER = logging.getLogger() -PYTHON_MAJOR_VERSION = 3 -PYTHON_MINOR_VERSION = 6 -PYTHON_MICRO_VERSION = 0 - # Find out where the assembly jars are located. # TODO: revisit for Scala 2.13 for scala in ["2.12"]: @@ -168,27 +164,7 @@ def get_default_python_executables(): if "python3.6" not in python_execs: p = which("python3") - - if p: - py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) - py_out = py_out.stderr.decode("utf-8") - python_version = tuple([int(i) for i in re.findall(r"\d", py_out)]) - - if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): - LOGGER.error("Python version %s.%s.%s is lower than minimum " - "required: %s.%s.%s" % (python_version[0], - python_version[1], - python_version[2], - PYTHON_MAJOR_VERSION, - PYTHON_MINOR_VERSION, - PYTHON_MICRO_VERSION)) - LOGGER.error("Exiting!") - os._exit(1) - else: - LOGGER.warning("Not testing against `python3.6` because it could not be found; " - "falling back to %s instead" % p) - python_execs.insert(0, p) - else: + if not p: LOGGER.error("No python3 executable found. 
Exiting!") os._exit(1) return python_execs From d3226d24c18bcd49180f454cf56ae80a6701f901 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 5 Nov 2019 10:36:30 -0800 Subject: [PATCH 12/31] bump test running infra to 3.6 min --- dev/run-tests | 6 +++--- dev/run-tests-jenkins | 6 +++--- dev/run-tests-jenkins.py | 2 +- dev/run-tests.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 9cf93d000d0e..bf4b46eb1e65 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,10 +20,10 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then - echo "Python versions prior to 2.7 are not supported." + echo "Python versions prior to 3.6 are not supported." exit -1 fi -exec python -u ./dev/run-tests.py "$@" +exec python3 -u ./dev/run-tests.py "$@" diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 5bc03e41d1f2..27beb0352212 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -25,10 +25,10 @@ FWDIR="$( cd "$( dirname "$0" )/.." && pwd )" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then - echo "Python versions prior to 2.7 are not supported." + echo "Python versions prior to 3.6 are not supported." exit -1 fi -exec python -u ./dev/run-tests-jenkins.py "$@" +exec python3 -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index e9b0b327603b..6ffe7d8cd136 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/dev/run-tests.py b/dev/run-tests.py index ea515708124d..7a1729c66474 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Licensed to the Apache Software Foundation (ASF) under one or more From a8a249e491b6e38d0f0dbe8b02b81b9574384ad4 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 5 Nov 2019 10:36:51 -0800 Subject: [PATCH 13/31] pesky flycheck wtf --- python/flycheck_run-tests.py | 346 ----------------------------------- 1 file changed, 346 deletions(-) delete mode 100644 python/flycheck_run-tests.py diff --git a/python/flycheck_run-tests.py b/python/flycheck_run-tests.py deleted file mode 100644 index f45e5787b05b..000000000000 --- a/python/flycheck_run-tests.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import logging -from argparse import ArgumentParser -import os -import re -import shutil -import subprocess -import sys -import tempfile -from threading import Thread, Lock -import time -import uuid -if sys.version < '3': - import Queue -else: - import queue as Queue -from multiprocessing import Manager - - -# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module -sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) - - -from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) -from sparktestsupport.shellutils import which, subprocess_check_output # noqa -from sparktestsupport.modules import all_modules, pyspark_sql # noqa - - -python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root') - - -def print_red(text): - print('\033[31m' + text + '\033[0m') - - -SKIPPED_TESTS = Manager().dict() -LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") -FAILURE_REPORTING_LOCK = Lock() -LOGGER = logging.getLogger() - -PYTHON_MAJOR_VERSION = 3 -PYTHON_MINOR_VERSION = 6 -PYTHON_MICRO_VERSION = 0 - -# Find out where the assembly jars are located. -# TODO: revisit for Scala 2.13 -for scala in ["2.12"]: - build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) - if os.path.isdir(build_dir): - SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") - break -else: - raise Exception("Cannot find assembly build directory, please build Spark first.") - - -def run_individual_python_test(target_dir, test_name, pyspark_python): - env = dict(os.environ) - env.update({ - 'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH, - 'SPARK_TESTING': '1', - 'SPARK_PREPEND_CLASSES': '1', - 'PYSPARK_PYTHON': which(pyspark_python), - 'PYSPARK_DRIVER_PYTHON': which(pyspark_python) - }) - - # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is - # recognized by the tempfile module to override the default system temp directory. - tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - while os.path.isdir(tmp_dir): - tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - os.mkdir(tmp_dir) - env["TMPDIR"] = tmp_dir - - # Also override the JVM's temp directory by setting driver and executor options. - spark_args = [ - "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "pyspark-shell" - ] - env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args) - - LOGGER.info("Starting test(%s): %s", pyspark_python, test_name) - start_time = time.time() - try: - per_test_output = tempfile.TemporaryFile() - retcode = subprocess.Popen( - [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(), - stderr=per_test_output, stdout=per_test_output, env=env).wait() - shutil.rmtree(tmp_dir, ignore_errors=True) - except: - LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(1) - duration = time.time() - start_time - # Exit on the first failure. 
- if retcode != 0: - try: - with FAILURE_REPORTING_LOCK: - with open(LOG_FILE, 'ab') as log_file: - per_test_output.seek(0) - log_file.writelines(per_test_output) - per_test_output.seek(0) - for line in per_test_output: - decoded_line = line.decode("utf-8", "replace") - if not re.match('[0-9]+', decoded_line): - print(decoded_line, end='') - per_test_output.close() - except: - LOGGER.exception("Got an exception while trying to print failed test output") - finally: - print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python)) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - else: - skipped_counts = 0 - try: - per_test_output.seek(0) - # Here expects skipped test output from unittest when verbosity level is - # 2 (or --verbose option is enabled). - decoded_lines = map(lambda line: line.decode("utf-8", "replace"), iter(per_test_output)) - skipped_tests = list(filter( - lambda line: re.search(r'test_.* \(pyspark\..*\) ... (skip|SKIP)', line), - decoded_lines)) - skipped_counts = len(skipped_tests) - if skipped_counts > 0: - key = (pyspark_python, test_name) - SKIPPED_TESTS[key] = skipped_tests - per_test_output.close() - except: - import traceback - print_red("\nGot an exception while trying to store " - "skipped test output:\n%s" % traceback.format_exc()) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - if skipped_counts != 0: - LOGGER.info( - "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name, - duration, skipped_counts) - else: - LOGGER.info( - "Finished test(%s): %s (%is)", pyspark_python, test_name, duration) - - -def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "pypy"] if which(x)] - - if "python3.6" not in python_execs: - p = which("python3") - - if p: - py_out = subprocess.run([p, "--version"], stderr=subprocess.PIPE) - py_out = py_out.stderr.decode("utf-8") - python_version = tuple([int(i) for i in re.findall(r"\d", py_out)]) - - if python_version < (PYTHON_MAJOR_VERSION, PYTHON_MINOR_VERSION, PYTHON_MICRO_VERSION): - LOGGER.error("Python version %s.%s.%s is lower than minimum " - "required: %s.%s.%s" % (python_version[0], - python_version[1], - python_version[2], - PYTHON_MAJOR_VERSION, - PYTHON_MINOR_VERSION, - PYTHON_MICRO_VERSION)) - LOGGER.error("Exiting!") - os._exit(1) - else: - LOGGER.warning("Not testing against `python3.6` because it could not be found; " - "falling back to %s instead" % p) - python_execs.insert(0, p) - else: - LOGGER.error("No python3 executable found. 
Exiting!") - os._exit(1) - return python_execs - - -def parse_opts(): - parser = ArgumentParser( - prog="run-tests" - ) - parser.add_argument( - "--python-executables", type=str, default=','.join(get_default_python_executables()), - help="A comma-separated list of Python executables to test against (default: %(default)s)" - ) - parser.add_argument( - "--modules", type=str, - default=",".join(sorted(python_modules.keys())), - help="A comma-separated list of Python modules to test (default: %(default)s)" - ) - parser.add_argument( - "-p", "--parallelism", type=int, default=4, - help="The number of suites to test in parallel (default %(default)d)" - ) - parser.add_argument( - "--verbose", action="store_true", - help="Enable additional debug logging" - ) - - group = parser.add_argument_group("Developer Options") - group.add_argument( - "--testnames", type=str, - default=None, - help=( - "A comma-separated list of specific modules, classes and functions of doctest " - "or unittest to test. " - "For example, 'pyspark.sql.foo' to run the module as unittests or doctests, " - "'pyspark.sql.tests FooTests' to run the specific class of unittests, " - "'pyspark.sql.tests FooTests.test_foo' to run the specific unittest in the class. " - "'--modules' option is ignored if they are given.") - ) - - args, unknown = parser.parse_known_args() - if unknown: - parser.error("Unsupported arguments: %s" % ' '.join(unknown)) - if args.parallelism < 1: - parser.error("Parallelism cannot be less than 1") - return args - - -def _check_coverage(python_exec): - # Make sure if coverage is installed. - try: - subprocess_check_output( - [python_exec, "-c", "import coverage"], - stderr=open(os.devnull, 'w')) - except: - print_red("Coverage is not installed in Python executable '%s' " - "but 'COVERAGE_PROCESS_START' environment variable is set, " - "exiting." % python_exec) - sys.exit(-1) - - -def main(): - opts = parse_opts() - if opts.verbose: - log_level = logging.DEBUG - else: - log_level = logging.INFO - should_test_modules = opts.testnames is None - logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s") - LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE) - if os.path.exists(LOG_FILE): - os.remove(LOG_FILE) - python_execs = opts.python_executables.split(',') - LOGGER.info("Will test against the following Python executables: %s", python_execs) - - if should_test_modules: - modules_to_test = [] - for module_name in opts.modules.split(','): - if module_name in python_modules: - modules_to_test.append(python_modules[module_name]) - else: - print("Error: unrecognized module '%s'. Supported modules: %s" % - (module_name, ", ".join(python_modules))) - sys.exit(-1) - LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) - else: - testnames_to_test = opts.testnames.split(',') - LOGGER.info("Will test the following Python tests: %s", testnames_to_test) - - task_queue = Queue.PriorityQueue() - for python_exec in python_execs: - # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START' - # environmental variable is set. 
- if "COVERAGE_PROCESS_START" in os.environ: - _check_coverage(python_exec) - - python_implementation = subprocess_check_output( - [python_exec, "-c", "import platform; print(platform.python_implementation())"], - universal_newlines=True).strip() - LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation) - LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output( - [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip()) - if should_test_modules: - for module in modules_to_test: - if python_implementation not in module.blacklisted_python_implementations: - for test_goal in module.python_test_goals: - heavy_tests = ['pyspark.streaming.tests', 'pyspark.mllib.tests', - 'pyspark.tests', 'pyspark.sql.tests', 'pyspark.ml.tests'] - if any(map(lambda prefix: test_goal.startswith(prefix), heavy_tests)): - priority = 0 - else: - priority = 100 - task_queue.put((priority, (python_exec, test_goal))) - else: - for test_goal in testnames_to_test: - task_queue.put((0, (python_exec, test_goal))) - - # Create the target directory before starting tasks to avoid races. - target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target')) - if not os.path.isdir(target_dir): - os.mkdir(target_dir) - - def process_queue(task_queue): - while True: - try: - (priority, (python_exec, test_goal)) = task_queue.get_nowait() - except Queue.Empty: - break - try: - run_individual_python_test(target_dir, test_goal, python_exec) - finally: - task_queue.task_done() - - start_time = time.time() - for _ in range(opts.parallelism): - worker = Thread(target=process_queue, args=(task_queue,)) - worker.daemon = True - worker.start() - try: - task_queue.join() - except (KeyboardInterrupt, SystemExit): - print_red("Exiting due to interrupt") - sys.exit(-1) - total_duration = time.time() - start_time - LOGGER.info("Tests passed in %i seconds", total_duration) - - for key, lines in sorted(SKIPPED_TESTS.items()): - pyspark_python, test_name = key - LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python)) - for line in lines: - LOGGER.info(" %s" % line.rstrip()) - - -if __name__ == "__main__": - main() From 8441af0506d76c8351d8decfe37d49425aee04cf Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 5 Nov 2019 12:32:49 -0800 Subject: [PATCH 14/31] need to set PATH here otherwise it will fail the build --- dev/run-tests-jenkins | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 27beb0352212..c3adc696a512 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -25,7 +25,9 @@ FWDIR="$( cd "$( dirname "$0" )/.." && pwd )" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (3, 6, 0))') +export PATH=/home/anaconda/envs/py36/bin:$PATH + +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then echo "Python versions prior to 3.6 are not supported." 
exit -1 From cab1683df562ec716a33b3eafe274db2520bed35 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Tue, 5 Nov 2019 12:33:13 -0800 Subject: [PATCH 15/31] s/python/python3 --- dev/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests b/dev/run-tests index bf4b46eb1e65..143d78ec6373 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,7 +20,7 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (3, 6, 0))') +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then echo "Python versions prior to 3.6 are not supported." exit -1 From a5b056b10b73dcf6c1aaafea65acca87ec03f1ea Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 7 Nov 2019 11:10:40 -0800 Subject: [PATCH 16/31] remove __future__.print_function import as it's not necessary and try to fix build output --- dev/pip-sanity-check.py | 2 -- dev/run-tests-jenkins.py | 1 - dev/run-tests.py | 1 - dev/sparktestsupport/shellutils.py | 1 - python/run-tests.py | 1 - 5 files changed, 6 deletions(-) diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 4171f28684d5..e9f10233b12b 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -15,8 +15,6 @@ # limitations under the License. # -from __future__ import print_function - from pyspark.sql import SparkSession from pyspark.mllib.linalg import * import sys diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 6ffe7d8cd136..5429aeba8ea1 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -17,7 +17,6 @@ # limitations under the License. # -from __future__ import print_function import os import sys import json diff --git a/dev/run-tests.py b/dev/run-tests.py index 7a1729c66474..e8f1bd0b371f 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -17,7 +17,6 @@ # limitations under the License. # -from __future__ import print_function import itertools from argparse import ArgumentParser import os diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index ec6ea86269f5..f9c800b77764 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -15,7 +15,6 @@ # limitations under the License. # -from __future__ import print_function import os import shutil import subprocess diff --git a/python/run-tests.py b/python/run-tests.py index 467469cbf6b2..bf752b111351 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -17,7 +17,6 @@ # limitations under the License. 
# -from __future__ import print_function import logging from argparse import ArgumentParser import os From aa893c60b2af080d887468042c8a2c403b9d4100 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 7 Nov 2019 13:37:57 -0800 Subject: [PATCH 17/31] python 3.5+ should use subprocess.run, also might fix bytestring encoding --- dev/sparktestsupport/shellutils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index f9c800b77764..756137847a1c 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -21,8 +21,6 @@ import sys subprocess_check_output = subprocess.check_output -subprocess_check_call = subprocess.check_call - def exit_from_command_with_retcode(cmd, retcode): if retcode < 0: @@ -56,7 +54,7 @@ def run_cmd(cmd, return_output=False): if return_output: return subprocess_check_output(cmd).decode(sys.getdefaultencoding()) else: - return subprocess_check_call(cmd) + return subprocess.run(cmd, encoding=sys.getdefaultencoding()).returncode except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) From 3036fa66826bac32d28cd9b8f39a50a32872393a Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 7 Nov 2019 13:54:32 -0800 Subject: [PATCH 18/31] pycodestyle --- dev/sparktestsupport/shellutils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 756137847a1c..cb83f1d81b87 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -22,6 +22,7 @@ subprocess_check_output = subprocess.check_output + def exit_from_command_with_retcode(cmd, retcode): if retcode < 0: print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) From 99a945bc517c4f54d5a8d235a076e5e763ef13c1 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 7 Nov 2019 14:45:23 -0800 Subject: [PATCH 19/31] check=True needed --- dev/sparktestsupport/shellutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index cb83f1d81b87..7a67093755d5 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -55,7 +55,7 @@ def run_cmd(cmd, return_output=False): if return_output: return subprocess_check_output(cmd).decode(sys.getdefaultencoding()) else: - return subprocess.run(cmd, encoding=sys.getdefaultencoding()).returncode + return subprocess.run(cmd, check=True, encoding=sys.getdefaultencoding()).returncode except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) From 4986769ea094c97d06f2d328dc086982acd23229 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Thu, 7 Nov 2019 15:25:21 -0800 Subject: [PATCH 20/31] think i found a way to decode the bytestring --- dev/sparktestsupport/shellutils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 7a67093755d5..e28379224f91 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -55,7 +55,12 @@ def run_cmd(cmd, return_output=False): if return_output: return subprocess_check_output(cmd).decode(sys.getdefaultencoding()) else: - return subprocess.run(cmd, check=True, encoding=sys.getdefaultencoding()).returncode + popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) + for stdout_line 
From 4986769ea094c97d06f2d328dc086982acd23229 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 15:25:21 -0800
Subject: [PATCH 20/31] think i found a way to decode the bytestring

---
 dev/sparktestsupport/shellutils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 7a67093755d5..e28379224f91 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -55,7 +55,12 @@ def run_cmd(cmd, return_output=False):
         if return_output:
             return subprocess_check_output(cmd).decode(sys.getdefaultencoding())
         else:
-            return subprocess.run(cmd, check=True, encoding=sys.getdefaultencoding()).returncode
+            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
+            for stdout_line in popen.stdout:
+                print(stdout_line.decode(sys.getdefaultencoding()), end='')
+            popen.stdout.close()
+            return_code = popen.wait()
+            return return_code
     except subprocess.CalledProcessError as e:
         exit_from_command_with_retcode(e.cmd, e.returncode)

From 465acf183b30cc971ae9fefdc40861cad40a2d34 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 16:30:43 -0800
Subject: [PATCH 21/31] encode/recode

---
 dev/sparktestsupport/shellutils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index e28379224f91..807b6cddb6f2 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -57,7 +57,8 @@ def run_cmd(cmd, return_output=False):
         else:
             popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
             for stdout_line in popen.stdout:
-                print(stdout_line.decode(sys.getdefaultencoding()), end='')
+                line = stdout_line.encode(sys.getdefaultencoding())
+                print(line.decode(sys.getdefaultencoding()), end='')
             popen.stdout.close()
             return_code = popen.wait()
             return return_code

From b66b3894c9a0f47e55cbada7b13255cbbb96de8a Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 17:02:17 -0800
Subject: [PATCH 22/31] reading docs helps

---
 dev/sparktestsupport/shellutils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 807b6cddb6f2..29773b1bfe35 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -55,10 +55,9 @@ def run_cmd(cmd, return_output=False):
         if return_output:
             return subprocess_check_output(cmd).decode(sys.getdefaultencoding())
         else:
-            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
+            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding=sys.getdefaultencoding())
             for stdout_line in popen.stdout:
-                line = stdout_line.encode(sys.getdefaultencoding())
-                print(line.decode(sys.getdefaultencoding()), end='')
+                print(stdout_line.decode(sys.getdefaultencoding()), end='')
             popen.stdout.close()
             return_code = popen.wait()
             return return_code

From 64ccb8102dff674ab0d359797f18edebe34b7ef9 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 17:06:00 -0800
Subject: [PATCH 23/31] reading docs helps even more

---
 dev/sparktestsupport/shellutils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 29773b1bfe35..8ab784480c60 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -55,7 +55,7 @@ def run_cmd(cmd, return_output=False):
         if return_output:
             return subprocess_check_output(cmd).decode(sys.getdefaultencoding())
         else:
-            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding=sys.getdefaultencoding())
+            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE)
             for stdout_line in popen.stdout:
                 print(stdout_line.decode(sys.getdefaultencoding()), end='')
             popen.stdout.close()
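Note: patches 20-23 trip over the fact that Python 3 offers mutually exclusive ways to
read a child's stdout: leave the pipe in bytes mode and decode each line yourself, or
pass universal_newlines=True / encoding=... and receive str, on which .decode() raises
AttributeError. A short sketch of both working variants (echo is just a stand-in command):

    import subprocess

    # Bytes pipe: each line arrives as bytes and must be decoded by hand.
    p = subprocess.Popen(["echo", "bytes mode"], stdout=subprocess.PIPE)
    for raw_line in p.stdout:
        print(raw_line.decode("utf-8"), end="")
    p.wait()

    # Text pipe (Python 3.6+): Popen decodes, so lines arrive as str.
    p = subprocess.Popen(["echo", "text mode"], stdout=subprocess.PIPE,
                         encoding="utf-8")
    for line in p.stdout:
        print(line, end="")  # already str; no .decode() needed or possible
    p.wait()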
From 16ae236d0748abd5cbc9223480c7a3531a20e183 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 17:45:23 -0800
Subject: [PATCH 24/31] whack-a-mole with bytestring decoding

---
 dev/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index e8f1bd0b371f..d08e30f8785a 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -264,7 +264,7 @@ def exec_sbt(sbt_args=()):
     echo_proc.wait()
     for line in iter(sbt_proc.stdout.readline, b''):
         if not sbt_output_filter.match(line):
-            print(line, end='')
+            print(line.decode(sys.getdefaultencoding()), end='')
     retcode = sbt_proc.wait()
 
     if retcode != 0:

From 5b61ade3266d7af3418edacdf004a19c06e3a5c1 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 19:21:04 -0800
Subject: [PATCH 25/31] remove pypy2.5.1

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index bf752b111351..ebf6b63f64f4 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -159,7 +159,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
 
 
 def get_default_python_executables():
-    python_execs = [x for x in ["python3.6", "pypy"] if which(x)]
+    python_execs = [x for x in ["python3.6"] if which(x)]
     if "python3.6" not in python_execs:
         p = which("python3")

From 3bfd18c05ba6cc6265050cbdf32a2d33716edc30 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 20:29:27 -0800
Subject: [PATCH 26/31] specify encoding rather than system default

---
 dev/run-tests.py                   | 2 +-
 dev/sparktestsupport/shellutils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index d08e30f8785a..d82d15caba45 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -264,7 +264,7 @@ def exec_sbt(sbt_args=()):
     echo_proc.wait()
     for line in iter(sbt_proc.stdout.readline, b''):
         if not sbt_output_filter.match(line):
-            print(line.decode(sys.getdefaultencoding()), end='')
+            print(line.decode('utf-8', end='')
     retcode = sbt_proc.wait()
 
     if retcode != 0:

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 8ab784480c60..b52a43db4e6c 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -53,11 +53,11 @@ def run_cmd(cmd, return_output=False):
         cmd = cmd.split()
     try:
         if return_output:
-            return subprocess_check_output(cmd).decode(sys.getdefaultencoding())
+            return subprocess_check_output(cmd).decode('utf-8')
         else:
             popen = subprocess.Popen(cmd, stdout=subprocess.PIPE)
             for stdout_line in popen.stdout:
-                print(stdout_line.decode(sys.getdefaultencoding()), end='')
+                print(stdout_line.decode('utf-8', end='')
             popen.stdout.close()
             return_code = popen.wait()
             return return_code

From 137371d434f3c536c331a01dbb00a7929f27f9f8 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Thu, 7 Nov 2019 20:39:51 -0800
Subject: [PATCH 27/31] parens ftw

---
 dev/run-tests.py                   | 2 +-
 dev/sparktestsupport/shellutils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index d82d15caba45..82277720bb52 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -264,7 +264,7 @@ def exec_sbt(sbt_args=()):
     echo_proc.wait()
     for line in iter(sbt_proc.stdout.readline, b''):
         if not sbt_output_filter.match(line):
-            print(line.decode('utf-8', end='')
+            print(line.decode('utf-8'), end='')
     retcode = sbt_proc.wait()
 
     if retcode != 0:

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index b52a43db4e6c..cd2c160e0580 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -57,7 +57,7 @@ def run_cmd(cmd, return_output=False):
         else:
             popen = subprocess.Popen(cmd, stdout=subprocess.PIPE)
             for stdout_line in popen.stdout:
-                print(stdout_line.decode('utf-8', end='')
+                print(stdout_line.decode('utf-8'), end='')
             popen.stdout.close()
             return_code = popen.wait()
             return return_code
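Note: on CPython 3, sys.getdefaultencoding() is effectively always "utf-8", so patch 26
trades an indirection for an explicit codec rather than changing behavior. The
filter-then-decode loop it touches can be exercised standalone; sbt_output_filter here
is a hypothetical stand-in for the bytes regex dev/run-tests.py actually uses:

    import re
    import subprocess

    # Drop noisy resolver lines, print everything else decoded as UTF-8.
    sbt_output_filter = re.compile(br"^\[info\] Resolving")  # assumed pattern

    proc = subprocess.Popen(["echo", "[warn] something worth seeing"],
                            stdout=subprocess.PIPE)
    for line in iter(proc.stdout.readline, b""):
        if not sbt_output_filter.match(line):
            print(line.decode("utf-8"), end="")
    proc.wait()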
From 48e318b9aa41e90f1768e5b6a583215a0cbb513a Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Fri, 8 Nov 2019 09:04:53 -0800
Subject: [PATCH 28/31] try subprocess.run()

---
 dev/sparktestsupport/shellutils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index cd2c160e0580..d9cb8aa45c8d 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -55,12 +55,7 @@ def run_cmd(cmd, return_output=False):
         if return_output:
             return subprocess_check_output(cmd).decode('utf-8')
         else:
-            popen = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-            for stdout_line in popen.stdout:
-                print(stdout_line.decode('utf-8'), end='')
-            popen.stdout.close()
-            return_code = popen.wait()
-            return return_code
+            return subprocess.run(cmd, universal_newlines=True, check=True)
     except subprocess.CalledProcessError as e:
         exit_from_command_with_retcode(e.cmd, e.returncode)

From a432afc74126a4376d893f2e2197b32af2ca8c7c Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Fri, 8 Nov 2019 10:59:10 -0800
Subject: [PATCH 29/31] add python3 to executable list if found

---
 python/run-tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/run-tests.py b/python/run-tests.py
index ebf6b63f64f4..182868af9ea1 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -166,6 +166,8 @@ def get_default_python_executables():
         if not p:
             LOGGER.error("No python3 executable found. Exiting!")
             os._exit(1)
+        else:
+            python_execs.insert(0, p)
     return python_execs
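Note: with patch 29 the interpreter discovery in python/run-tests.py is complete:
prefer python3.6, fall back to whatever python3 resolves to, and abort when neither
exists. A self-contained sketch of that logic, substituting shutil.which for the
build's own which() helper (default_python_executables is a local rename):

    import logging
    import os
    from shutil import which

    LOGGER = logging.getLogger(__name__)

    def default_python_executables():
        execs = [x for x in ["python3.6"] if which(x)]
        if "python3.6" not in execs:
            p = which("python3")
            if not p:
                LOGGER.error("No python3 executable found. Exiting!")
                os._exit(1)
            # which() returns a concrete path such as /usr/bin/python3.
            execs.insert(0, p)
        return execs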
From 4ad08b03c0f1de956be2fdc79ab7bfcdcf1abf45 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Tue, 12 Nov 2019 10:14:09 -0800
Subject: [PATCH 30/31] last batch of run-pip-tests python3 ports

---
 dev/run-pip-tests         | 6 +++---
 python/pyspark/context.py | 2 --
 python/pyspark/version.py | 2 +-
 python/setup.py           | 7 +++----
 4 files changed, 7 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 python/setup.py

diff --git a/dev/run-pip-tests b/dev/run-pip-tests
index c25ee7839a80..1294a9096fb9 100755
--- a/dev/run-pip-tests
+++ b/dev/run-pip-tests
@@ -92,7 +92,7 @@ for python in "${PYTHON_EXECS[@]}"; do
     cd "$FWDIR"/python
     # Delete the egg info file if it exists, this can cache the setup file.
     rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion"
-    python setup.py sdist
+    python3 setup.py sdist
 
     echo "Installing dist into virtual env"
@@ -112,9 +112,9 @@
     echo "Run basic sanity check on pip installed version with spark-submit"
     spark-submit "$FWDIR"/dev/pip-sanity-check.py
     echo "Run basic sanity check with import based"
-    python "$FWDIR"/dev/pip-sanity-check.py
+    python3 "$FWDIR"/dev/pip-sanity-check.py
     echo "Run the tests for context.py"
-    python "$FWDIR"/python/pyspark/context.py
+    python3 "$FWDIR"/python/pyspark/context.py
 
     cd "$FWDIR"

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 4d140f91f032..80aa41e59f76 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 #
 
-from __future__ import print_function
-
 import os
 import shutil
 import signal

diff --git a/python/pyspark/version.py b/python/pyspark/version.py
index ba2a40cec01e..1abc41279ebe 100644
--- a/python/pyspark/version.py
+++ b/python/pyspark/version.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more

diff --git a/python/setup.py b/python/setup.py
old mode 100644
new mode 100755
index ea672309703b..092bdd3f9011
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
@@ -16,15 +16,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
 import glob
 import os
 import sys
 from setuptools import setup
 from shutil import copyfile, copytree, rmtree
 
-if sys.version_info < (2, 7):
-    print("Python versions prior to 2.7 are not supported for pip installed PySpark.",
+if sys.version_info < (3, 6):
+    print("Python versions prior to 3.6 are not supported for pip installed PySpark.",
           file=sys.stderr)
     sys.exit(-1)

From b5bada3edaedb7b21d108e90047f5c9f77e6f6a9 Mon Sep 17 00:00:00 2001
From: shane knapp
Date: Wed, 13 Nov 2019 10:14:30 -0800
Subject: [PATCH 31/31] testing to see if pypy/py27 tests will work in py3 env

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 182868af9ea1..5bcf8b066912 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -159,7 +159,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
 
 
 def get_default_python_executables():
-    python_execs = [x for x in ["python3.6"] if which(x)]
+    python_execs = [x for x in ["python3.6", "python2.7", "pypy"] if which(x)]
     if "python3.6" not in python_execs:
         p = which("python3")
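Note: the guard patch 30 adds to python/setup.py works because sys.version_info
compares elementwise against a plain tuple, so every 2.x and 3.0-3.5 interpreter
refuses to pip-install PySpark before setuptools even runs. Extracted as a
standalone check:

    import sys

    # Tuples compare lexicographically: (2, 7, 16) < (3, 6) and (3, 5, 9) < (3, 6).
    if sys.version_info < (3, 6):
        print("Python versions prior to 3.6 are not supported for pip installed PySpark.",
              file=sys.stderr)
        sys.exit(-1)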