diff --git a/README.md b/README.md
index f3b90ceb923e..271f2f5f5b1c 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 [![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7)
 [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
+[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)
 
 Spark is a fast and general cluster computing system for Big Data. It provides
 high-level APIs in Scala, Java, Python, and R, and an optimized engine that
diff --git a/dev/run-tests.py b/dev/run-tests.py
index e1ed2744d78b..edd89c9f0890 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -25,6 +25,8 @@
 import re
 import sys
 import subprocess
+import glob
+import shutil
 from collections import namedtuple
 
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
@@ -400,15 +402,66 @@ def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags):
         run_scala_tests_sbt(test_modules, test_profiles)
 
 
-def run_python_tests(test_modules, parallelism):
+def run_python_tests(test_modules, parallelism, with_coverage=False):
     set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
 
-    command = [os.path.join(SPARK_HOME, "python", "run-tests")]
+    if with_coverage:
+        # Coverage makes the PySpark tests flaky under heavy parallelism.
+        # As a workaround, the parallelism is fixed at 4 for now when the
+        # tests are run with coverage.
+        parallelism = 4
+        script = "run-tests-with-coverage"
+    else:
+        script = "run-tests"
+    command = [os.path.join(SPARK_HOME, "python", script)]
     if test_modules != [modules.root]:
         command.append("--modules=%s" % ','.join(m.name for m in test_modules))
     command.append("--parallelism=%i" % parallelism)
     run_cmd(command)
 
+    if with_coverage:
+        post_python_tests_results()
+
+
+def post_python_tests_results():
+    if "SPARK_TEST_KEY" not in os.environ:
+        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
+              "PySpark coverage results.")
+        sys.exit(1)
+    spark_test_key = os.environ.get("SPARK_TEST_KEY")
+    # The steps below upload HTML coverage files to 'github.com/spark-test/pyspark-coverage-site'.
+    # 1. Clone the PySpark coverage site.
+    run_cmd([
+        "git",
+        "clone",
+        "https://spark-test:%s@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
+    # 2. Remove the existing HTML files.
+    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
+    # 3. Copy the newly generated coverage HTML files.
+    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
+        shutil.copy(f, "pyspark-coverage-site/")
+    os.chdir("pyspark-coverage-site")
+    try:
+        # 4. Check out a temporary branch.
+        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
+        # 5. Add all the files.
+        run_cmd(["git", "add", "-A"])
+        # 6. Commit the current HTML files.
+        run_cmd([
+            "git",
+            "commit",
+            "-am",
+            "Coverage report at latest commit in Apache Spark",
+            '--author="Apache Spark Test Account "'])
+        # 7. Delete the old 'gh-pages' branch.
+        run_cmd(["git", "branch", "-D", "gh-pages"])
+        # 8. Rename the temporary branch to 'gh-pages'.
+        run_cmd(["git", "branch", "-m", "gh-pages"])
+        # 9. Finally, force-push to update the remote repository.
+        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
+    finally:
+        os.chdir("..")
+
 
 def run_python_packaging_tests():
     set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
@@ -567,7 +620,11 @@ def main():
 
     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
-        run_python_tests(modules_with_python_tests, opts.parallelism)
+        # PySpark tests with the coverage report are run only in the specific
+        # Jenkins job that builds Spark master with SBT and Hadoop 2.7.
+        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
+        run_python_tests(
+            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
         run_python_packaging_tests()
     if any(m.should_run_r_tests for m in test_modules):
         run_sparkr_tests()
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index d14e346b7a68..61a816160490 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -22,12 +22,16 @@
 import unittest
 from functools import reduce
 from itertools import chain
+import platform
 
 from pyspark import SparkConf, SparkContext, RDD
 from pyspark.streaming import StreamingContext
 from pyspark.testing.streamingutils import PySparkStreamingTestCase
 
 
+@unittest.skipIf(
+    "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ,
+    "The PyPy implementation hangs the DStream tests when the coverage report is enabled.")
 class BasicOperationTests(PySparkStreamingTestCase):
 
     def test_map(self):
@@ -389,6 +393,9 @@ def failed_func(i):
         self.fail("a failed func should throw an error")
 
 
+@unittest.skipIf(
+    "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ,
+    "The PyPy implementation hangs the DStream tests when the coverage report is enabled.")
 class WindowFunctionTests(PySparkStreamingTestCase):
 
     timeout = 15
@@ -466,6 +473,9 @@ def func(dstream):
         self._test_func(input, func, expected)
 
 
+@unittest.skipIf(
+    "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ,
+    "The PyPy implementation hangs the DStream tests when the coverage report is enabled.")
 class CheckpointTests(unittest.TestCase):
 
     setupCalled = False
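
Note: the @unittest.skipIf condition above is repeated verbatim on three test classes. As an illustration only, the sketch below shows how the condition could be factored into a single shared decorator; the helper name skip_if_pypy_with_coverage and the example test class are hypothetical and not part of this patch.

import os
import platform
import unittest

# Hypothetical helper (not part of the patch above): the skip condition that the
# patch repeats on BasicOperationTests, WindowFunctionTests and CheckpointTests,
# factored into one reusable decorator.
skip_if_pypy_with_coverage = unittest.skipIf(
    "pypy" in platform.python_implementation().lower()
    and "COVERAGE_PROCESS_START" in os.environ,
    "The PyPy implementation hangs the DStream tests when the coverage report is enabled.")


@skip_if_pypy_with_coverage
class ExampleDStreamTests(unittest.TestCase):
    # Placeholder test to show the decorator applied at class level.
    def test_noop(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()

Defining the condition once at module level would keep the skip reason and the coverage/PyPy check consistent across the three classes if either ever needs to change.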