From 7763f3c6d28a3246b40a849150746a220e03a112 Mon Sep 17 00:00:00 2001 From: Juliet Hougland Date: Thu, 14 Apr 2016 07:11:37 -0700 Subject: [PATCH 01/97] Adds setup.py --- python/setup.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 python/setup.py diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 000000000000..e9f643e2dea2 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup, find_packages + +VERSION = __version__ +JAR_PATH = "MY AWESOME JAR PATH" + +setup( + name='pyspark', + version=VERSION, + description='Apache Spark Python API', + author='Spark Developers', + author_email='dev@spark.apache.org', + url='https://github.com/apache/spark/tree/master/python', + packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], + include_package_data = True, + package_data={ + 'spark.jar': [JAR_PATH]}, + license='http://www.apache.org/licenses/LICENSE-2.0', + install_requires=['py4j==0.9'], + extras_require = { + 'ml': ['numpy>=1.7'], + 'sql': ['pandas'] + } +) From 30debc7e6fa3a502d7991d2dee9cf48a69d92168 Mon Sep 17 00:00:00 2001 From: Juliet Hougland Date: Thu, 14 Apr 2016 09:31:01 -0700 Subject: [PATCH 02/97] Fix spacing. --- python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index e9f643e2dea2..0664f7925243 100644 --- a/python/setup.py +++ b/python/setup.py @@ -11,12 +11,12 @@ author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], - include_package_data = True, + include_package_data=True, package_data={ 'spark.jar': [JAR_PATH]}, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.9'], - extras_require = { + extras_require={ 'ml': ['numpy>=1.7'], 'sql': ['pandas'] } From 5155531fce49a0915d6a2187d9adaffc70bfa3f3 Mon Sep 17 00:00:00 2001 From: Juliet Hougland Date: Tue, 11 Oct 2016 22:54:36 -0700 Subject: [PATCH 03/97] updUpdate py4j dependency. Add mllib to extas_require, fix some indentation. --- python/setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 0664f7925243..bdb22cd17e37 100644 --- a/python/setup.py +++ b/python/setup.py @@ -10,14 +10,19 @@ author='Spark Developers', author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', - packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', 'pyspark.streaming'], + packages=['pyspark', + 'pyspark.mllib', + 'pyspark.ml', + 'pyspark.sql', + 'pyspark.streaming'], include_package_data=True, package_data={ 'spark.jar': [JAR_PATH]}, license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.9'], + install_requires=['py4j==0.10.3'], extras_require={ 'ml': ['numpy>=1.7'], + 'mllib': ['numpy<=1.7'] 'sql': ['pandas'] } ) From 2f0bf9b89db9a3a9362b73f2130a2c779fb01a76 Mon Sep 17 00:00:00 2001 From: Juliet Hougland Date: Tue, 11 Oct 2016 23:03:22 -0700 Subject: [PATCH 04/97] Adds MANIFEST.in file. 
--- python/MANIFEST.in | 1 + 1 file changed, 1 insertion(+) create mode 100644 python/MANIFEST.in diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 000000000000..5c8e7a3b94f1 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1 @@ +include somepath/to/spark/jar/*.jar From 7ff8d0f465360463d1cd3b503d1d5d8aded7e88f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 12 Oct 2016 10:02:53 -0700 Subject: [PATCH 05/97] Start working towards post-2.0 pip installable PypSpark (so including list of jars, fix extras_require decl, etc.) --- pom.xml | 1 + python/MANIFEST.in | 2 +- python/setup.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 7d13c51b2a59..bfad4f7267aa 100644 --- a/pom.xml +++ b/pom.xml @@ -26,6 +26,7 @@ org.apache.spark spark-parent_2.11 + 2.1.0-SNAPSHOT pom Spark Project Parent POM diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 5c8e7a3b94f1..1ba76d33bc88 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1 +1 @@ -include somepath/to/spark/jar/*.jar +include ../assembly/target/scala-2.11/jars/*.jar diff --git a/python/setup.py b/python/setup.py index bdb22cd17e37..5ab31bc69d20 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages -VERSION = __version__ -JAR_PATH = "MY AWESOME JAR PATH" +VERSION = '2.1.0-SNAPSHOT' +JARS_PATH = "../assembly/target/scala-2.11/jars/*.jar" setup( name='pyspark', @@ -17,12 +17,12 @@ 'pyspark.streaming'], include_package_data=True, package_data={ - 'spark.jar': [JAR_PATH]}, + 'jars': [JARS_PATH]}, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], extras_require={ 'ml': ['numpy>=1.7'], - 'mllib': ['numpy<=1.7'] + 'mllib': ['numpy<=1.7'], 'sql': ['pandas'] } ) From cb2e06d2e31e113dc29f5212fc9e05ba7d87fa8d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 16 Oct 2016 11:47:52 -0700 Subject: [PATCH 06/97] So MANIFEST and setup can't refer to things above the root of the project, so create symlinks so we can package the JARs with it --- python/MANIFEST.in | 3 ++- python/setup.py | 58 ++++++++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 1ba76d33bc88..ff588778bd07 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1 +1,2 @@ -include ../assembly/target/scala-2.11/jars/*.jar +recursive-include deps/assembly/target/scala-2.11/jars *.jar +recursive-include deps/bin * diff --git a/python/setup.py b/python/setup.py index 5ab31bc69d20..f3de9c94f440 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,28 +1,40 @@ +import os from setuptools import setup, find_packages VERSION = '2.1.0-SNAPSHOT' JARS_PATH = "../assembly/target/scala-2.11/jars/*.jar" +SCRIPTS = "../bin/*" +TEMP_PATH = "deps" -setup( - name='pyspark', - version=VERSION, - description='Apache Spark Python API', - author='Spark Developers', - author_email='dev@spark.apache.org', - url='https://github.com/apache/spark/tree/master/python', - packages=['pyspark', - 'pyspark.mllib', - 'pyspark.ml', - 'pyspark.sql', - 'pyspark.streaming'], - include_package_data=True, - package_data={ - 'jars': [JARS_PATH]}, - license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.10.3'], - extras_require={ - 'ml': ['numpy>=1.7'], - 'mllib': ['numpy<=1.7'], - 'sql': ['pandas'] - } -) +# Construct links for setup +if os.path.isfile(TEMP_PATH): + os.remove(TEMP_PATH) +try: + 
os.symlink("../", TEMP_PATH) + + setup( + name='pyspark', + version=VERSION, + description='Apache Spark Python API', + author='Spark Developers', + author_email='dev@spark.apache.org', + url='https://github.com/apache/spark/tree/master/python', + packages=['pyspark', + 'pyspark.mllib', + 'pyspark.ml', + 'pyspark.sql', + 'pyspark.streaming'], + include_package_data=True, + package_data={ + 'pyspark': [JARS_PATH]}, + scripts=[SCRIPTS], + license='http://www.apache.org/licenses/LICENSE-2.0', + install_requires=['py4j==0.10.3'], + extras_require={ + 'ml': ['numpy>=1.7'], + 'mllib': ['numpy<=1.7'], + 'sql': ['pandas'] + } + ) +finally: + os.remove(TEMP_PATH) From e2e4d1c9f42522db6ec981e6d650855a58150897 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Oct 2016 09:14:48 -0700 Subject: [PATCH 07/97] Keep the symlink --- .gitignore | 1 + python/setup.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 39d17e1793f7..2399a6cd5d90 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ project/plugins/project/build.properties project/plugins/src_managed/ project/plugins/target/ python/lib/pyspark.zip +python/deps reports/ scalastyle-on-compile.generated.xml scalastyle-output.xml diff --git a/python/setup.py b/python/setup.py index f3de9c94f440..313f91a57326 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,17 +1,20 @@ -import os +from __future__ import print_function +import os, sys from setuptools import setup, find_packages VERSION = '2.1.0-SNAPSHOT' -JARS_PATH = "../assembly/target/scala-2.11/jars/*.jar" -SCRIPTS = "../bin/*" +# A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" +JARS_PATH = "%s/assembly/target/scala-2.11/jars/*.jar" % TEMP_PATH +SCRIPTS = "%s/bin/*" % TEMP_PATH # Construct links for setup -if os.path.isfile(TEMP_PATH): - os.remove(TEMP_PATH) try: os.symlink("../", TEMP_PATH) +except: + print("temp path for symlink to parent already exists %s" % TEMP_PATH, file=sys.stderr) +try: setup( name='pyspark', version=VERSION, From fb15d7e3e6b3be7c8c69d776649f4d556656f3f0 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Oct 2016 10:38:40 -0700 Subject: [PATCH 08/97] Some progress we need to use SDIST but is ok --- python/MANIFEST.in | 2 +- python/setup.py | 25 ++++++++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index ff588778bd07..ba230599ca7e 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,2 +1,2 @@ -recursive-include deps/assembly/target/scala-2.11/jars *.jar +recursive-include deps/jars *.jar recursive-include deps/bin * diff --git a/python/setup.py b/python/setup.py index 313f91a57326..77285d0d5731 100644 --- a/python/setup.py +++ b/python/setup.py @@ -5,16 +5,24 @@ VERSION = '2.1.0-SNAPSHOT' # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" -JARS_PATH = "%s/assembly/target/scala-2.11/jars/*.jar" % TEMP_PATH -SCRIPTS = "%s/bin/*" % TEMP_PATH +SPARK_HOME = os.path.abspath("../") +JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME +SCRIPTS_PATH = "%s/bin" % SPARK_HOME +SCRIPTS = "%s/bin" % TEMP_PATH +JARS = "%s/jars" % TEMP_PATH + # Construct links for setup try: - os.symlink("../", TEMP_PATH) + os.mkdir(TEMP_PATH) except: - print("temp path for symlink to parent already exists %s" % TEMP_PATH, file=sys.stderr) + print("Temp path for symlink to parent already 
exists %s" % TEMP_PATH, file=sys.stderr) + exit(-1) try: + os.symlink(JARS_PATH, JARS) + os.symlink(SCRIPTS_PATH, SCRIPTS) + setup( name='pyspark', version=VERSION, @@ -29,8 +37,8 @@ 'pyspark.streaming'], include_package_data=True, package_data={ - 'pyspark': [JARS_PATH]}, - scripts=[SCRIPTS], + 'pyspark': [JARS]}, + scripts=[SCRIPTS + "/*"], license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], extras_require={ @@ -40,4 +48,7 @@ } ) finally: - os.remove(TEMP_PATH) + True +# os.remove("%s/jars" % TEMP_PATH) +# os.remove("%s/bin" % TEMP_PATH) +# os.rmdir(TEMP_PATH) From aab7ee4fcd3bb4825a91f5c5a9baace9944c68d0 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Oct 2016 13:47:14 -0700 Subject: [PATCH 09/97] Reenable cleanup --- python/setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/setup.py b/python/setup.py index 77285d0d5731..cb551a93a7b2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -48,7 +48,6 @@ } ) finally: - True -# os.remove("%s/jars" % TEMP_PATH) -# os.remove("%s/bin" % TEMP_PATH) -# os.rmdir(TEMP_PATH) + os.remove("%s/jars" % TEMP_PATH) + os.remove("%s/bin" % TEMP_PATH) + os.rmdir(TEMP_PATH) From 5a5762001946959fbcc96f8daf1510166ad5665e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 07:13:50 -0700 Subject: [PATCH 10/97] Try and provide a clear error message when pip installed directly, fix symlink farm issue, fix scripts issue, TODO: fix SPARK_HOME and find out why JARs aren't ending up in the install --- python/setup.py | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/python/setup.py b/python/setup.py index cb551a93a7b2..f4e79b29029f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -8,20 +8,34 @@ SPARK_HOME = os.path.abspath("../") JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME SCRIPTS_PATH = "%s/bin" % SPARK_HOME -SCRIPTS = "%s/bin" % TEMP_PATH -JARS = "%s/jars" % TEMP_PATH +SCRIPTS_TARGET = "%s/bin" % TEMP_PATH +JARS_TARGET = "%s/jars" % TEMP_PATH -# Construct links for setup -try: - os.mkdir(TEMP_PATH) -except: - print("Temp path for symlink to parent already exists %s" % TEMP_PATH, file=sys.stderr) - exit(-1) +# Check and see if we are under the spark path in which case we need to build the symlink farm. +# The py4j src file is used to check this since for pip installed we use the py4j libraries rather +# than the source zip. +in_spark = os.path.isfile("lib/py4j-0.10.3-src.zip") +if (in_spark): + # Construct links for setup + try: + os.mkdir(TEMP_PATH) + except: + print("Temp path for symlink to parent already exists %s" % TEMP_PATH, file=sys.stderr) + exit(-1) try: - os.symlink(JARS_PATH, JARS) - os.symlink(SCRIPTS_PATH, SCRIPTS) + if (in_spark): + os.symlink(JARS_PATH, JARS_TARGET) + os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) + + if not os.path.isdir(SCRIPTS_TARGET): + print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) + exit(-1) + + # Scripts directive requires a list of each script path and does not take wild cards. 
+ script_names = os.listdir(SCRIPTS_TARGET) + scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) setup( name='pyspark', @@ -37,8 +51,8 @@ 'pyspark.streaming'], include_package_data=True, package_data={ - 'pyspark': [JARS]}, - scripts=[SCRIPTS + "/*"], + 'pyspark': [JARS_TARGET + "/*"]}, + scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], extras_require={ @@ -48,6 +62,9 @@ } ) finally: - os.remove("%s/jars" % TEMP_PATH) - os.remove("%s/bin" % TEMP_PATH) - os.rmdir(TEMP_PATH) + # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than + # packaging. + if (in_spark): + os.remove("%s/jars" % TEMP_PATH) + os.remove("%s/bin" % TEMP_PATH) + os.rmdir(TEMP_PATH) From 646aa231cc8646b7bde3ec0df455bd64ec48eb00 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 15:56:01 -0700 Subject: [PATCH 11/97] Add two scripts --- bin/find-spark-home | 34 +++++++++++++++++++++++++ bin/find-spark-home.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 bin/find-spark-home create mode 100755 bin/find-spark-home.py diff --git a/bin/find-spark-home b/bin/find-spark-home new file mode 100644 index 000000000000..e1181e3a4152 --- /dev/null +++ b/bin/find-spark-home @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Attempts to find a proper value for SPARK_HOME +if [! -z "${SPARK_HOME}" ]; then + exit 0 +fi + +# Default to standard python interpreter unless told otherwise +if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then + PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" +fi + +# If the user has Python installed use the Python script to search for a valid SPARK_HOME incase +# we are pip installed +if hash $PYSPARK_DRIVER_PYTHON 2>/dev/null; then + export SPARK_HOME=`$PYSPARK_DRIVER_PYTHON $(cd "`dirname "$0"`"/..; pwd) find-spark-home.py` +fi \ No newline at end of file diff --git a/bin/find-spark-home.py b/bin/find-spark-home.py new file mode 100755 index 000000000000..4c2a27ce09b8 --- /dev/null +++ b/bin/find-spark-home.py @@ -0,0 +1,57 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script attempt to determine the correct setting for SPARK_HOME given +# that Spark may have been installed on the system with pip. + +from __future__ import print_function +import os,sys + +def is_spark_home(path): + """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" + return (os.path.isdir(path + "/assembly") + and os.path.isdir(path + "/assembly/target") and False) + +paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] + +# Add the path of the PySpark module if it exists +import sys +if sys.version < "3": + import imp + try: + paths.append(imp.find_module("pyspark")[1]) + except ImportError: + # Not pip installed no worries + True +else: + import importlib + try: + paths.append(importlib.util.find_spec("pyspark").origin) + except ImportError: + # Not pip installed no worries + True + +# Normalize the paths +paths = map(lambda path:os.path.abspath(path), paths) + +try: + print(next(path for path in paths if is_spark_home(path))) +except StopIteration: + print("Could not find valid SPARK_HOME while searching %s" % paths, file=sys.stderr) + exit(-1) From 36c9d45e741929d301ef54dadf33ae56a464f479 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 16:45:18 -0700 Subject: [PATCH 12/97] package_data doesn't work so well with nested directories so instead add pyspark.bin and pyspark.jars packages and set their package dirs as desired, make the spark scripts check and see if they are in a pip installed enviroment and if SPARK_HOME in unset then resolve it with Python [otherwise use the current behaviour] --- bin/beeline | 2 +- bin/find-spark-home | 24 +++++++++++++----------- bin/load-spark-env.sh | 2 +- bin/pyspark | 2 +- bin/run-example | 2 +- bin/spark-class | 4 ++-- bin/spark-shell | 2 +- bin/spark-sql | 2 +- bin/sparkR | 2 +- python/MANIFEST.in | 1 + {bin => python}/find-spark-home.py | 4 ++-- python/setup.py | 13 ++++++++++--- 12 files changed, 35 insertions(+), 25 deletions(-) mode change 100644 => 100755 bin/find-spark-home rename {bin => python}/find-spark-home.py (94%) diff --git a/bin/beeline b/bin/beeline index 1627626941a7..19b3bc2db422 100755 --- a/bin/beeline +++ b/bin/beeline @@ -25,7 +25,7 @@ set -o posix # Figure out if SPARK_HOME is set if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi CLASS="org.apache.hive.beeline.BeeLine" diff --git a/bin/find-spark-home b/bin/find-spark-home old mode 100644 new mode 100755 index e1181e3a4152..3d45ed9d67ec --- a/bin/find-spark-home +++ b/bin/find-spark-home @@ -18,17 +18,19 @@ # # Attempts to find a proper value for SPARK_HOME -if [! 
-z "${SPARK_HOME}" ]; then - exit 0 -fi -# Default to standard python interpreter unless told otherwise -if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then - PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" -fi +FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "`dirname "$0"`"; pwd)/find-spark-home.py" -# If the user has Python installed use the Python script to search for a valid SPARK_HOME incase -# we are pip installed -if hash $PYSPARK_DRIVER_PYTHON 2>/dev/null; then - export SPARK_HOME=`$PYSPARK_DRIVER_PYTHON $(cd "`dirname "$0"`"/..; pwd) find-spark-home.py` +if [ ! -z "${SPARK_HOME}" ]; then + exit 0 +elif [ ! -f $FIND_SPARK_HOME_PYTHON_SCRIPT ]; then + # If we are not in the same directory as find-spark-home.py we are not pip installed + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +else + # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME + # Default to standard python interpreter unless told otherwise + if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then + PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" + fi + export SPARK_HOME=`$PYSPARK_DRIVER_PYTHON $FIND_SPARK_HOME_PYTHON_SCRIPT` fi \ No newline at end of file diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh index eaea964ed5b3..489967da4f57 100644 --- a/bin/load-spark-env.sh +++ b/bin/load-spark-env.sh @@ -23,7 +23,7 @@ # Figure out where Spark is installed if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi if [ -z "$SPARK_ENV_LOADED" ]; then diff --git a/bin/pyspark b/bin/pyspark index 7590309b442e..b6290adf9e72 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh diff --git a/bin/run-example b/bin/run-example index dd0e3c412026..b1a436e35813 100755 --- a/bin/run-example +++ b/bin/run-example @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" diff --git a/bin/spark-class b/bin/spark-class index 377c8d1add3f..846fe5622629 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi . "${SPARK_HOME}"/bin/load-spark-env.sh @@ -36,7 +36,7 @@ else fi # Find Spark jars. 
-if [ -f "${SPARK_HOME}/RELEASE" ]; then +if [ -d "${SPARK_HOME}/jars" ]; then SPARK_JARS_DIR="${SPARK_HOME}/jars" else SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" diff --git a/bin/spark-shell b/bin/spark-shell index 6583b5bd880e..87eefbcbcd27 100755 --- a/bin/spark-shell +++ b/bin/spark-shell @@ -29,7 +29,7 @@ esac set -o posix if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]" diff --git a/bin/spark-sql b/bin/spark-sql index 970d12cbf51d..5f702d63a763 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]" diff --git a/bin/sparkR b/bin/sparkR index 2c07a82e2173..e8ef3d73e3ad 100755 --- a/bin/sparkR +++ b/bin/sparkR @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh diff --git a/python/MANIFEST.in b/python/MANIFEST.in index ba230599ca7e..2b50ed4e5c2b 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,2 +1,3 @@ recursive-include deps/jars *.jar recursive-include deps/bin * +include find-spark-home.py diff --git a/bin/find-spark-home.py b/python/find-spark-home.py similarity index 94% rename from bin/find-spark-home.py rename to python/find-spark-home.py index 4c2a27ce09b8..0a77e6924270 100755 --- a/bin/find-spark-home.py +++ b/python/find-spark-home.py @@ -25,8 +25,8 @@ def is_spark_home(path): """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" - return (os.path.isdir(path + "/assembly") - and os.path.isdir(path + "/assembly/target") and False) + return (os.path.isdir(path + "/bin") + and os.path.isfile(path + "/bin/spark-submit")) paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] diff --git a/python/setup.py b/python/setup.py index f4e79b29029f..521be6022bd4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -28,6 +28,10 @@ if (in_spark): os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) + else: + # We add find-spark-home.py to the bin directory we install so that pip installed PySpark + # will search for SPARK_HOME with Python + os.synlink("find-spark-home.py", SCRIPTS_TARGET + "/") if not os.path.isdir(SCRIPTS_TARGET): print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) @@ -36,6 +40,7 @@ # Scripts directive requires a list of each script path and does not take wild cards. 
script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) + scripts.append("find-spark-home.py") setup( name='pyspark', @@ -48,10 +53,12 @@ 'pyspark.mllib', 'pyspark.ml', 'pyspark.sql', - 'pyspark.streaming'], + 'pyspark.streaming', + 'pyspark.bin', + 'pyspark.jars'], include_package_data=True, - package_data={ - 'pyspark': [JARS_TARGET + "/*"]}, + package_dir={'pyspark.jars': 'deps/jars', 'pyspark.bin': 'deps/bin'}, + package_data={'pyspark.jars': ['*.jar'], 'pyspark.bin': ['*']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], From a78754b778c28fe406ac8c60ede7dbea076a19a1 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 17:07:15 -0700 Subject: [PATCH 13/97] Use copyfile also check for jars dir too --- python/find-spark-home.py | 5 ++--- python/setup.py | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/find-spark-home.py b/python/find-spark-home.py index 0a77e6924270..11ff17479be0 100755 --- a/python/find-spark-home.py +++ b/python/find-spark-home.py @@ -21,12 +21,11 @@ # that Spark may have been installed on the system with pip. from __future__ import print_function -import os,sys +import os, sys def is_spark_home(path): """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" - return (os.path.isdir(path + "/bin") - and os.path.isfile(path + "/bin/spark-submit")) + return (os.path.isfile(path + "/bin/spark-submit") and os.path.isdir(path + "/jars")) paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] diff --git a/python/setup.py b/python/setup.py index 521be6022bd4..117aed8c88c4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,6 +1,7 @@ from __future__ import print_function import os, sys from setuptools import setup, find_packages +from shutil import copyfile VERSION = '2.1.0-SNAPSHOT' # A temporary path so we can access above the Python project root and fetch scripts and jars we need @@ -30,8 +31,8 @@ os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) else: # We add find-spark-home.py to the bin directory we install so that pip installed PySpark - # will search for SPARK_HOME with Python - os.synlink("find-spark-home.py", SCRIPTS_TARGET + "/") + # will search for SPARK_HOME with Python. 
+ copyfile("find-spark-home.py", SCRIPTS_TARGET + "/find-spark-home.py") if not os.path.isdir(SCRIPTS_TARGET): print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) From 955e92b556b2af3f22acd78e8b800a44d900cb31 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 17:17:26 -0700 Subject: [PATCH 14/97] Check if pip installed when finding the shell file --- bin/pyspark | 7 ++++++- bin/pyspark2.cmd | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bin/pyspark b/bin/pyspark index b6290adf9e72..9dc7fff9c3b1 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -61,7 +61,12 @@ export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.3-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" -export PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" +# Check if we are pip installed or otherwise +if [ -d ${SPARK_HOME}/python/pyspark ]; then + export PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" +else + export PYTHONSTARTUP="${SPARK_HOME}/shell.py" +fi # For pyspark tests if [[ -n "$SPARK_TESTING" ]]; then diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 1217a4f2f97a..83f03d0970c4 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -33,6 +33,11 @@ set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.3-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% +rem Check if we are pip installed or not +IF EXIST %SPARK_HOME%\python\pyspark\shell.py ( set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py +) ELSE ( +set PYTHONSTARTUP=%SPARK_HOME%\shell.py +) call "%SPARK_HOME%\bin\spark-submit2.cmd" pyspark-shell-main --name "PySparkShell" %* From 2d88a40c3c6236715b9fbe3af49dafb0999ccf00 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 19 Oct 2016 17:19:40 -0700 Subject: [PATCH 15/97] Check if jars dir exists rather than release file --- .../java/org/apache/spark/launcher/CommandBuilderUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java index 62a22008d0d5..250b2a882feb 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java +++ b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java @@ -357,7 +357,7 @@ static int javaMajorVersion(String javaVersion) { static String findJarsDir(String sparkHome, String scalaVersion, boolean failIfNotFound) { // TODO: change to the correct directory once the assembly build is changed. 
File libdir; - if (new File(sparkHome, "RELEASE").isFile()) { + if (new File(sparkHome, "jars").isDirectory()) { libdir = new File(sparkHome, "jars"); checkState(!failIfNotFound || libdir.isDirectory(), "Library directory '%s' does not exist.", From 9e5c5328e42a462b0f76a2ebad989dfa5b5dcdd5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 23 Oct 2016 08:52:48 -0700 Subject: [PATCH 16/97] Start working a bit on the docs --- dev/make-distribution.sh | 4 ++++ docs/building-spark.md | 6 ++++++ docs/index.md | 4 +++- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 9be4fdfa51c9..5b8ac6766464 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -201,6 +201,10 @@ fi # Copy data files cp -r "$SPARK_HOME/data" "$DISTDIR" +# Make pip package +cd python +python setup.py sdist + # Copy other things mkdir "$DISTDIR"/conf cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf diff --git a/docs/building-spark.md b/docs/building-spark.md index f5acee6b9005..ed37a11f0303 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -261,6 +261,12 @@ or Java 8 tests are automatically enabled when a Java 8 JDK is detected. If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests. +## PySpark pip installable + +If your are building Spark for use in a Python enviroment and you wish to pip install it, you will first need to build the Spark JARs as describeded above. Then you can construct an sdist package suitable for setup.py and pip installable package. + +**Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above. + ## PySpark Tests with Maven If you are building PySpark and wish to run the PySpark tests you will need to build Spark with Hive support. diff --git a/docs/index.md b/docs/index.md index a7a92f6c4f6d..286d9d978b63 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,7 +14,9 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions. Users can also download a "Hadoop free" binary and run Spark with any Hadoop version -[by augmenting Spark's classpath](hadoop-provided.html). +[by augmenting Spark's classpath](hadoop-provided.html). +Scala and Java users can include Spark in their projects using it's maven cooridnates and Python users can also install Spark from PyPi. + If you'd like to build Spark from source, visit [Building Spark](building-spark.html). 
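
A minimal, illustrative sketch of the install layout these packaging patches produce (this check is not from the patches themselves; the exact site-packages location is an assumption). After building the Spark JARs, running `python setup.py sdist` under python/, and pip installing the resulting tarball, the package_dir mapping introduced above ('pyspark.jars' -> deps/jars, 'pyspark.bin' -> deps/bin) places the bundled jars and scripts inside the installed pyspark package itself:

    import os
    import pyspark

    # 'pyspark.jars' and 'pyspark.bin' map to the deps/ symlink farm at build time,
    # so after installation they land inside the pyspark package directory.
    pkg_dir = os.path.dirname(pyspark.__file__)
    print(os.path.isdir(os.path.join(pkg_dir, "jars")))                  # expected: True
    print(os.path.isfile(os.path.join(pkg_dir, "bin", "spark-submit")))  # expected: True

This layout is what the SPARK_HOME detection added in the later commits relies on.
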
From 07d384982caa069e96cc2ac64b9faa9dc19ddc00 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 23 Oct 2016 14:22:59 -0700 Subject: [PATCH 17/97] Try and include pyspark zip file for yarn use --- python/MANIFEST.in | 1 + python/setup.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 2b50ed4e5c2b..6d715ffbe58c 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,3 +1,4 @@ recursive-include deps/jars *.jar recursive-include deps/bin * +include lib/pyspark.zip include find-spark-home.py diff --git a/python/setup.py b/python/setup.py index 117aed8c88c4..f1008836264e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -56,10 +56,18 @@ 'pyspark.sql', 'pyspark.streaming', 'pyspark.bin', - 'pyspark.jars'], + 'pyspark.jars', + 'pyspark.python.lib'], include_package_data=True, - package_dir={'pyspark.jars': 'deps/jars', 'pyspark.bin': 'deps/bin'}, - package_data={'pyspark.jars': ['*.jar'], 'pyspark.bin': ['*']}, + package_dir={ + 'pyspark.jars': 'deps/jars', + 'pyspark.bin': 'deps/bin', + 'pyspark.python.lib': 'lib', + }, + package_data={ + 'pyspark.jars': ['*.jar'], + 'pyspark.bin': ['*'], + 'pyspark.python.lib': ['pyspark.zip']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], From 11b5fa85cbaed0866455a28e88f7868428c36219 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 23 Oct 2016 16:46:28 -0700 Subject: [PATCH 18/97] Copy pyspark zip for use in yarn cluster mode --- python/MANIFEST.in | 2 +- python/setup.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 6d715ffbe58c..cf4932a25b57 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,4 +1,4 @@ recursive-include deps/jars *.jar recursive-include deps/bin * -include lib/pyspark.zip +recursive-include lib *.zip include find-spark-home.py diff --git a/python/setup.py b/python/setup.py index f1008836264e..247b2757ec49 100644 --- a/python/setup.py +++ b/python/setup.py @@ -14,9 +14,7 @@ # Check and see if we are under the spark path in which case we need to build the symlink farm. -# The py4j src file is used to check this since for pip installed we use the py4j libraries rather -# than the source zip. -in_spark = os.path.isfile("lib/py4j-0.10.3-src.zip") +in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") if (in_spark): # Construct links for setup try: @@ -67,7 +65,7 @@ package_data={ 'pyspark.jars': ['*.jar'], 'pyspark.bin': ['*'], - 'pyspark.python.lib': ['pyspark.zip']}, + 'pyspark.python.lib': ['*.zip']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], From 8791f829469f163ff195647d6250bee6f53d0dc4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 05:56:06 -0700 Subject: [PATCH 19/97] Start adding scripts to test pip installability --- dev/run-pip-tests | 31 +++++++++++++++++++++++++++++++ dev/run-pip-tests-2 | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100755 dev/run-pip-tests create mode 100755 dev/run-pip-tests-2 diff --git a/dev/run-pip-tests b/dev/run-pip-tests new file mode 100755 index 000000000000..bd4df95a6e72 --- /dev/null +++ b/dev/run-pip-tests @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" + +# Run the tests +./run-pip-test-2 +export success=$? + +# Clean up the virtual env enviroment used +rm `cat ./virtual_env_temp-dir` + +exit $success diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 new file mode 100755 index 000000000000..5242344d16a7 --- /dev/null +++ b/dev/run-pip-tests-2 @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Stop on error +set -e +set -x + +# Create a temp directory for us to work in and save its name to a file for cleanup +echo "Constucting virtual env for testing" +mktemp -d > ./virtual_env_temp_dir +virtualenv `cat ./virtual_env_temp_dir` + +echo "Creating pip installable source dist" +cd python +python setup.py sdist + +echo "Installing dist into virtual env" +cd dist +pip install *.tar.gz + +cd "$FWDIR" From 92837a3a561cf96746c795d11aa60c2e82e6fa2d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 06:40:05 -0700 Subject: [PATCH 20/97] Works on yarn, works with spark submit, still need to fix import based spark home finder --- bin/spark-submit | 2 +- dev/run-pip-tests | 3 ++- dev/run-pip-tests-2 | 29 +++++++++++++++++++++++++++-- python/MANIFEST.in | 1 + python/setup.py | 12 +++++++++--- 5 files changed, 40 insertions(+), 7 deletions(-) diff --git a/bin/spark-submit b/bin/spark-submit index 023f9c162f4b..b00034971eb0 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source `dirname $0`/find-spark-home fi # disable randomized hash for string in Python 3.3+ diff --git a/dev/run-pip-tests b/dev/run-pip-tests index bd4df95a6e72..6dc0cb54deb3 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -21,7 +21,8 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -# Run the tests +# Run the tests, we wrap the underlying test script for cleanup and because early exit +# doesn't always properly exit a virtualenv. ./run-pip-test-2 export success=$? 
diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 5242344d16a7..84fc80562c51 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -21,10 +21,30 @@ set -e set -x +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" +# Some systems don't have pip or virtualenv - in those cases our tests won't work. +if ! hash virtualenv 2>/dev/null; then + echo "Missing virtualenv skipping pip installability tests." + exit 0 +fi +if ! hash pip 2>/dev/null; then + echo "Missing pip, skipping pip installability tests." + exit 0 +fi + +if [ -d ~/.cache/pip/wheels/ ]; then + echo "Cleaning up pip wheel cache so we install the fresh package" + rm -rf ~/.cache/pip/wheels/ +fi + # Create a temp directory for us to work in and save its name to a file for cleanup echo "Constucting virtual env for testing" mktemp -d > ./virtual_env_temp_dir -virtualenv `cat ./virtual_env_temp_dir` +VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` +echo "Using $VIRTUALENV_BASE for virtualenv" +virtualenv $VIRTUALENV_BASE +source $VIRTUALENV_BASE/bin/activate echo "Creating pip installable source dist" cd python @@ -32,6 +52,11 @@ python setup.py sdist echo "Installing dist into virtual env" cd dist -pip install *.tar.gz +pip install --upgrade --force-reinstall *.tar.gz + +echo "Run basic sanity check on pip installed version with spark-submit" +spark-submit $FWDIR/dev/pip-sanity-check.py +echo "Run basic sanity check with import based" +python $FWDIR/dev/pip-sanity-check.py cd "$FWDIR" diff --git a/python/MANIFEST.in b/python/MANIFEST.in index cf4932a25b57..9033b734feb8 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include deps/jars *.jar recursive-include deps/bin * +recursive-include deps/examples *.py recursive-include lib *.zip include find-spark-home.py diff --git a/python/setup.py b/python/setup.py index 247b2757ec49..afde3699d63f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -8,10 +8,11 @@ TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME +EXAMPLES_PATH = "%s/examples/src/main/python" % SPARK_HOME SCRIPTS_PATH = "%s/bin" % SPARK_HOME SCRIPTS_TARGET = "%s/bin" % TEMP_PATH JARS_TARGET = "%s/jars" % TEMP_PATH - +EXAMPLES_TARGET = "%s/examples" % TEMP_PATH # Check and see if we are under the spark path in which case we need to build the symlink farm. in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") @@ -27,6 +28,7 @@ if (in_spark): os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) + os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) else: # We add find-spark-home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. 
@@ -55,17 +57,20 @@ 'pyspark.streaming', 'pyspark.bin', 'pyspark.jars', - 'pyspark.python.lib'], + 'pyspark.python.lib', + 'pyspark.examples.src.main.python'], include_package_data=True, package_dir={ 'pyspark.jars': 'deps/jars', 'pyspark.bin': 'deps/bin', 'pyspark.python.lib': 'lib', + 'pyspark.examples.src.main.python': 'deps/examples', }, package_data={ 'pyspark.jars': ['*.jar'], 'pyspark.bin': ['*'], - 'pyspark.python.lib': ['*.zip']}, + 'pyspark.python.lib': ['*.zip'], + 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.3'], @@ -81,4 +86,5 @@ if (in_spark): os.remove("%s/jars" % TEMP_PATH) os.remove("%s/bin" % TEMP_PATH) + os.remove("%s/examples" % TEMP_PATH) os.rmdir(TEMP_PATH) From 6947a855f5567eba80b6c3a9cfe97a3fc53fe863 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:00:00 -0700 Subject: [PATCH 21/97] Start updating find-spark-home to be available in many cases. --- python/find-spark-home.py | 64 +++++++++++++++++++--------------- python/pyspark/java_gateway.py | 3 +- python/setup.py | 2 ++ 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/python/find-spark-home.py b/python/find-spark-home.py index 11ff17479be0..a384cf0783b5 100755 --- a/python/find-spark-home.py +++ b/python/find-spark-home.py @@ -23,34 +23,40 @@ from __future__ import print_function import os, sys -def is_spark_home(path): - """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" - return (os.path.isfile(path + "/bin/spark-submit") and os.path.isdir(path + "/jars")) +def _find_spark_home(): + """Find the SPARK_HOME.""" + if "SPARK_HOME" in os.environ: + return os.environ["SPARK_HOME"] + + def is_spark_home(path): + """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" + return (os.path.isfile(path + "/bin/spark-submit") and os.path.isdir(path + "/jars")) + + paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] + + # Add the path of the PySpark module if it exists + if sys.version < "3": + import imp + try: + paths.append(imp.find_module("pyspark")[1]) + except ImportError: + # Not pip installed no worries + True + else: + import importlib + try: + paths.append(importlib.util.find_spec("pyspark").origin) + except ImportError: + # Not pip installed no worries + True + + # Normalize the paths + paths = map(lambda path:os.path.abspath(path), paths) -paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] - -# Add the path of the PySpark module if it exists -import sys -if sys.version < "3": - import imp - try: - paths.append(imp.find_module("pyspark")[1]) - except ImportError: - # Not pip installed no worries - True -else: - import importlib try: - paths.append(importlib.util.find_spec("pyspark").origin) - except ImportError: - # Not pip installed no worries - True - -# Normalize the paths -paths = map(lambda path:os.path.abspath(path), paths) - -try: - print(next(path for path in paths if is_spark_home(path))) -except StopIteration: - print("Could not find valid SPARK_HOME while searching %s" % paths, file=sys.stderr) - exit(-1) + return next(path for path in paths if is_spark_home(path)) + except StopIteration: + print("Could not find valid SPARK_HOME while searching %s" % paths, file=sys.stderr) + +if __name__ == "__main__": + print(_find_spark_home()) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index c1cf843d8438..9ab3573a2bce 100644 --- 
a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -31,7 +31,6 @@ from py4j.java_gateway import java_import, JavaGateway, GatewayClient from pyspark.serializers import read_int - def launch_gateway(conf=None): """ launch jvm gateway @@ -41,7 +40,7 @@ def launch_gateway(conf=None): if "PYSPARK_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) else: - SPARK_HOME = os.environ["SPARK_HOME"] + SPARK_HOME = _find_spark_home() # Launch the Py4j gateway using Spark's run command so that we pick up the # proper classpath and settings from spark-env.sh on_windows = platform.system() == "Windows" diff --git a/python/setup.py b/python/setup.py index afde3699d63f..4e49e0ff28d2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -33,6 +33,8 @@ # We add find-spark-home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. copyfile("find-spark-home.py", SCRIPTS_TARGET + "/find-spark-home.py") + # We also want to use find-spark-home from java_gateway + copyfile("find-spark-home.py", "pyspark/find-spark-home.py") if not os.path.isdir(SCRIPTS_TARGET): print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) From 944160cabbaa96ed00a3d6ff4b7ddff9d29d204a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:08:51 -0700 Subject: [PATCH 22/97] Use Switch to find_spark_home.py --- bin/find-spark-home | 6 +++--- python/{find-spark-home.py => find_spark_home.py} | 0 python/setup.py | 8 +++----- 3 files changed, 6 insertions(+), 8 deletions(-) rename python/{find-spark-home.py => find_spark_home.py} (100%) diff --git a/bin/find-spark-home b/bin/find-spark-home index 3d45ed9d67ec..169590dc3082 100755 --- a/bin/find-spark-home +++ b/bin/find-spark-home @@ -19,12 +19,12 @@ # Attempts to find a proper value for SPARK_HOME -FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "`dirname "$0"`"; pwd)/find-spark-home.py" +FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "`dirname "$0"`"; pwd)/find_spark_home.py" if [ ! -z "${SPARK_HOME}" ]; then exit 0 elif [ ! -f $FIND_SPARK_HOME_PYTHON_SCRIPT ]; then - # If we are not in the same directory as find-spark-home.py we are not pip installed + # If we are not in the same directory as find_spark_home.py we are not pip installed export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" else # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME @@ -33,4 +33,4 @@ else PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" fi export SPARK_HOME=`$PYSPARK_DRIVER_PYTHON $FIND_SPARK_HOME_PYTHON_SCRIPT` -fi \ No newline at end of file +fi diff --git a/python/find-spark-home.py b/python/find_spark_home.py similarity index 100% rename from python/find-spark-home.py rename to python/find_spark_home.py diff --git a/python/setup.py b/python/setup.py index 4e49e0ff28d2..a5ddf3c57eb0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -30,11 +30,9 @@ os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) else: - # We add find-spark-home.py to the bin directory we install so that pip installed PySpark + # We add find_spark_home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. 
- copyfile("find-spark-home.py", SCRIPTS_TARGET + "/find-spark-home.py") - # We also want to use find-spark-home from java_gateway - copyfile("find-spark-home.py", "pyspark/find-spark-home.py") + copyfile("pyspark/find_spark_home.py", SCRIPTS_TARGET + "/find_spark_home.py") if not os.path.isdir(SCRIPTS_TARGET): print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) @@ -43,7 +41,7 @@ # Scripts directive requires a list of each script path and does not take wild cards. script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) - scripts.append("find-spark-home.py") + scripts.append("find_spark_home.py") setup( name='pyspark', From 5bf0746dea5db4421a6ae8edc96de6d567f460e3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:09:03 -0700 Subject: [PATCH 23/97] Move to under pyspark --- python/{ => pyspark}/find_spark_home.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/{ => pyspark}/find_spark_home.py (100%) diff --git a/python/find_spark_home.py b/python/pyspark/find_spark_home.py similarity index 100% rename from python/find_spark_home.py rename to python/pyspark/find_spark_home.py From 435f8427a6ca5bdfae25ba439822e44b7fd4eff4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:13:12 -0700 Subject: [PATCH 24/97] Update to py4j 0.10.4 in the deps, also switch how we are copying find_spark_home.py around --- python/MANIFEST.in | 1 - python/setup.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 9033b734feb8..15026c59f3b0 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,4 +2,3 @@ recursive-include deps/jars *.jar recursive-include deps/bin * recursive-include deps/examples *.py recursive-include lib *.zip -include find-spark-home.py diff --git a/python/setup.py b/python/setup.py index a5ddf3c57eb0..0b861a06c6ff 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,6 @@ # Scripts directive requires a list of each script path and does not take wild cards. script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) - scripts.append("find_spark_home.py") setup( name='pyspark', @@ -73,7 +72,7 @@ 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.10.3'], + install_requires=['py4j==0.10.4'], extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy<=1.7'], From 27ca27eda451cc4edbdb1811bef4c07bdafc98ef Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:16:59 -0700 Subject: [PATCH 25/97] Update java gateway to use _find_spark_home function, add quick sanity check file --- dev/pip-sanity-check.py | 36 ++++++++++++++++++++++++++++++++++ python/pyspark/java_gateway.py | 1 + 2 files changed, 37 insertions(+) create mode 100644 dev/pip-sanity-check.py diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py new file mode 100644 index 000000000000..12f540a3b4a3 --- /dev/null +++ b/dev/pip-sanity-check.py @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark.sql import SparkSession +import sys + +if __name__ == "__main__": + spark = SparkSession\ + .builder\ + .appName("PipSanityCheck")\ + .getOrCreate() + sc = spark.sparkContext + rdd = sc.parallelize(range(100), 10) + value = rdd.reduce(lambda x, y: x + y) + if (value != 4950): + print("Value %d did not match expected value." % value, file=sys.stderr) + sys.exit(-1) + print("Successfuly ran pip sanity check") + + spark.stop() diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 9ab3573a2bce..344a9fde6eea 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -29,6 +29,7 @@ xrange = range from py4j.java_gateway import java_import, JavaGateway, GatewayClient +from pyspark.find_spark_home import _find_spark_home from pyspark.serializers import read_int def launch_gateway(conf=None): From df126cf219b9367792e9a25b7d3493b7a060daee Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 07:45:23 -0700 Subject: [PATCH 26/97] Lint fixes --- python/pyspark/find_spark_home.py | 7 +++++-- python/pyspark/java_gateway.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index a384cf0783b5..a98ffc4c9591 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -21,10 +21,13 @@ # that Spark may have been installed on the system with pip. from __future__ import print_function -import os, sys +import os +import sys + def _find_spark_home(): """Find the SPARK_HOME.""" + # If the enviroment has SPARK_HOME set trust it. if "SPARK_HOME" in os.environ: return os.environ["SPARK_HOME"] @@ -51,7 +54,7 @@ def is_spark_home(path): True # Normalize the paths - paths = map(lambda path:os.path.abspath(path), paths) + paths = map(lambda path: os.path.abspath(path), paths) try: return next(path for path in paths if is_spark_home(path)) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index 344a9fde6eea..3c783ae541a1 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -32,6 +32,7 @@ from pyspark.find_spark_home import _find_spark_home from pyspark.serializers import read_int + def launch_gateway(conf=None): """ launch jvm gateway From 555d4437f32f658e673b060c84c260f18d6376ec Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Oct 2016 08:02:23 -0700 Subject: [PATCH 27/97] More progress on running the pip installability tests --- dev/run-pip-tests | 9 ++++++--- dev/run-pip-tests-2 | 3 +-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 6dc0cb54deb3..3690f14fec0e 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -23,10 +23,13 @@ cd "$FWDIR" # Run the tests, we wrap the underlying test script for cleanup and because early exit # doesn't always properly exit a virtualenv. 
-./run-pip-test-2 +$FWDIR/dev/run-pip-tests-2 export success=$? -# Clean up the virtual env enviroment used -rm `cat ./virtual_env_temp-dir` +# Clean up the virtual env enviroment used if we created one. +if [ -f ./virtual_env_tmp_dir ]; then + rm -rf `cat ./virtual_env_temp_dir` + rm ./virtaul_env_tmp_dir +fi exit $success diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 84fc80562c51..2f4ebbbb39bb 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -19,7 +19,6 @@ # Stop on error set -e -set -x FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" @@ -59,4 +58,4 @@ spark-submit $FWDIR/dev/pip-sanity-check.py echo "Run basic sanity check with import based" python $FWDIR/dev/pip-sanity-check.py -cd "$FWDIR" +exit 0 From 051abe5e319623e946a6841f8f88d7c5ca5263bf Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 04:28:16 -0700 Subject: [PATCH 28/97] Try and unify path used for shell script file, add a README.md file for Python and use it for long_description in PyPi --- bin/pyspark | 7 +------ bin/pyspark2.cmd | 4 ---- python/MANIFEST.in | 1 + python/setup.py | 21 +++++++++++++++++++++ 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/bin/pyspark b/bin/pyspark index e7b7a6997897..deb81ca046f8 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -61,12 +61,7 @@ export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" -# Check if we are pip installed or otherwise -if [ -d ${SPARK_HOME}/python/pyspark ]; then - export PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" -else - export PYTHONSTARTUP="${SPARK_HOME}/shell.py" -fi +export PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" # For pyspark tests if [[ -n "$SPARK_TESTING" ]]; then diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 0e7aee383371..11d9d16222f6 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -34,10 +34,6 @@ set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.4-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% rem Check if we are pip installed or not -IF EXIST %SPARK_HOME%\python\pyspark\shell.py ( set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py -) ELSE ( -set PYTHONSTARTUP=%SPARK_HOME%\shell.py -) call "%SPARK_HOME%\bin\spark-submit2.cmd" pyspark-shell-main --name "PySparkShell" %* diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 15026c59f3b0..49df6917441f 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,3 +2,4 @@ recursive-include deps/jars *.jar recursive-include deps/bin * recursive-include deps/examples *.py recursive-include lib *.zip +inlcude README.md diff --git a/python/setup.py b/python/setup.py index 0b861a06c6ff..b502120a41a2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -15,6 +15,9 @@ EXAMPLES_TARGET = "%s/examples" % TEMP_PATH # Check and see if we are under the spark path in which case we need to build the symlink farm. +# This is important because we only want to build the symlink farm while under Spark otherwise we +# want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. a +# partially built sdist) we should error and have the user sort it out. 
in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") if (in_spark): # Construct links for setup @@ -26,13 +29,29 @@ try: if (in_spark): + # Construct the symlink farm os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) + # Parse the README markdown file into rst for PyPi + try: + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') + except ImportError: + print("Could not import pypandoc - required to package PySpark", file=sys.stderr) + long_description = "!!!!! missing pandoc do not upload to PyPi !!!!" else: # We add find_spark_home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. + # We only do this copy when we aren't inside of Spark (e.g. the packaging tool has copied + # all the files into a temp directory) since otherwise the copy would go into the symlinked + # directory. copyfile("pyspark/find_spark_home.py", SCRIPTS_TARGET + "/find_spark_home.py") + # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts + # find it where expected. The rest of the files aren't copied because they are accessed + # using Python imports instead which will be resolved correctly. + os.makedirs("pyspark/python/pyspark") + copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if not os.path.isdir(SCRIPTS_TARGET): print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) @@ -46,6 +65,7 @@ name='pyspark', version=VERSION, description='Apache Spark Python API', + long_description=long_description, author='Spark Developers', author_email='dev@spark.apache.org', url='https://github.com/apache/spark/tree/master/python', @@ -73,6 +93,7 @@ scripts=scripts, license='http://www.apache.org/licenses/LICENSE-2.0', install_requires=['py4j==0.10.4'], + setup_requires=['pypandoc'], extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy<=1.7'], From b345bdb72e2b175169c82f863b5f14840017266b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 04:31:53 -0700 Subject: [PATCH 29/97] Add README file --- python/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 python/README.md diff --git a/python/README.md b/python/README.md new file mode 100644 index 000000000000..db2ce8611656 --- /dev/null +++ b/python/README.md @@ -0,0 +1,32 @@ +# Apache Spark + +Spark is a fast and general cluster computing system for Big Data. It provides +high-level APIs in Scala, Java, Python, and R, and an optimized engine that +supports general computation graphs for data analysis. It also supports a +rich set of higher-level tools including Spark SQL for SQL and DataFrames, +MLlib for machine learning, GraphX for graph processing, +and Spark Streaming for stream processing. + + + +## Online Documentation + +You can find the latest Spark documentation, including a programming +guide, on the [project web page](http://spark.apache.org/documentation.html) + + +## Python Packaging + +This README file only contains basic information related to pip installed PySpark. +This packaging is currently expiremental and may change in future versions (although we will do our best to keep compatability). +Using PySpark requires the Spark JARs, and if you are building this from source please see the builder instractions at at +["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). 
+ +The Python packaging for Spark is not intended to replace all of the other use cases. This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. You can download the full version of Spark from the [Apache Spark downloads page](http://spark.apache.org/downloads.html). + + +**NOTE:** If you are using this with a Spark standalone cluster you must ensure that the version (including minor version) matches or you may experience odd errors. + +## Python Requirements + +At its core PySpark depends on Py4J (currently version 0.10.4), but additional sub-packages have their own requirements (including numpy and pandas). \ No newline at end of file From 28da44b776481f92f597623e9b5ccbc0c6b637ee Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 04:32:41 -0700 Subject: [PATCH 30/97] Switch version to a PEP440 version otherwise it can't go on PyPiTest, include the README file --- python/MANIFEST.in | 2 +- python/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 49df6917441f..59e56fbcd020 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,4 +2,4 @@ recursive-include deps/jars *.jar recursive-include deps/bin * recursive-include deps/examples *.py recursive-include lib *.zip -inlcude README.md +include README.md diff --git a/python/setup.py b/python/setup.py index b502120a41a2..f09b71dc0aff 100644 --- a/python/setup.py +++ b/python/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages from shutil import copyfile -VERSION = '2.1.0-SNAPSHOT' +VERSION = '2.1.0.dev' # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") From 0f16c087fa16e55f9452efc4645ebf0b7041225d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 04:36:53 -0700 Subject: [PATCH 31/97] More notes --- bin/find-spark-home | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/find-spark-home b/bin/find-spark-home index 169590dc3082..d869ae666b1b 100755 --- a/bin/find-spark-home +++ b/bin/find-spark-home @@ -17,14 +17,19 @@ # limitations under the License. # -# Attempts to find a proper value for SPARK_HOME +# Attempts to find a proper value for SPARK_HOME. Should be included using "source" directive. FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "`dirname "$0"`"; pwd)/find_spark_home.py" +# Short cirtuit if the user already has this set. if [ ! -z "${SPARK_HOME}" ]; then exit 0 elif [ ! -f $FIND_SPARK_HOME_PYTHON_SCRIPT ]; then - # If we are not in the same directory as find_spark_home.py we are not pip installed + # If we are not in the same directory as find_spark_home.py we are not pip installed so we don't + # need to search the different Python directories for a Spark installation. + # Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or + # spark-submit in another directory we want to use that version of PySpark rather than the + # pip installed version of PySpark. 
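For reference, the Python-side lookup that this script falls back to (pyspark/find_spark_home.py, shown earlier in the series) boils down to roughly the following; the candidate list here is abbreviated:

    import os
    import sys

    def is_spark_home(path):
        # A directory looks like SPARK_HOME if it has bin/spark-submit and a jars/ dir.
        return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and
                os.path.isdir(os.path.join(path, "jars")))

    def find_spark_home(candidates=("../", "./")):
        # Trust an explicitly set SPARK_HOME before probing anything else.
        if "SPARK_HOME" in os.environ:
            return os.environ["SPARK_HOME"]
        paths = [os.path.abspath(p) for p in candidates]
        try:
            return next(p for p in paths if is_spark_home(p))
        except StopIteration:
            print("Could not find valid SPARK_HOME", file=sys.stderr)
            sys.exit(-1)
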
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" else # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME From 574c1f05bcbecee7441a545d3410e6b7c33d7e68 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 04:44:18 -0700 Subject: [PATCH 32/97] Add pip-sanity-check.py to the linter list and add a note that we should fix the rest of the pep8 errors under dev at some point in the future --- dev/lint-python | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/lint-python b/dev/lint-python index 63487043a50b..69bf8df9469d 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -20,7 +20,9 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport" +# TODO: fix pep8 errors with the rest of the Python scripts under dev PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/pip-sanity-check.py" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" From 62997446ab802ecbe4af81e89b9dff1ee7070ca2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:00:30 -0700 Subject: [PATCH 33/97] Fix handling of long_description, add check for existing artifacts in dist dir which can give incorrect results during pip installation validation --- dev/run-pip-tests-2 | 14 ++++++++++++++ python/setup.py | 21 +++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 2f4ebbbb39bb..c2875c01709d 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -19,6 +19,8 @@ # Stop on error set -e +# Set nullglob for when we are checking existence based on globs +shopt -s nullglob FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" @@ -44,18 +46,30 @@ VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` echo "Using $VIRTUALENV_BASE for virtualenv" virtualenv $VIRTUALENV_BASE source $VIRTUALENV_BASE/bin/activate +# Upgrade pip +pip install --upgrade pip echo "Creating pip installable source dist" cd python python setup.py sdist + echo "Installing dist into virtual env" cd dist +# Verify that the dist directory only contains one thing to install +sdists=(*.targ.gz) +if [ ! $sdists == 1 ]; then + echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." + exit -1 +fi +# Do the actual installation pip install --upgrade --force-reinstall *.tar.gz echo "Run basic sanity check on pip installed version with spark-submit" spark-submit $FWDIR/dev/pip-sanity-check.py echo "Run basic sanity check with import based" python $FWDIR/dev/pip-sanity-check.py +echo "Run the tests for context.py" +python $FWDIR/python/pyspark/context.py exit 0 diff --git a/python/setup.py b/python/setup.py index f09b71dc0aff..7d7c772fdc00 100644 --- a/python/setup.py +++ b/python/setup.py @@ -33,13 +33,6 @@ os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) - # Parse the README markdown file into rst for PyPi - try: - import pypandoc - long_description = pypandoc.convert('README.md', 'rst') - except ImportError: - print("Could not import pypandoc - required to package PySpark", file=sys.stderr) - long_description = "!!!!! missing pandoc do not upload to PyPi !!!!" 
else: # We add find_spark_home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. @@ -50,7 +43,11 @@ # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts # find it where expected. The rest of the files aren't copied because they are accessed # using Python imports instead which will be resolved correctly. - os.makedirs("pyspark/python/pyspark") + try: + os.makedirs("pyspark/python/pyspark") + except OSError: + # Don't worry if the directory already exists. + True copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if not os.path.isdir(SCRIPTS_TARGET): @@ -61,6 +58,14 @@ script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) + # Parse the README markdown file into rst for PyPi + long_description = "!!!!! missing pandoc do not upload to PyPi !!!!" + try: + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') + except ImportError: + print("Could not import pypandoc - required to package PySpark", file=sys.stderr) + setup( name='pyspark', version=VERSION, From 17104c18355e363ff6f1ec1b105be4b060721daa Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:04:40 -0700 Subject: [PATCH 34/97] Fix check for number of sdists --- dev/run-pip-tests-2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index c2875c01709d..4fb326f49df0 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -57,8 +57,8 @@ python setup.py sdist echo "Installing dist into virtual env" cd dist # Verify that the dist directory only contains one thing to install -sdists=(*.targ.gz) -if [ ! $sdists == 1 ]; then +sdists=(*.tar.gz) +if [ ${#sdists[@]} -ne 1 ]; then echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." exit -1 fi From 0447ea2079ffe1211acb7864d35f291cc6ea37a9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:09:26 -0700 Subject: [PATCH 35/97] Typo fixes, make sure SPARK_HOME isn't being set based on PWD during tests --- bin/pyspark2.cmd | 1 - dev/make-distribution.sh | 1 + dev/run-pip-tests-2 | 2 ++ docs/building-spark.md | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 11d9d16222f6..f211c0873ad2 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -33,7 +33,6 @@ set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.4-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% -rem Check if we are pip installed or not set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py call "%SPARK_HOME%\bin\spark-submit2.cmd" pyspark-shell-main --name "PySparkShell" %* diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 5b8ac6766464..64a420558902 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -204,6 +204,7 @@ cp -r "$SPARK_HOME/data" "$DISTDIR" # Make pip package cd python python setup.py sdist +cd .. 
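The single-sdist guard added to dev/run-pip-tests-2 above exists because leftover artifacts in dist/ can give incorrect results during the pip installation validation; the same check, sketched in Python:

    import glob
    import sys

    sdists = glob.glob("dist/*.tar.gz")
    if len(sdists) != 1:
        print("Expected exactly one sdist in dist/, found %d; clean up old builds first."
              % len(sdists), file=sys.stderr)
        sys.exit(1)
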
# Copy other things mkdir "$DISTDIR"/conf diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 4fb326f49df0..4d294202605c 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -65,6 +65,8 @@ fi # Do the actual installation pip install --upgrade --force-reinstall *.tar.gz +cd / + echo "Run basic sanity check on pip installed version with spark-submit" spark-submit $FWDIR/dev/pip-sanity-check.py echo "Run basic sanity check with import based" diff --git a/docs/building-spark.md b/docs/building-spark.md index 95423c7ec441..5a8c652afd85 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -261,7 +261,7 @@ If you have JDK 8 installed but it is not the system default, you can set JAVA_H ## PySpark pip installable -If your are building Spark for use in a Python enviroment and you wish to pip install it, you will first need to build the Spark JARs as describeded above. Then you can construct an sdist package suitable for setup.py and pip installable package. +If your are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package suitable for setup.py and pip installable package. **Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above. From c335c80f5a2fe4318084c958328ba6e7e8ed360f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:10:27 -0700 Subject: [PATCH 36/97] More typo fixes --- python/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/README.md b/python/README.md index db2ce8611656..0a5c8010b848 100644 --- a/python/README.md +++ b/python/README.md @@ -18,8 +18,8 @@ guide, on the [project web page](http://spark.apache.org/documentation.html) ## Python Packaging This README file only contains basic information related to pip installed PySpark. -This packaging is currently expiremental and may change in future versions (although we will do our best to keep compatability). -Using PySpark requires the Spark JARs, and if you are building this from source please see the builder instractions at at +This packaging is currently experimental and may change in future versions (although we will do our best to keep compatibility). +Using PySpark requires the Spark JARs, and if you are building this from source please see the builder instructions at ["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). The Python packaging for Spark is not intended to replace all of the other use cases. This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. You can download the full version of Spark from the [Apache Spark downloads page](http://spark.apache.org/downloads.html). 
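As a concrete illustration of that use case, a pip-installed PySpark client pointed at an existing standalone cluster would look something like the sketch below; the master URL is made up for the example:

    from pyspark.sql import SparkSession

    # Connect to an already-running standalone master (illustrative URL).
    spark = (SparkSession.builder
             .master("spark://master-host:7077")
             .appName("pip-installed-client")
             .getOrCreate())
    print(spark.range(100).count())
    spark.stop()
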
From 146567b0c8bada74319824662e624d2d4c35ab18 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:22:27 -0700 Subject: [PATCH 37/97] We are python 2 and 3 compat :) --- python/setup.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 python/setup.cfg diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 000000000000..7c2b2874c477 --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal = 1 \ No newline at end of file From 0e2223ddffb81c1eeec04c58940615b398f39908 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:30:47 -0700 Subject: [PATCH 38/97] Use more standard version.py file, check sys version is greater than 2.7, add some classifiers to the setup file (based on the ones libcloud uses that apply to us as well) --- python/pyspark/__init__.py | 2 +- python/pyspark/version.py | 2 ++ python/setup.py | 40 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 python/pyspark/version.py diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index ec1687415a7f..e7d6c96707c8 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -50,7 +50,7 @@ from pyspark.serializers import MarshalSerializer, PickleSerializer from pyspark.status import * from pyspark.profiler import Profiler, BasicProfiler - +from pyspark.version import __version__ def since(version): """ diff --git a/python/pyspark/version.py b/python/pyspark/version.py new file mode 100644 index 000000000000..b79038b1d201 --- /dev/null +++ b/python/pyspark/version.py @@ -0,0 +1,2 @@ +# +__version__ = '2.1.0.dev1' diff --git a/python/setup.py b/python/setup.py index 7d7c772fdc00..b0e2250ad0c6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,9 +1,28 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import os, sys from setuptools import setup, find_packages from shutil import copyfile -VERSION = '2.1.0.dev' +exec(open('pyspark/version.py').read()) +VERSION = __version__ # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") @@ -14,6 +33,10 @@ JARS_TARGET = "%s/jars" % TEMP_PATH EXAMPLES_TARGET = "%s/examples" % TEMP_PATH +if sys.version_info < (2, 7): + print("Python versions prior to 2.7 are not supported.", file=sys.stderr) + exit(-1) + # Check and see if we are under the spark path in which case we need to build the symlink farm. # This is important because we only want to build the symlink farm while under Spark otherwise we # want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. 
a @@ -103,7 +126,20 @@ 'ml': ['numpy>=1.7'], 'mllib': ['numpy<=1.7'], 'sql': ['pandas'] - } + }, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.0', + 'Programming Language :: Python :: 3.1', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy'] ) finally: # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than From 849ded008b2f9646e5fc945dd3796a98dd38625b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 05:48:39 -0700 Subject: [PATCH 39/97] First pass at updating the release-build script --- dev/create-release/release-build.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 96f9b5714ebb..d6afa29eebce 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -162,6 +162,9 @@ if [[ "$1" == "package" ]]; then export ZINC_PORT=$ZINC_PORT echo "Creating distribution: $NAME ($FLAGS)" + # Write out the NAME and VERSION to PySpark version info + echo "__version__='$VERSION.$NAME'" > ./spark-$SPARK_VERSION-bin-$NAME/python/pyspark/version.py + # Get maven home set by MVN MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` @@ -169,7 +172,6 @@ if [[ "$1" == "package" ]]; then -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz @@ -179,6 +181,19 @@ if [[ "$1" == "package" ]]; then echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ spark-$SPARK_VERSION-bin-$NAME.tgz.sha + + PYTHON_DIST_NAME=`ls -a1 python/dist/*.tar.gz` + cp spark-$SPARK_VERSION-bin-$NAME/python/pyspark/dist/*.tgz . + + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output $PYTHON_DIST_NAME.asc \ + --detach-sig $PYTHON_DIST_NAME.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 $PYTHON_DIST_NAME.tgz > \ + $PYTHON_DIST_NAME.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + $PYTHON_DIST_NAME.tgz.sha } # TODO: Check exit codes of children here: From cf5ab7eb4c452e7daa3e2f1d1eafe63363bc5981 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Oct 2016 16:42:40 -0700 Subject: [PATCH 40/97] consider handling being inside a release --- python/setup.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index b0e2250ad0c6..d417845fa943 100644 --- a/python/setup.py +++ b/python/setup.py @@ -17,7 +17,9 @@ # limitations under the License. 
from __future__ import print_function -import os, sys +import glob +import os +import sys from setuptools import setup, find_packages from shutil import copyfile @@ -41,7 +43,9 @@ # This is important because we only want to build the symlink farm while under Spark otherwise we # want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. a # partially built sdist) we should error and have the user sort it out. -in_spark = os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") +in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or + (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) + if (in_spark): # Construct links for setup try: From 3788bfbd7d88fdc551d87fc86bb0ddbfa25d7719 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 07:04:12 -0700 Subject: [PATCH 41/97] Fix up make-distribution to build the python artifacts, update release-build to translate the Spark version string into a PEP440 compat one. Note: Left out is uploading to pypi or pypitest for now as I'm not a committer. I've tested this part locally but I can't do a full test because I'm not a committer. --- dev/create-release/release-build.sh | 45 +++++++++++++++++------------ dev/make-distribution.sh | 3 +- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index d6afa29eebce..99ad6b1a64ac 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -162,15 +162,34 @@ if [[ "$1" == "package" ]]; then export ZINC_PORT=$ZINC_PORT echo "Creating distribution: $NAME ($FLAGS)" - # Write out the NAME and VERSION to PySpark version info - echo "__version__='$VERSION.$NAME'" > ./spark-$SPARK_VERSION-bin-$NAME/python/pyspark/version.py + # Write out the NAME and VERSION to PySpark version info we rewrite the - into a . and SNAPSHOT + # to dev0 to be closer to PEP440. We use the NAME as a "local version". + PYSPARK_VERSION=`echo "$SPARK_VERSION+$NAME" | sed -r "s/-/./" | sed -r "s/SNAPSHOT/dev0/"` + echo "__version__='$PYSPARK_VERSION'" > python/pyspark/version.py # Get maven home set by MVN MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` + echo "Creating distribution" ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. + + echo "Copying and signing python distribution" + PYTHON_DIST_NAME=pyspark-$PYSPARK_VERSION.tar.gz + cp spark-$SPARK_VERSION-bin-$NAME/python/dist/$PYTHON_DIST_NAME . + + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output $PYTHON_DIST_NAME.asc \ + --detach-sig $PYTHON_DIST_NAME + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 $PYTHON_DIST_NAME.gz > \ + $PYTHON_DIST_NAME.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 $PYTHON_DIST_NAME > \ + $PYTHON_DIST_NAME.sha + + echo "Copying and signing regular binary distribution" cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
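The version rewrite a few lines up is the piece that matters for PyPI: Maven-style versions such as 2.1.0-SNAPSHOT are not valid PEP 440 versions, so the first "-" becomes ".", "SNAPSHOT" becomes "dev0", and the binary package NAME rides along as a local version label. Roughly, in Python terms:

    import re

    def pyspark_version(spark_version, name):
        # e.g. ("2.1.0-SNAPSHOT", "hadoop2.7") -> "2.1.0.dev0+hadoop2.7"
        version = "%s+%s" % (spark_version, name)
        version = re.sub(r"-", ".", version, count=1)
        return re.sub(r"SNAPSHOT", "dev0", version)

    print(pyspark_version("2.1.0-SNAPSHOT", "hadoop2.7"))
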
echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ @@ -181,19 +200,6 @@ if [[ "$1" == "package" ]]; then echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ spark-$SPARK_VERSION-bin-$NAME.tgz.sha - - PYTHON_DIST_NAME=`ls -a1 python/dist/*.tar.gz` - cp spark-$SPARK_VERSION-bin-$NAME/python/pyspark/dist/*.tgz . - - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ - --output $PYTHON_DIST_NAME.asc \ - --detach-sig $PYTHON_DIST_NAME.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 $PYTHON_DIST_NAME.tgz > \ - $PYTHON_DIST_NAME.tgz.md5 - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - $PYTHON_DIST_NAME.tgz.sha } # TODO: Check exit codes of children here: @@ -202,10 +208,10 @@ if [[ "$1" == "package" ]]; then # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds # share the same Zinc server. FLAGS="-Psparkr -Phive -Phive-thriftserver -Pyarn -Pmesos" - make_binary_release "hadoop2.3" "-Phadoop2.3 $FLAGS" "3033" & - make_binary_release "hadoop2.4" "-Phadoop2.4 $FLAGS" "3034" & - make_binary_release "hadoop2.6" "-Phadoop2.6 $FLAGS" "3035" & - make_binary_release "hadoop2.7" "-Phadoop2.7 $FLAGS" "3036" & + make_binary_release "hadoop2.3" "-Phadoop-2.3 $FLAGS" "3033" & + make_binary_release "hadoop2.4" "-Phadoop-2.4 $FLAGS" "3034" & + make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" & + make_binary_release "hadoop2.7" "-Phadoop-2.7 $FLAGS" "3036" & make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn -Pmesos" "3037" & make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" & wait @@ -223,6 +229,7 @@ if [[ "$1" == "package" ]]; then # Re-upload a second time and leave the files in the timestamped upload directory: LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' + LFTP mput -O $dest_dir 'pyspark-*' exit 0 fi diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 64a420558902..741745a47c15 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -202,7 +202,8 @@ fi cp -r "$SPARK_HOME/data" "$DISTDIR" # Make pip package -cd python +echo "Building python distribution package" +cd $SPARK_HOME/python python setup.py sdist cd .. 
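If it helps to double-check what the sdist built by make-distribution.sh actually bundles, the tarball under python/dist can be inspected directly; the file name below is only an example, and the deps/ layout comes from the MANIFEST.in entries shown earlier:

    import tarfile

    # Example name; the real one depends on the version written to pyspark/version.py.
    with tarfile.open("python/dist/pyspark-2.1.0.dev0+hadoop2.7.tar.gz") as sdist:
        names = sdist.getnames()

    print(any("/deps/jars/" in n for n in names))   # bundled Spark JARs
    print(any("/deps/bin/" in n for n in names))    # spark-submit and friends
    print(any(n.endswith("/README.md") for n in names))
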
From 308a168cea345cdabd8a8c047fcf190d71505ebd Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 07:09:29 -0700 Subject: [PATCH 42/97] Fix python lint errors and add linting to setup.py --- dev/lint-python | 2 +- python/pyspark/__init__.py | 1 + python/setup.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index 69bf8df9469d..3f878c2dad6b 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport" # TODO: fix pep8 errors with the rest of the Python scripts under dev -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/*.py ./dev/run-tests-jenkins.py" PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/pip-sanity-check.py" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index e7d6c96707c8..5f93586a48a5 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -52,6 +52,7 @@ from pyspark.profiler import Profiler, BasicProfiler from pyspark.version import __version__ + def since(version): """ A decorator that annotates a function to append the version of Spark the function was added. diff --git a/python/setup.py b/python/setup.py index d417845fa943..acf9a1260593 100644 --- a/python/setup.py +++ b/python/setup.py @@ -78,7 +78,7 @@ copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if not os.path.isdir(SCRIPTS_TARGET): - print("For packaging reasons you must first create a source dist and install that source dist.", file=sys.stderr) + print("You must first create a source dist and install that source dist.", file=sys.stderr) exit(-1) # Scripts directive requires a list of each script path and does not take wild cards. 
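One user-visible effect of threading __version__ through pyspark/__init__.py is that an installed package can report its own version, for example:

    import pyspark

    # Exposed via `from pyspark.version import __version__` in pyspark/__init__.py
    print(pyspark.__version__)  # e.g. '2.1.0.dev1'
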
From 74b79c4b1f0ddddcdbfd72841ede88b1f1954c89 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 07:21:33 -0700 Subject: [PATCH 43/97] Add python packaging tests to run-tests script --- dev/run-tests.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 5d661f5f1a1c..bc6b5c3a5ee7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -432,6 +432,12 @@ def run_python_tests(test_modules, parallelism): run_cmd(command) +def run_python_packaging_tests(): + set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS") + command = [os.path.join(SPARK_HOME, "dev", "./dev/run-pip-tests")] + run_cmd(command) + + def run_build_tests(): set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS") run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")]) @@ -583,6 +589,7 @@ def main(): modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: run_python_tests(modules_with_python_tests, opts.parallelism) + run_python_packaging_tests() if any(m.should_run_r_tests for m in test_modules): run_sparkr_tests() From 305655378974762acf23f45a02ee85f802ce7f2a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 07:22:19 -0700 Subject: [PATCH 44/97] Add license header to setup.cfg --- python/setup.cfg | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/setup.cfg b/python/setup.cfg index 7c2b2874c477..ebe3cc06a5f8 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -1,2 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + [bdist_wheel] universal = 1 \ No newline at end of file From 125ae2a60d9396b19e9b33047d9580b3520583c0 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 16:57:24 -0700 Subject: [PATCH 45/97] Fix typo PyPi to PyPI --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 286d9d978b63..689324e68721 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,7 +15,7 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions. Users can also download a "Hadoop free" binary and run Spark with any Hadoop version [by augmenting Spark's classpath](hadoop-provided.html). -Scala and Java users can include Spark in their projects using it's maven cooridnates and Python users can also install Spark from PyPi. 
+Scala and Java users can include Spark in their projects using it's maven cooridnates and in the future Python users can also install Spark from PyPI. If you'd like to build Spark from From d2da8b05d3aec0111223223cb04867adb7d6ebaa Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 16:57:53 -0700 Subject: [PATCH 46/97] Fix typo PyPi to PyPI (2) --- python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index acf9a1260593..717bd4c903a3 100644 --- a/python/setup.py +++ b/python/setup.py @@ -85,8 +85,8 @@ script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) - # Parse the README markdown file into rst for PyPi - long_description = "!!!!! missing pandoc do not upload to PyPi !!!!" + # Parse the README markdown file into rst for PyPI + long_description = "!!!!! missing pandoc do not upload to PyPI !!!!" try: import pypandoc long_description = pypandoc.convert('README.md', 'rst') From 595409fc7f0667f9acc303a57e2746bbd43735eb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 17:47:52 -0700 Subject: [PATCH 47/97] Use copytree and rmtree on windows - note: still not explicitly tested on windows --- python/setup.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/python/setup.py b/python/setup.py index 717bd4c903a3..7eb6768651b8 100644 --- a/python/setup.py +++ b/python/setup.py @@ -21,7 +21,7 @@ import os import sys from setuptools import setup, find_packages -from shutil import copyfile +from shutil import copyfile, copytree, rmtree exec(open('pyspark/version.py').read()) VERSION = __version__ @@ -29,6 +29,10 @@ TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME + +if (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): + JARS_PATH= "%s/jars/" % SPARK_HOME + EXAMPLES_PATH = "%s/examples/src/main/python" % SPARK_HOME SCRIPTS_PATH = "%s/bin" % SPARK_HOME SCRIPTS_TARGET = "%s/bin" % TEMP_PATH @@ -57,16 +61,16 @@ try: if (in_spark): # Construct the symlink farm - os.symlink(JARS_PATH, JARS_TARGET) - os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) - os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) + if getattr(os, "symlink", None) != None: + os.symlink(JARS_PATH, JARS_TARGET) + os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) + os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) + else: + # For windows fall back to the slower copytree + copytree(JARS_PATH, JARS_TARGET) + copytree(SCRIPTS_PATH, SCRIPTS_TARGET) + copytree(EXAMPLES_PATH, EXAMPLES_TARGET) else: - # We add find_spark_home.py to the bin directory we install so that pip installed PySpark - # will search for SPARK_HOME with Python. - # We only do this copy when we aren't inside of Spark (e.g. the packaging tool has copied - # all the files into a temp directory) since otherwise the copy would go into the symlinked - # directory. - copyfile("pyspark/find_spark_home.py", SCRIPTS_TARGET + "/find_spark_home.py") # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts # find it where expected. The rest of the files aren't copied because they are accessed # using Python imports instead which will be resolved correctly. @@ -84,6 +88,9 @@ # Scripts directive requires a list of each script path and does not take wild cards. 
script_names = os.listdir(SCRIPTS_TARGET) scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) + # We add find_spark_home.py to the bin directory we install so that pip installed PySpark + # will search for SPARK_HOME with Python. + scripts.append("pyspark/find_spark_home.py") # Parse the README markdown file into rst for PyPI long_description = "!!!!! missing pandoc do not upload to PyPI !!!!" @@ -149,7 +156,13 @@ # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than # packaging. if (in_spark): - os.remove("%s/jars" % TEMP_PATH) - os.remove("%s/bin" % TEMP_PATH) - os.remove("%s/examples" % TEMP_PATH) + # Depending on cleaning up the symlink farm or copied version + if getattr(os, "symlink", None) != None: + os.remove("%s/jars" % TEMP_PATH) + os.remove("%s/bin" % TEMP_PATH) + os.remove("%s/examples" % TEMP_PATH) + else: + rmtree("%s/jars" % TEMP_PATH) + rmtree("%s/bin" % TEMP_PATH) + rmtree("%s/examples" % TEMP_PATH) os.rmdir(TEMP_PATH) From cf421b0c2ba48f22984d6030256241518c8dc0d1 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Oct 2016 17:59:06 -0700 Subject: [PATCH 48/97] Fix style issues --- python/setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/setup.py b/python/setup.py index 7eb6768651b8..a2dcb5999ea3 100644 --- a/python/setup.py +++ b/python/setup.py @@ -30,8 +30,9 @@ SPARK_HOME = os.path.abspath("../") JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME +# Use the release jars path if we are in release mode. if (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): - JARS_PATH= "%s/jars/" % SPARK_HOME + JARS_PATH = "%s/jars/" % SPARK_HOME EXAMPLES_PATH = "%s/examples/src/main/python" % SPARK_HOME SCRIPTS_PATH = "%s/bin" % SPARK_HOME @@ -61,7 +62,7 @@ try: if (in_spark): # Construct the symlink farm - if getattr(os, "symlink", None) != None: + if getattr(os, "symlink", None) is not None: os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) @@ -157,7 +158,7 @@ # packaging. if (in_spark): # Depending on cleaning up the symlink farm or copied version - if getattr(os, "symlink", None) != None: + if getattr(os, "symlink", None) is not None: os.remove("%s/jars" % TEMP_PATH) os.remove("%s/bin" % TEMP_PATH) os.remove("%s/examples" % TEMP_PATH) From 31ac8e2941869dd82e4ca8fefe925ee32e01a323 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 03:50:04 -0700 Subject: [PATCH 49/97] Add license header to version.py and manifest.in --- python/MANIFEST.in | 18 ++++++++++++++++++ python/pyspark/version.py | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 59e56fbcd020..75df801fcb28 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,3 +1,21 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + recursive-include deps/jars *.jar recursive-include deps/bin * recursive-include deps/examples *.py diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b79038b1d201..e853cbfcd2d3 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -1,2 +1,19 @@ +#!/usr/bin/env python + # +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + __version__ = '2.1.0.dev1' From 0e9cb8d78353d8348279856349e8730bb3dc87da Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 05:36:13 -0700 Subject: [PATCH 50/97] newer version of numpy are fine --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index a2dcb5999ea3..53401a560a65 100644 --- a/python/setup.py +++ b/python/setup.py @@ -136,7 +136,7 @@ setup_requires=['pypandoc'], extras_require={ 'ml': ['numpy>=1.7'], - 'mllib': ['numpy<=1.7'], + 'mllib': ['numpy>=1.7'], 'sql': ['pandas'] }, classifiers=[ From 264b25343b5d81056bfcff97879b8e47f761d0f7 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 06:08:36 -0700 Subject: [PATCH 51/97] Add BLOCK_PYSPARK_PIP_TESTS to jenkins test error codes --- dev/run-tests-jenkins.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index a48d918f9dc1..1d1e72faccf2 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -128,6 +128,7 @@ def run_tests(tests_timeout): ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', + ERROR_CODES["BLOCK_PYSPARK_PIP_TESTS"]: 'PySpark pip packaging tests', ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( tests_timeout) From 802f682b0b7678a9120814c00fa9577215405c75 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 06:12:36 -0700 Subject: [PATCH 52/97] Add README.md as description file to metadata in setup.cfg --- python/setup.cfg | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/setup.cfg b/python/setup.cfg index ebe3cc06a5f8..635082d570b2 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -16,4 +16,7 @@ # [bdist_wheel] -universal = 1 \ No newline at end of file +universal = 1 + +[metadata] +description-file = 
README.md \ No newline at end of file From fba37a057f376b200790ff54d03cf06e3c767c1b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 06:13:09 -0700 Subject: [PATCH 53/97] We store version in a different file now --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a17573800f90..19c86de57125 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - + 2.1.0-SNAPSHOT pom Spark Project Parent POM From 8ba499f1bd0e978a300756c202ce1f83d544bdf0 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 09:01:32 -0700 Subject: [PATCH 54/97] Early PR feedback, switch to os.path.join rather than strings, add a note about why we have our somewhat weird symlink farm, add a newline to end of the setup.cfg, and for the include * switch to graft --- python/MANIFEST.in | 2 +- python/pyspark/find_spark_home.py | 5 +++-- python/setup.cfg | 2 +- python/setup.py | 5 +++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 75df801fcb28..a10902410496 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -17,7 +17,7 @@ # limitations under the License. recursive-include deps/jars *.jar -recursive-include deps/bin * +graft deps/bin recursive-include deps/examples *.py recursive-include lib *.zip include README.md diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index a98ffc4c9591..d236afd0275d 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -33,9 +33,10 @@ def _find_spark_home(): def is_spark_home(path): """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" - return (os.path.isfile(path + "/bin/spark-submit") and os.path.isdir(path + "/jars")) + return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and + (os.path.isdir(os.path.join(path, "jars")))) - paths = ["../", os.path.dirname(sys.argv[0]) + "/../"] + paths = ["../", os.path.join(os.path.dirname(sys.argv[0]), "../")] # Add the path of the PySpark module if it exists if sys.version < "3": diff --git a/python/setup.cfg b/python/setup.cfg index 635082d570b2..d100b932bbaf 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -19,4 +19,4 @@ universal = 1 [metadata] -description-file = README.md \ No newline at end of file +description-file = README.md diff --git a/python/setup.py b/python/setup.py index 53401a560a65..a4ea3f785fa8 100644 --- a/python/setup.py +++ b/python/setup.py @@ -61,7 +61,8 @@ try: if (in_spark): - # Construct the symlink farm + # Construct the symlink farm - this is necessary since we can't refer to the path above the + # package root and we need to copy the jars and scripts which are up above the python root. if getattr(os, "symlink", None) is not None: os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) @@ -88,7 +89,7 @@ # Scripts directive requires a list of each script path and does not take wild cards. script_names = os.listdir(SCRIPTS_TARGET) - scripts = map(lambda script: SCRIPTS_TARGET + "/" + script, script_names) + scripts = map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names) # We add find_spark_home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. 
scripts.append("pyspark/find_spark_home.py") From 1c177f3bf0e64974e5ad65151c3ea815b11db777 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Oct 2016 09:06:59 -0700 Subject: [PATCH 55/97] Add BLOCK_PYSPARK_PIP_TESTS to error code set --- dev/sparktestsupport/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index 89015f8c4fb9..38f25da41f77 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -33,5 +33,6 @@ "BLOCK_SPARKR_UNIT_TESTS": 20, "BLOCK_JAVA_STYLE": 21, "BLOCK_BUILD_TESTS": 22, + "BLOCK_PYSPARK_PIP_TESTS": 23, "BLOCK_TIMEOUT": 124 } From 6ace0701762740472d9c7d73f7a23e0fd2542ac4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 28 Oct 2016 03:37:48 -0700 Subject: [PATCH 56/97] Fix path used to run the pip tests in jenkins --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index bc6b5c3a5ee7..ab285ac96af7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -434,7 +434,7 @@ def run_python_tests(test_modules, parallelism): def run_python_packaging_tests(): set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS") - command = [os.path.join(SPARK_HOME, "dev", "./dev/run-pip-tests")] + command = [os.path.join(SPARK_HOME, "dev", "run-pip-tests")] run_cmd(command) From ab8ca53465cdac5e2af4b365830f8904d166e3da Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 28 Oct 2016 03:38:26 -0700 Subject: [PATCH 57/97] Fix typo --- dev/pip-sanity-check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 12f540a3b4a3..1d1e524b2633 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -31,6 +31,6 @@ if (value != 4950): print("Value %d did not match expected value." % value, file=sys.stderr) sys.exit(-1) - print("Successfuly ran pip sanity check") + print("Successfully ran pip sanity check") spark.stop() From 77f8eca93a82da40f93376f91961b1bba1cc01f7 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 30 Oct 2016 07:29:23 -0700 Subject: [PATCH 58/97] Show how to build the sdist in building-spark.md --- docs/building-spark.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/building-spark.md b/docs/building-spark.md index 5a8c652afd85..70441ce48561 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -263,6 +263,8 @@ If you have JDK 8 installed but it is not the system default, you can set JAVA_H If your are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package suitable for setup.py and pip installable package. + cd python; python setup.py sdist + **Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above. 
## PySpark Tests with Maven From f590898cf8e0b817938b161c723fef6c038f3a53 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 30 Oct 2016 07:30:31 -0700 Subject: [PATCH 59/97] Have clearer messages (as suggested by @viirya) --- python/setup.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index a4ea3f785fa8..e8786a1213a9 100644 --- a/python/setup.py +++ b/python/setup.py @@ -23,7 +23,12 @@ from setuptools import setup, find_packages from shutil import copyfile, copytree, rmtree -exec(open('pyspark/version.py').read()) +try: + exec(open('pyspark/version.py').read()) +except IOError: + print("Failed to load PySpark version file - to build packaging you must be in the python dir.", + file=sys.stderr) + sys.exit(-1) VERSION = __version__ # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" @@ -73,6 +78,10 @@ copytree(SCRIPTS_PATH, SCRIPTS_TARGET) copytree(EXAMPLES_PATH, EXAMPLES_TARGET) else: + # If we are not inside of SPARK_HOME verify we have the required symlink farm + if not os.path.exists(JARS_TARGET): + print("To build packaging must be in the python directory under the SPARK_HOME.", + file=sys.stderr) # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts # find it where expected. The rest of the files aren't copied because they are accessed # using Python imports instead which will be resolved correctly. From f956a5dce0481f24a7569d9bb6f9522cc0c59750 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 30 Oct 2016 07:31:35 -0700 Subject: [PATCH 60/97] Try and improve the wording a little bit --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index e8786a1213a9..0a7c1283d1ae 100644 --- a/python/setup.py +++ b/python/setup.py @@ -26,7 +26,7 @@ try: exec(open('pyspark/version.py').read()) except IOError: - print("Failed to load PySpark version file - to build packaging you must be in the python dir.", + print("Failed to load PySpark version file for packaging you must be in Spark's python dir.", file=sys.stderr) sys.exit(-1) VERSION = __version__ From 489d4e37955599dc10e8fe835799bd71bd922c99 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 30 Oct 2016 18:40:05 -0700 Subject: [PATCH 61/97] Fix typo --- docs/building-spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/building-spark.md b/docs/building-spark.md index 70441ce48561..24baab218fa9 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -261,7 +261,7 @@ If you have JDK 8 installed but it is not the system default, you can set JAVA_H ## PySpark pip installable -If your are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package suitable for setup.py and pip installable package. +If you are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package suitable for setup.py and pip installable package. 
cd python; python setup.py sdist From 9e4fdb58fafdfbec59c2d79c6778fb94cd3eaf73 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 31 Oct 2016 10:22:12 -0700 Subject: [PATCH 62/97] Drop extra .gz --- dev/create-release/release-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 99ad6b1a64ac..da2b68ba102c 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -183,7 +183,7 @@ if [[ "$1" == "package" ]]; then --output $PYTHON_DIST_NAME.asc \ --detach-sig $PYTHON_DIST_NAME echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 $PYTHON_DIST_NAME.gz > \ + MD5 $PYTHON_DIST_NAME > \ $PYTHON_DIST_NAME.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 $PYTHON_DIST_NAME > \ From e668af63e9ee26a7d54f3a8092f32498ab287d67 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 31 Oct 2016 10:23:57 -0700 Subject: [PATCH 63/97] Drop ' --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 689324e68721..1f041725ca61 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,7 +15,7 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions. Users can also download a "Hadoop free" binary and run Spark with any Hadoop version [by augmenting Spark's classpath](hadoop-provided.html). -Scala and Java users can include Spark in their projects using it's maven cooridnates and in the future Python users can also install Spark from PyPI. +Scala and Java users can include Spark in their projects using its maven cooridnates and in the future Python users can also install Spark from PyPI. If you'd like to build Spark from From c9d48d3e24a88ee18d8d4e40e9e07834ab2a7b70 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Nov 2016 07:02:44 -0700 Subject: [PATCH 64/97] Make packaging PySpark as pip optional part of make-distirbution the same as the tgz build --- dev/create-release/release-build.sh | 2 +- dev/make-distribution.sh | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index da2b68ba102c..1dbfa3b6e361 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -171,7 +171,7 @@ if [[ "$1" == "package" ]]; then MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` echo "Creating distribution" - ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz --pip $FLAGS \ -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. 
diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 741745a47c15..49b46fbc3fb2 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -33,6 +33,7 @@ SPARK_HOME="$(cd "`dirname "$0"`/.."; pwd)" DISTDIR="$SPARK_HOME/dist" MAKE_TGZ=false +MAKE_PIP=false NAME=none MVN="$SPARK_HOME/build/mvn" @@ -40,7 +41,7 @@ function exit_with_usage { echo "make-distribution.sh - tool for making binary distributions of Spark" echo "" echo "usage:" - cl_options="[--name] [--tgz] [--mvn ]" + cl_options="[--name] [--tgz] [--pip] [--mvn ]" echo "make-distribution.sh $cl_options " echo "See Spark's \"Building Spark\" doc for correct Maven options." echo "" @@ -67,6 +68,9 @@ while (( "$#" )); do --tgz) MAKE_TGZ=true ;; + --pip) + MAKE_PIP=true + ;; --mvn) MVN="$2" shift @@ -202,10 +206,14 @@ fi cp -r "$SPARK_HOME/data" "$DISTDIR" # Make pip package -echo "Building python distribution package" -cd $SPARK_HOME/python -python setup.py sdist -cd .. +if [ "$MAKE_PIP" == "true" ]; then + echo "Building python distribution package" + cd $SPARK_HOME/python + python setup.py sdist + cd .. +else + echo "Skipping creating pip installable PySpark" +fi # Copy other things mkdir "$DISTDIR"/conf From e9f1e8ee96679bb8a7790e6224cd6f0902d1a7a4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Nov 2016 07:13:22 -0700 Subject: [PATCH 65/97] Fix indentation and clarify error message (since we still technically support Python 2.6 but its deprecated and I'm not making new functionality work with a deprecated version. --- python/setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 0a7c1283d1ae..6f32e63c0ca2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -46,8 +46,9 @@ EXAMPLES_TARGET = "%s/examples" % TEMP_PATH if sys.version_info < (2, 7): - print("Python versions prior to 2.7 are not supported.", file=sys.stderr) - exit(-1) + print("Python versions prior to 2.7 are not supported for pip installed PySpark.", + file=sys.stderr) + exit(-1) # Check and see if we are under the spark path in which case we need to build the symlink farm. # This is important because we only want to build the symlink farm while under Spark otherwise we From 7af912a30d6472684838b8ff424495d28d845682 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Nov 2016 21:59:30 -0700 Subject: [PATCH 66/97] Move Python version check up earlier. --- python/setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/setup.py b/python/setup.py index 6f32e63c0ca2..786fd33e3b27 100644 --- a/python/setup.py +++ b/python/setup.py @@ -23,6 +23,11 @@ from setuptools import setup, find_packages from shutil import copyfile, copytree, rmtree +if sys.version_info < (2, 7): + print("Python versions prior to 2.7 are not supported for pip installed PySpark.", + file=sys.stderr) + exit(-1) + try: exec(open('pyspark/version.py').read()) except IOError: @@ -45,11 +50,6 @@ JARS_TARGET = "%s/jars" % TEMP_PATH EXAMPLES_TARGET = "%s/examples" % TEMP_PATH -if sys.version_info < (2, 7): - print("Python versions prior to 2.7 are not supported for pip installed PySpark.", - file=sys.stderr) - exit(-1) - # Check and see if we are under the spark path in which case we need to build the symlink farm. # This is important because we only want to build the symlink farm while under Spark otherwise we # want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. 
a From c77d9fdc4ff68010816c308e6bbb9a373b6c50c5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 2 Nov 2016 09:21:00 -0700 Subject: [PATCH 67/97] Fix python3 setup --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 786fd33e3b27..171c714396da 100644 --- a/python/setup.py +++ b/python/setup.py @@ -99,7 +99,7 @@ # Scripts directive requires a list of each script path and does not take wild cards. script_names = os.listdir(SCRIPTS_TARGET) - scripts = map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names) + scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names)) # We add find_spark_home.py to the bin directory we install so that pip installed PySpark # will search for SPARK_HOME with Python. scripts.append("pyspark/find_spark_home.py") From 7b1d8b7092d55bfb488e898fd3f95df77ac30b04 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 2 Nov 2016 10:18:13 -0700 Subject: [PATCH 68/97] test both python/python3 if they are installed on the system for pip installability tests. --- dev/run-pip-tests-2 | 75 +++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 4d294202605c..2092859b59a8 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -39,39 +39,56 @@ if [ -d ~/.cache/pip/wheels/ ]; then rm -rf ~/.cache/pip/wheels/ fi -# Create a temp directory for us to work in and save its name to a file for cleanup -echo "Constucting virtual env for testing" -mktemp -d > ./virtual_env_temp_dir -VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` -echo "Using $VIRTUALENV_BASE for virtualenv" -virtualenv $VIRTUALENV_BASE -source $VIRTUALENV_BASE/bin/activate -# Upgrade pip -pip install --upgrade pip +# Figure out which Python execs we should test pip installation with +PYTHON_EXECS=() +if hash python 2>/dev/null; then + # We do this since we are testing with virtualenv and the default virtual env python + # is in /usr/bin/python + PYTHON_EXECS+=('python') +fi +if hash python3 2>/dev/null; then + PYTHON_EXECS+=('python3') +fi -echo "Creating pip installable source dist" -cd python -python setup.py sdist +for python in $PYTHON_EXECS; do + echo "Testing pip installation with python $python" + # Create a temp directory for us to work in and save its name to a file for cleanup + echo "Constucting virtual env for testing" + mktemp -d > ./virtual_env_temp_dir + VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` + echo "Using $VIRTUALENV_BASE for virtualenv" + virtualenv --python=$python $VIRTUALENV_BASE + source $VIRTUALENV_BASE/bin/activate + # Upgrade pip + pip install --upgrade pip + echo "Creating pip installable source dist" + cd $FWDIR/python + $python setup.py sdist -echo "Installing dist into virtual env" -cd dist -# Verify that the dist directory only contains one thing to install -sdists=(*.tar.gz) -if [ ${#sdists[@]} -ne 1 ]; then - echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." - exit -1 -fi -# Do the actual installation -pip install --upgrade --force-reinstall *.tar.gz -cd / + echo "Installing dist into virtual env" + cd dist + # Verify that the dist directory only contains one thing to install + sdists=(*.tar.gz) + if [ ${#sdists[@]} -ne 1 ]; then + echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." 
+ exit -1 + fi + # Do the actual installation + pip install --upgrade --force-reinstall *.tar.gz + + cd / + + echo "Run basic sanity check on pip installed version with spark-submit" + spark-submit $FWDIR/dev/pip-sanity-check.py + echo "Run basic sanity check with import based" + python $FWDIR/dev/pip-sanity-check.py + echo "Run the tests for context.py" + python $FWDIR/python/pyspark/context.py + + cd $FWDIR -echo "Run basic sanity check on pip installed version with spark-submit" -spark-submit $FWDIR/dev/pip-sanity-check.py -echo "Run basic sanity check with import based" -python $FWDIR/dev/pip-sanity-check.py -echo "Run the tests for context.py" -python $FWDIR/python/pyspark/context.py +done exit 0 From 97702607d28239fbc220ecc8f5c674d4c6578dbb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 2 Nov 2016 21:30:22 -0700 Subject: [PATCH 69/97] Actually run the python3 packaging tests and fix path finding --- dev/run-pip-tests-2 | 2 +- python/pyspark/find_spark_home.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 2092859b59a8..d1122a2e5707 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -50,7 +50,7 @@ if hash python3 2>/dev/null; then PYTHON_EXECS+=('python3') fi -for python in $PYTHON_EXECS; do +for python in "${PYTHON_EXECS[@]}"; do echo "Testing pip installation with python $python" # Create a temp directory for us to work in and save its name to a file for cleanup echo "Constucting virtual env for testing" diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index d236afd0275d..dece997be8c1 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -47,9 +47,9 @@ def is_spark_home(path): # Not pip installed no worries True else: - import importlib + from importlib.util import find_spec try: - paths.append(importlib.util.find_spec("pyspark").origin) + paths.append(os.path.dirname(find_spec("pyspark").origin)) except ImportError: # Not pip installed no worries True From f6806b253341895c7be45a1a6c60df98ba4dcec9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 5 Nov 2016 15:57:08 -0700 Subject: [PATCH 70/97] Break up sentence in setup.py error message, drop 3.0-3.3 tags from setup.py tag list --- python/setup.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/setup.py b/python/setup.py index 171c714396da..f1c7b670e911 100644 --- a/python/setup.py +++ b/python/setup.py @@ -31,7 +31,7 @@ try: exec(open('pyspark/version.py').read()) except IOError: - print("Failed to load PySpark version file for packaging you must be in Spark's python dir.", + print("Failed to load PySpark version file for packaging. 
You must be in Spark's python dir.", file=sys.stderr) sys.exit(-1) VERSION = __version__ @@ -155,10 +155,6 @@ 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: Implementation :: CPython', From b0cd6556f588388270a15d765a263c13d6be2901 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 10:44:28 -0800 Subject: [PATCH 71/97] Just copy shell in advance because the setup time copy has issues with python3 venv install for some reason --- python/pyspark/python/pyspark/shell.py | 1 + python/setup.py | 9 --------- 2 files changed, 1 insertion(+), 9 deletions(-) create mode 120000 python/pyspark/python/pyspark/shell.py diff --git a/python/pyspark/python/pyspark/shell.py b/python/pyspark/python/pyspark/shell.py new file mode 120000 index 000000000000..ac054dc86801 --- /dev/null +++ b/python/pyspark/python/pyspark/shell.py @@ -0,0 +1 @@ +pyspark/shell.py \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index f1c7b670e911..49ca1257ba87 100644 --- a/python/setup.py +++ b/python/setup.py @@ -83,15 +83,6 @@ if not os.path.exists(JARS_TARGET): print("To build packaging must be in the python directory under the SPARK_HOME.", file=sys.stderr) - # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts - # find it where expected. The rest of the files aren't copied because they are accessed - # using Python imports instead which will be resolved correctly. - try: - os.makedirs("pyspark/python/pyspark") - except OSError: - # Don't worry if the directory already exists. 
- True - copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if not os.path.isdir(SCRIPTS_TARGET): print("You must first create a source dist and install that source dist.", file=sys.stderr) From 6bb422e78c5df007597447210798edea8ed7cac9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 10:50:47 -0800 Subject: [PATCH 72/97] Change shell symlink --- python/pyspark/python/pyspark/shell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/python/pyspark/shell.py b/python/pyspark/python/pyspark/shell.py index ac054dc86801..2c251005f5a2 120000 --- a/python/pyspark/python/pyspark/shell.py +++ b/python/pyspark/python/pyspark/shell.py @@ -1 +1 @@ -pyspark/shell.py \ No newline at end of file +./python/pyspark/shell.py \ No newline at end of file From b5b4713b0f6cb7e1e66e3ce173161618403bfe85 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 11:11:38 -0800 Subject: [PATCH 73/97] Move the copy up earlier for python3 venv install issue --- .gitignore | 1 + python/pyspark/python/pyspark/shell.py | 1 - python/setup.py | 11 +++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) delete mode 120000 python/pyspark/python/pyspark/shell.py diff --git a/.gitignore b/.gitignore index 2399a6cd5d90..5634a434db0c 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ project/plugins/src_managed/ project/plugins/target/ python/lib/pyspark.zip python/deps +python/pyspark/python reports/ scalastyle-on-compile.generated.xml scalastyle-output.xml diff --git a/python/pyspark/python/pyspark/shell.py b/python/pyspark/python/pyspark/shell.py deleted file mode 120000 index 2c251005f5a2..000000000000 --- a/python/pyspark/python/pyspark/shell.py +++ /dev/null @@ -1 +0,0 @@ -./python/pyspark/shell.py \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 49ca1257ba87..c5bff7c4fd4a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -66,6 +66,16 @@ exit(-1) try: + # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts + # find it where expected. The rest of the files aren't copied because they are accessed + # using Python imports instead which will be resolved correctly. + try: + os.makedirs("pyspark/python/pyspark") + except OSError: + # Don't worry if the directory already exists. + True + copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") + if (in_spark): # Construct the symlink farm - this is necessary since we can't refer to the path above the # package root and we need to copy the jars and scripts which are up above the python root. 
@@ -118,6 +128,7 @@ 'pyspark.streaming', 'pyspark.bin', 'pyspark.jars', + 'pyspark.python.pyspark', 'pyspark.python.lib', 'pyspark.examples.src.main.python'], include_package_data=True, From 2b808dc187d52a7e2f5e9d0c55307935a3010440 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 11:13:15 -0800 Subject: [PATCH 74/97] Fix normalizaiton of paths --- python/pyspark/find_spark_home.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index dece997be8c1..4697b8f4fa8d 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -55,7 +55,7 @@ def is_spark_home(path): True # Normalize the paths - paths = map(lambda path: os.path.abspath(path), paths) + paths = [os.path.abspath(p) for p in paths] try: return next(path for path in paths if is_spark_home(path)) From b958f7e87432bcfffc68cc3b4059575da2591098 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 11:17:30 -0800 Subject: [PATCH 75/97] Handle edit mode based installations --- python/pyspark/find_spark_home.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 4697b8f4fa8d..3be4072ec6ff 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -34,7 +34,8 @@ def _find_spark_home(): def is_spark_home(path): """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and - (os.path.isdir(os.path.join(path, "jars")))) + (os.path.isdir(os.path.join(path, "jars")) or + os.path.isdir(os.path.join(path, "assembly")))) paths = ["../", os.path.join(os.path.dirname(sys.argv[0]), "../")] @@ -42,14 +43,20 @@ def is_spark_home(path): if sys.version < "3": import imp try: - paths.append(imp.find_module("pyspark")[1]) + module_home = imp.find_module("pyspark")[1] + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) except ImportError: # Not pip installed no worries True else: from importlib.util import find_spec try: - paths.append(os.path.dirname(find_spec("pyspark").origin)) + module_home = os.path.dirname(find_spec("pyspark").origin) + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) except ImportError: # Not pip installed no worries True From 577554bf074e787bbf24fb1a0ff605e131705652 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 12:27:20 -0800 Subject: [PATCH 76/97] Just skip caching rather than cleaning up the wheels --- dev/run-pip-tests-2 | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index d1122a2e5707..dd6dc3d98d40 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -34,11 +34,6 @@ if ! 
hash pip 2>/dev/null; then exit 0 fi -if [ -d ~/.cache/pip/wheels/ ]; then - echo "Cleaning up pip wheel cache so we install the fresh package" - rm -rf ~/.cache/pip/wheels/ -fi - # Figure out which Python execs we should test pip installation with PYTHON_EXECS=() if hash python 2>/dev/null; then @@ -76,7 +71,7 @@ for python in "${PYTHON_EXECS[@]}"; do exit -1 fi # Do the actual installation - pip install --upgrade --force-reinstall *.tar.gz + pip install --upgrade --no-cache-dir --force-reinstall *.tar.gz cd / From 154a28757e422f9ee64d9c51b7ca5ffdeab76f82 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 12:35:59 -0800 Subject: [PATCH 77/97] Remove % formatting and replace with format and os.path.join --- dev/pip-sanity-check.py | 2 +- python/pyspark/find_spark_home.py | 2 +- python/setup.py | 29 +++++++++++++++-------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 1d1e524b2633..430c2ab52766 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -29,7 +29,7 @@ rdd = sc.parallelize(range(100), 10) value = rdd.reduce(lambda x, y: x + y) if (value != 4950): - print("Value %d did not match expected value." % value, file=sys.stderr) + print("Value {0} did not match expected value.".format(value), file=sys.stderr) sys.exit(-1) print("Successfully ran pip sanity check") diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 3be4072ec6ff..e34772f7739d 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -67,7 +67,7 @@ def is_spark_home(path): try: return next(path for path in paths if is_spark_home(path)) except StopIteration: - print("Could not find valid SPARK_HOME while searching %s" % paths, file=sys.stderr) + print("Could not find valid SPARK_HOME while searching %s".format(paths), file=sys.stderr) if __name__ == "__main__": print(_find_spark_home()) diff --git a/python/setup.py b/python/setup.py index c5bff7c4fd4a..82281c602d00 100644 --- a/python/setup.py +++ b/python/setup.py @@ -38,17 +38,17 @@ # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") -JARS_PATH = "%s/assembly/target/scala-2.11/jars/" % SPARK_HOME +JARS_PATH = os.path.join(SPARK_HOME, "assembly/target/scala-2.11/jars/") # Use the release jars path if we are in release mode. if (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): - JARS_PATH = "%s/jars/" % SPARK_HOME + JARS_PATH = os.path.join(SPARK_HOME, "jars") -EXAMPLES_PATH = "%s/examples/src/main/python" % SPARK_HOME -SCRIPTS_PATH = "%s/bin" % SPARK_HOME -SCRIPTS_TARGET = "%s/bin" % TEMP_PATH -JARS_TARGET = "%s/jars" % TEMP_PATH -EXAMPLES_TARGET = "%s/examples" % TEMP_PATH +EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") +SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin") +SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin") +JARS_TARGET = os.path.join(TEMP_PATH, "jars") +EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples") # Check and see if we are under the spark path in which case we need to build the symlink farm. 
# This is important because we only want to build the symlink farm while under Spark otherwise we @@ -62,7 +62,8 @@ try: os.mkdir(TEMP_PATH) except: - print("Temp path for symlink to parent already exists %s" % TEMP_PATH, file=sys.stderr) + print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH), + file=sys.stderr) exit(-1) try: @@ -168,11 +169,11 @@ if (in_spark): # Depending on cleaning up the symlink farm or copied version if getattr(os, "symlink", None) is not None: - os.remove("%s/jars" % TEMP_PATH) - os.remove("%s/bin" % TEMP_PATH) - os.remove("%s/examples" % TEMP_PATH) + os.remove(os.path.join(TEMP_PATH, "jars")) + os.remove(os.path.join(TEMP_PATH, "bin")) + os.remove(os.path.join(TEMP_PATH, "examples")) else: - rmtree("%s/jars" % TEMP_PATH) - rmtree("%s/bin" % TEMP_PATH) - rmtree("%s/examples" % TEMP_PATH) + rmtree(os.path.join(TEMP_PATH, "jars")) + rmtree(os.path.join(TEMP_PATH, "bin")) + rmtree(os.path.join(TEMP_PATH, "examples")) os.rmdir(TEMP_PATH) From b478bdf981cc9a49c9dc20adbad8c722744e32a1 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 16:29:10 -0800 Subject: [PATCH 78/97] s/True/pass/ in the places where it makes sense, fix a formatting issue --- python/pyspark/find_spark_home.py | 8 ++++---- python/setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index e34772f7739d..20ccec312a6c 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -37,7 +37,7 @@ def is_spark_home(path): (os.path.isdir(os.path.join(path, "jars")) or os.path.isdir(os.path.join(path, "assembly")))) - paths = ["../", os.path.join(os.path.dirname(sys.argv[0]), "../")] + paths = ["../", os.path.dirname(os.path.realpath(__file__))] # Add the path of the PySpark module if it exists if sys.version < "3": @@ -49,7 +49,7 @@ def is_spark_home(path): paths.append(os.path.join(module_home, "../../")) except ImportError: # Not pip installed no worries - True + pass else: from importlib.util import find_spec try: @@ -59,7 +59,7 @@ def is_spark_home(path): paths.append(os.path.join(module_home, "../../")) except ImportError: # Not pip installed no worries - True + pass # Normalize the paths paths = [os.path.abspath(p) for p in paths] @@ -67,7 +67,7 @@ def is_spark_home(path): try: return next(path for path in paths if is_spark_home(path)) except StopIteration: - print("Could not find valid SPARK_HOME while searching %s".format(paths), file=sys.stderr) + print("Could not find valid SPARK_HOME while searching {0}".format(paths), file=sys.stderr) if __name__ == "__main__": print(_find_spark_home()) diff --git a/python/setup.py b/python/setup.py index 82281c602d00..f2d10337fb96 100644 --- a/python/setup.py +++ b/python/setup.py @@ -74,7 +74,7 @@ os.makedirs("pyspark/python/pyspark") except OSError: # Don't worry if the directory already exists. 
- True + pass copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") if (in_spark): From fb62a8ae7a6a7ad1bff239f2a09e2170dda383a8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 17:47:55 -0800 Subject: [PATCH 79/97] Test both edit mode and regular installs --- dev/run-pip-tests-2 | 78 +++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index dd6dc3d98d40..1df46afec646 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -45,45 +45,61 @@ if hash python3 2>/dev/null; then PYTHON_EXECS+=('python3') fi +echo "Constucting virtual env for testing" +mktemp -d > ./virtual_env_temp_dir +VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` + +# Determine which version of PySpark we are building for archive name +PYSPARK_VERSION=`python -c "exec(open('python/pyspark/version.py').read());print __version__"` +PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" +# The pip install options we use for all the pip commands +PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " +# Test both regular user and edit/dev install modes. +PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" + "pip install $PIP_OPTIONS -e python/") + for python in "${PYTHON_EXECS[@]}"; do - echo "Testing pip installation with python $python" - # Create a temp directory for us to work in and save its name to a file for cleanup - echo "Constucting virtual env for testing" - mktemp -d > ./virtual_env_temp_dir - VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` - echo "Using $VIRTUALENV_BASE for virtualenv" - virtualenv --python=$python $VIRTUALENV_BASE - source $VIRTUALENV_BASE/bin/activate - # Upgrade pip - pip install --upgrade pip + for install_command in "${PIP_COMMANDS[@]}"; do + echo "Testing pip installation with python $python" + # Create a temp directory for us to work in and save its name to a file for cleanup + echo "Using $VIRTUALENV_BASE for virtualenv" + VIRTUALENV_PATH=$VIRTUALENV_BASE/$python + rm -rf $VIRTUALENV_PATH + mkdir -p $VIRTUALENV_PATH + virtualenv --python=$python $VIRTUALENV_PATH + source $VIRTUALENV_PATH/bin/activate + # Upgrade pip + pip install --upgrade pip - echo "Creating pip installable source dist" - cd $FWDIR/python - $python setup.py sdist + echo "Creating pip installable source dist" + cd $FWDIR/python + $python setup.py sdist - echo "Installing dist into virtual env" - cd dist - # Verify that the dist directory only contains one thing to install - sdists=(*.tar.gz) - if [ ${#sdists[@]} -ne 1 ]; then - echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." - exit -1 - fi - # Do the actual installation - pip install --upgrade --no-cache-dir --force-reinstall *.tar.gz + echo "Installing dist into virtual env" + cd dist + # Verify that the dist directory only contains one thing to install + sdists=(*.tar.gz) + if [ ${#sdists[@]} -ne 1 ]; then + echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." 
+ exit -1 + fi + # Do the actual installation + cd $FWDIR + $install_command - cd / + cd / - echo "Run basic sanity check on pip installed version with spark-submit" - spark-submit $FWDIR/dev/pip-sanity-check.py - echo "Run basic sanity check with import based" - python $FWDIR/dev/pip-sanity-check.py - echo "Run the tests for context.py" - python $FWDIR/python/pyspark/context.py + echo "Run basic sanity check on pip installed version with spark-submit" + spark-submit $FWDIR/dev/pip-sanity-check.py + echo "Run basic sanity check with import based" + python $FWDIR/dev/pip-sanity-check.py + echo "Run the tests for context.py" + python $FWDIR/python/pyspark/context.py - cd $FWDIR + cd $FWDIR + done done exit 0 From 6540964e4e584479a67965f03c8c5fbf59f4e132 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 6 Nov 2016 18:36:08 -0800 Subject: [PATCH 80/97] Add exit(-1) --- python/pyspark/find_spark_home.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 20ccec312a6c..8539a4e4df77 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -68,6 +68,7 @@ def is_spark_home(path): return next(path for path in paths if is_spark_home(path)) except StopIteration: print("Could not find valid SPARK_HOME while searching {0}".format(paths), file=sys.stderr) + exit(-1) if __name__ == "__main__": print(_find_spark_home()) From d2389edc02a3c7185ffb793a8d272d4c257fcb8e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Nov 2016 14:57:37 -0800 Subject: [PATCH 81/97] CR feedback - switch symlink support checking into a function and use glob to figure out path to Spark JARs. While making this change add a clearer error message for anyone who might have assembly jars with two different ersions of Scala and an error message for missing JARs indicating missing re-requist build of Spark --- python/setup.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/python/setup.py b/python/setup.py index f2d10337fb96..18a5cd2fc78f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -38,11 +38,22 @@ # A temporary path so we can access above the Python project root and fetch scripts and jars we need TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") -JARS_PATH = os.path.join(SPARK_HOME, "assembly/target/scala-2.11/jars/") -# Use the release jars path if we are in release mode. -if (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): +# Figure out where the jars are we need to package with PySpark. 
+JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/")) + +if len(JARS_PATH) == 1: + JARS_PATH = JARS_PATH[0] +elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): + # Release mode puts the jars in a jars directory JARS_PATH = os.path.join(SPARK_HOME, "jars") +elif len(JARS_PATH) > 1: + print("Assembly jars exist for multiple scalas, please cleanup assembly/target", + file=sys.stderr) + sys.exit(-1) +elif len(JARS_PATH) == 0 and not os.path.exists("deps"): + print("Assembly jars missing, please build Spark before packaging Python", file=sys.stderr) + sys.exit(-1) EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin") @@ -57,6 +68,9 @@ in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) +def _supports_symlinks(): + return getattr(os, "symlink", None) is not None + if (in_spark): # Construct links for setup try: @@ -80,7 +94,7 @@ if (in_spark): # Construct the symlink farm - this is necessary since we can't refer to the path above the # package root and we need to copy the jars and scripts which are up above the python root. - if getattr(os, "symlink", None) is not None: + if _supports_symlinks(): os.symlink(JARS_PATH, JARS_TARGET) os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) @@ -168,7 +182,7 @@ # packaging. if (in_spark): # Depending on cleaning up the symlink farm or copied version - if getattr(os, "symlink", None) is not None: + if _supports_symlinks(): os.remove(os.path.join(TEMP_PATH, "jars")) os.remove(os.path.join(TEMP_PATH, "bin")) os.remove(os.path.join(TEMP_PATH, "examples")) From 48cd1ad7d83b588edc9370f180ac78edb8215e50 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Nov 2016 14:58:31 -0800 Subject: [PATCH 82/97] Add a docstring comment just cause --- python/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/setup.py b/python/setup.py index 18a5cd2fc78f..768aaf6fa4af 100644 --- a/python/setup.py +++ b/python/setup.py @@ -69,6 +69,7 @@ (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) def _supports_symlinks(): + """Check if the system supports symlinks (e.g. *nix) or not." return getattr(os, "symlink", None) is not None if (in_spark): From 23109a44c25b06ec3fefc4938978cf9d76f13e7c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Nov 2016 15:06:41 -0800 Subject: [PATCH 83/97] Fix support_symlinks / docstring --- python/setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 768aaf6fa4af..c4122d0e20be 100644 --- a/python/setup.py +++ b/python/setup.py @@ -68,10 +68,12 @@ in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) + def _supports_symlinks(): - """Check if the system supports symlinks (e.g. *nix) or not." + """Check if the system supports symlinks (e.g. 
*nix) or not.""" return getattr(os, "symlink", None) is not None + if (in_spark): # Construct links for setup try: From 49fc6db5152a0d567de0b0fa66f87a53268be8bd Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Nov 2016 15:12:21 -0800 Subject: [PATCH 84/97] use update to usr bin env python --- python/pyspark/find_spark_home.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py index 8539a4e4df77..212a618b767a 100755 --- a/python/pyspark/find_spark_home.py +++ b/python/pyspark/find_spark_home.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one or more From 7001f90889b38d214bd0b279bd5cd6aa343bc4b3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 9 Nov 2016 05:03:56 -0800 Subject: [PATCH 85/97] s/deps/TEMP_PATH/ incase we change it later --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index c4122d0e20be..980a41f916e8 100644 --- a/python/setup.py +++ b/python/setup.py @@ -51,7 +51,7 @@ print("Assembly jars exist for multiple scalas, please cleanup assembly/target", file=sys.stderr) sys.exit(-1) -elif len(JARS_PATH) == 0 and not os.path.exists("deps"): +elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): print("Assembly jars missing, please build Spark before packaging Python", file=sys.stderr) sys.exit(-1) From 210c9d4fef8c5aca7eddd00b8dc0aa9f7b37bb18 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 11:15:16 -0800 Subject: [PATCH 86/97] drop usr/bin/env python since we don't want MANIFEST to run as a script --- python/MANIFEST.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index a10902410496..7df219ff735c 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with From 9efca67ea4f41cb6a7ff9d0be44395b4db82890c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 20:03:55 -0800 Subject: [PATCH 87/97] Use python2 if available and fallback to python --- dev/run-pip-tests-2 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 1df46afec646..25fe44c6f7c9 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -36,9 +36,12 @@ fi # Figure out which Python execs we should test pip installation with PYTHON_EXECS=() -if hash python 2>/dev/null; then +if hash python2 2>/dev/null; then # We do this since we are testing with virtualenv and the default virtual env python # is in /usr/bin/python + PYTHON_EXECS+=('python2') +elif hash python 2>/dev/null; then + # If python2 isn't installed fallback to python if available PYTHON_EXECS+=('python') fi if hash python3 2>/dev/null; then From fd3e89cf3c0e057dd1fa7e482ddf0e3f1d62a024 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 20:07:43 -0800 Subject: [PATCH 88/97] Fix more shell check issues --- dev/run-pip-tests-2 | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 index 25fe44c6f7c9..d59d054ffb41 100755 --- a/dev/run-pip-tests-2 +++ b/dev/run-pip-tests-2 @@ -22,7 +22,7 @@ set -e # Set nullglob for when we are checking existence based on globs shopt -s nullglob -FWDIR="$(cd "`dirname $0`"/..; pwd)" +FWDIR="$(cd "$(dirname $0)"/..; pwd)" cd "$FWDIR" # Some systems don't have pip or virtualenv - in those cases our tests won't work. if ! hash virtualenv 2>/dev/null; then @@ -50,10 +50,10 @@ fi echo "Constucting virtual env for testing" mktemp -d > ./virtual_env_temp_dir -VIRTUALENV_BASE=`cat ./virtual_env_temp_dir` +VIRTUALENV_BASE=$(cat ./virtual_env_temp_dir) # Determine which version of PySpark we are building for archive name -PYSPARK_VERSION=`python -c "exec(open('python/pyspark/version.py').read());print __version__"` +PYSPARK_VERSION=$(python -c "exec(open('python/pyspark/version.py').read());print __version__") PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" # The pip install options we use for all the pip commands PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " @@ -75,7 +75,7 @@ for python in "${PYTHON_EXECS[@]}"; do pip install --upgrade pip echo "Creating pip installable source dist" - cd $FWDIR/python + cd "$FWDIR"/python $python setup.py sdist @@ -88,19 +88,19 @@ for python in "${PYTHON_EXECS[@]}"; do exit -1 fi # Do the actual installation - cd $FWDIR + cd "$FWDIR" $install_command cd / echo "Run basic sanity check on pip installed version with spark-submit" - spark-submit $FWDIR/dev/pip-sanity-check.py + spark-submit "$FWDIR"/dev/pip-sanity-check.py echo "Run basic sanity check with import based" - python $FWDIR/dev/pip-sanity-check.py + python "$FWDIR"/dev/pip-sanity-check.py echo "Run the tests for context.py" - python $FWDIR/python/pyspark/context.py + python "$FWDIR"/python/pyspark/context.py - cd $FWDIR + cd "$FWDIR" done done From 587c0ebc71ff587e48ec8e71728069d155694dd2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 20:32:21 -0800 Subject: [PATCH 89/97] Fix shellcheck issues - note most of these were prexisting but since we are in here anyways makes sense --- bin/beeline | 2 +- bin/find-spark-home | 8 ++++---- bin/load-spark-env.sh | 2 +- bin/pyspark | 6 +++--- bin/run-example | 2 +- bin/spark-class | 4 ++-- bin/spark-shell | 4 ++-- 
bin/spark-sql | 2 +- bin/spark-submit | 2 +- bin/sparkR | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/beeline b/bin/beeline index 19b3bc2db422..058534699e44 100755 --- a/bin/beeline +++ b/bin/beeline @@ -25,7 +25,7 @@ set -o posix # Figure out if SPARK_HOME is set if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi CLASS="org.apache.hive.beeline.BeeLine" diff --git a/bin/find-spark-home b/bin/find-spark-home index d869ae666b1b..fa78407d4175 100755 --- a/bin/find-spark-home +++ b/bin/find-spark-home @@ -19,23 +19,23 @@ # Attempts to find a proper value for SPARK_HOME. Should be included using "source" directive. -FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "`dirname "$0"`"; pwd)/find_spark_home.py" +FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_spark_home.py" # Short cirtuit if the user already has this set. if [ ! -z "${SPARK_HOME}" ]; then exit 0 -elif [ ! -f $FIND_SPARK_HOME_PYTHON_SCRIPT ]; then +elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then # If we are not in the same directory as find_spark_home.py we are not pip installed so we don't # need to search the different Python directories for a Spark installation. # Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or # spark-submit in another directory we want to use that version of PySpark rather than the # pip installed version of PySpark. - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)" else # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME # Default to standard python interpreter unless told otherwise if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" fi - export SPARK_HOME=`$PYSPARK_DRIVER_PYTHON $FIND_SPARK_HOME_PYTHON_SCRIPT` + export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT") fi diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh index 489967da4f57..8a2f709960a2 100644 --- a/bin/load-spark-env.sh +++ b/bin/load-spark-env.sh @@ -23,7 +23,7 @@ # Figure out where Spark is installed if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi if [ -z "$SPARK_ENV_LOADED" ]; then diff --git a/bin/pyspark b/bin/pyspark index deb81ca046f8..98387c2ec5b8 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh @@ -46,7 +46,7 @@ WORKS_WITH_IPYTHON=$(python -c 'import sys; print(sys.version_info >= (2, 7, 0)) # Determine the Python executable to use for the executors: if [[ -z "$PYSPARK_PYTHON" ]]; then - if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! WORKS_WITH_IPYTHON ]]; then + if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! 
$WORKS_WITH_IPYTHON ]]; then echo "IPython requires Python 2.7+; please install python2.7 or set PYSPARK_PYTHON" 1>&2 exit 1 else @@ -68,7 +68,7 @@ if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m $1 + exec "$PYSPARK_DRIVER_PYTHON" -m "$1" exit fi diff --git a/bin/run-example b/bin/run-example index b1a436e35813..4ba5399311d3 100755 --- a/bin/run-example +++ b/bin/run-example @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" diff --git a/bin/spark-class b/bin/spark-class index 846fe5622629..77ea40cc3794 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi . "${SPARK_HOME}"/bin/load-spark-env.sh @@ -27,7 +27,7 @@ fi if [ -n "${JAVA_HOME}" ]; then RUNNER="${JAVA_HOME}/bin/java" else - if [ `command -v java` ]; then + if [ "$(command -v java)" ]; then RUNNER="java" else echo "JAVA_HOME is not set" >&2 diff --git a/bin/spark-shell b/bin/spark-shell index 87eefbcbcd27..421f36cac3d4 100755 --- a/bin/spark-shell +++ b/bin/spark-shell @@ -21,7 +21,7 @@ # Shell script for starting the Spark Shell REPL cygwin=false -case "`uname`" in +case "$(uname)" in CYGWIN*) cygwin=true;; esac @@ -29,7 +29,7 @@ esac set -o posix if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]" diff --git a/bin/spark-sql b/bin/spark-sql index 5f702d63a763..b08b944ebd31 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]" diff --git a/bin/spark-submit b/bin/spark-submit index b00034971eb0..4e9d3614e637 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi # disable randomized hash for string in Python 3.3+ diff --git a/bin/sparkR b/bin/sparkR index e8ef3d73e3ad..29ab10df8ab6 100755 --- a/bin/sparkR +++ b/bin/sparkR @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - source `dirname $0`/find-spark-home + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh From 290499848cf9d65fa35e8488f11531091a014081 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 20:47:51 -0800 Subject: [PATCH 90/97] Move pip tests into a self cleaning up script instead of 2 --- dev/run-pip-tests | 98 ++++++++++++++++++++++++++++++++++++---- dev/run-pip-tests-2 | 108 -------------------------------------------- 2 files changed, 89 insertions(+), 117 deletions(-) delete mode 100755 dev/run-pip-tests-2 diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 3690f14fec0e..e1da18e60bb3 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -17,19 +17,99 @@ # limitations under the License. 
# +# Stop on error +set -e +# Set nullglob for when we are checking existence based on globs +shopt -s nullglob -FWDIR="$(cd "`dirname $0`"/..; pwd)" +FWDIR="$(cd "$(dirname "$0")"/..; pwd)" cd "$FWDIR" -# Run the tests, we wrap the underlying test script for cleanup and because early exit -# doesn't always properly exit a virtualenv. -$FWDIR/dev/run-pip-tests-2 -export success=$? +echo "Constucting virtual env for testing" +VIRTUALENV_BASE=$(mktemp -d) # Clean up the virtual env enviroment used if we created one. -if [ -f ./virtual_env_tmp_dir ]; then - rm -rf `cat ./virtual_env_temp_dir` - rm ./virtaul_env_tmp_dir +function delete_virtualenv() { + echo "Cleaning up temporary directory - $VIRTUALENV_BASE" + rm -rf "$VIRTUALENV_BASE" +} +trap delete_virtualenv EXIT + +# Some systems don't have pip or virtualenv - in those cases our tests won't work. +if ! hash virtualenv 2>/dev/null; then + echo "Missing virtualenv skipping pip installability tests." + exit 0 +fi +if ! hash pip 2>/dev/null; then + echo "Missing pip, skipping pip installability tests." + exit 0 +fi + +# Figure out which Python execs we should test pip installation with +PYTHON_EXECS=() +if hash python2 2>/dev/null; then + # We do this since we are testing with virtualenv and the default virtual env python + # is in /usr/bin/python + PYTHON_EXECS+=('python2') +elif hash python 2>/dev/null; then + # If python2 isn't installed fallback to python if available + PYTHON_EXECS+=('python') fi +if hash python3 2>/dev/null; then + PYTHON_EXECS+=('python3') +fi + +# Determine which version of PySpark we are building for archive name +PYSPARK_VERSION=$(python -c "exec(open('python/pyspark/version.py').read());print __version__") +PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" +# The pip install options we use for all the pip commands +PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " +# Test both regular user and edit/dev install modes. +PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" + "pip install $PIP_OPTIONS -e python/") + +for python in "${PYTHON_EXECS[@]}"; do + for install_command in "${PIP_COMMANDS[@]}"; do + echo "Testing pip installation with python $python" + # Create a temp directory for us to work in and save its name to a file for cleanup + echo "Using $VIRTUALENV_BASE for virtualenv" + VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python + rm -rf "$VIRTUALENV_PATH" + mkdir -p "$VIRTUALENV_PATH" + virtualenv --python=$python "$VIRTUALENV_PATH" + source "$VIRTUALENV_PATH"/bin/activate + # Upgrade pip + pip install --upgrade pip + + echo "Creating pip installable source dist" + cd "$FWDIR"/python + $python setup.py sdist + + + echo "Installing dist into virtual env" + cd dist + # Verify that the dist directory only contains one thing to install + sdists=(*.tar.gz) + if [ ${#sdists[@]} -ne 1 ]; then + echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." 
+ exit -1 + fi + # Do the actual installation + cd "$FWDIR" + $install_command + + cd / + + echo "Run basic sanity check on pip installed version with spark-submit" + spark-submit "$FWDIR"/dev/pip-sanity-check.py + echo "Run basic sanity check with import based" + python "$FWDIR"/dev/pip-sanity-check.py + echo "Run the tests for context.py" + python "$FWDIR"/python/pyspark/context.py + + cd "$FWDIR" + + done +done -exit $success +exit 0 diff --git a/dev/run-pip-tests-2 b/dev/run-pip-tests-2 deleted file mode 100755 index d59d054ffb41..000000000000 --- a/dev/run-pip-tests-2 +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stop on error -set -e -# Set nullglob for when we are checking existence based on globs -shopt -s nullglob - -FWDIR="$(cd "$(dirname $0)"/..; pwd)" -cd "$FWDIR" -# Some systems don't have pip or virtualenv - in those cases our tests won't work. -if ! hash virtualenv 2>/dev/null; then - echo "Missing virtualenv skipping pip installability tests." - exit 0 -fi -if ! hash pip 2>/dev/null; then - echo "Missing pip, skipping pip installability tests." - exit 0 -fi - -# Figure out which Python execs we should test pip installation with -PYTHON_EXECS=() -if hash python2 2>/dev/null; then - # We do this since we are testing with virtualenv and the default virtual env python - # is in /usr/bin/python - PYTHON_EXECS+=('python2') -elif hash python 2>/dev/null; then - # If python2 isn't installed fallback to python if available - PYTHON_EXECS+=('python') -fi -if hash python3 2>/dev/null; then - PYTHON_EXECS+=('python3') -fi - -echo "Constucting virtual env for testing" -mktemp -d > ./virtual_env_temp_dir -VIRTUALENV_BASE=$(cat ./virtual_env_temp_dir) - -# Determine which version of PySpark we are building for archive name -PYSPARK_VERSION=$(python -c "exec(open('python/pyspark/version.py').read());print __version__") -PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" -# The pip install options we use for all the pip commands -PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " -# Test both regular user and edit/dev install modes. 
-PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" - "pip install $PIP_OPTIONS -e python/") - -for python in "${PYTHON_EXECS[@]}"; do - for install_command in "${PIP_COMMANDS[@]}"; do - echo "Testing pip installation with python $python" - # Create a temp directory for us to work in and save its name to a file for cleanup - echo "Using $VIRTUALENV_BASE for virtualenv" - VIRTUALENV_PATH=$VIRTUALENV_BASE/$python - rm -rf $VIRTUALENV_PATH - mkdir -p $VIRTUALENV_PATH - virtualenv --python=$python $VIRTUALENV_PATH - source $VIRTUALENV_PATH/bin/activate - # Upgrade pip - pip install --upgrade pip - - echo "Creating pip installable source dist" - cd "$FWDIR"/python - $python setup.py sdist - - - echo "Installing dist into virtual env" - cd dist - # Verify that the dist directory only contains one thing to install - sdists=(*.tar.gz) - if [ ${#sdists[@]} -ne 1 ]; then - echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." - exit -1 - fi - # Do the actual installation - cd "$FWDIR" - $install_command - - cd / - - echo "Run basic sanity check on pip installed version with spark-submit" - spark-submit "$FWDIR"/dev/pip-sanity-check.py - echo "Run basic sanity check with import based" - python "$FWDIR"/dev/pip-sanity-check.py - echo "Run the tests for context.py" - python "$FWDIR"/python/pyspark/context.py - - cd "$FWDIR" - - done -done - -exit 0 From 3345eb9bd3f0a165ae1d20d76e5d36e88e5512cd Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 11 Nov 2016 23:55:28 -0800 Subject: [PATCH 91/97] Clarify what is required to build the PySpark pip installable artifacts. --- python/setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 980a41f916e8..4a85f37d1e06 100644 --- a/python/setup.py +++ b/python/setup.py @@ -113,7 +113,15 @@ def _supports_symlinks(): file=sys.stderr) if not os.path.isdir(SCRIPTS_TARGET): - print("You must first create a source dist and install that source dist.", file=sys.stderr) + print("""If you are installing pyspark from spark source, you must first + build Spark and run sdist. + + To build Spark with maven you can run: + ./build/mvn -DskipTests clean package + Building the source dist is done in the Python directory: + cd python + python setup.py sdist + pip install dist/*.tar.gz""", file=sys.stderr) exit(-1) # Scripts directive requires a list of each script path and does not take wild cards. From f86574a0cdf769d106b4ac5126cf3c9c9f0cca94 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 12 Nov 2016 14:03:21 -0800 Subject: [PATCH 92/97] Make messaging more consistent --- python/setup.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/python/setup.py b/python/setup.py index 4a85f37d1e06..1d526d2f913b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -39,6 +39,18 @@ TEMP_PATH = "deps" SPARK_HOME = os.path.abspath("../") +# Provide guidance about how to use setup.py +incorrect_invocation_message = """ +If you are installing pyspark from spark source, you must first build Spark and +run sdist. + + To build Spark with maven you can run: + ./build/mvn -DskipTests clean package + Building the source dist is done in the Python directory: + cd python + python setup.py sdist + pip install dist/*.tar.gz""" + # Figure out where the jars are we need to package with PySpark. 
JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/")) @@ -52,7 +64,7 @@ file=sys.stderr) sys.exit(-1) elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): - print("Assembly jars missing, please build Spark before packaging Python", file=sys.stderr) + print(incorrect_invocation_message, file=sys.stderr) sys.exit(-1) EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") @@ -61,6 +73,7 @@ JARS_TARGET = os.path.join(TEMP_PATH, "jars") EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples") + # Check and see if we are under the spark path in which case we need to build the symlink farm. # This is important because we only want to build the symlink farm while under Spark otherwise we # want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. a @@ -113,15 +126,7 @@ def _supports_symlinks(): file=sys.stderr) if not os.path.isdir(SCRIPTS_TARGET): - print("""If you are installing pyspark from spark source, you must first - build Spark and run sdist. - - To build Spark with maven you can run: - ./build/mvn -DskipTests clean package - Building the source dist is done in the Python directory: - cd python - python setup.py sdist - pip install dist/*.tar.gz""", file=sys.stderr) + print(incorrect_invocation_message, file=sys.stderr) exit(-1) # Scripts directive requires a list of each script path and does not take wild cards. From 05fc25f3202ca9764dd6f3c744c213b3c0e2b93c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 14 Nov 2016 07:20:44 -0800 Subject: [PATCH 93/97] Switch to "s cause its easier to do that with sed rewrites --- python/pyspark/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/version.py b/python/pyspark/version.py index e853cbfcd2d3..08a301695fda 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = '2.1.0.dev1' +__version__ = "2.1.0.dev0" From dd243a2aeed82921efad484ea382a251002b1fa7 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 14 Nov 2016 07:23:48 -0800 Subject: [PATCH 94/97] Update release tagging script --- dev/create-release/release-tag.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index b7e5100ca740..88a897da9eb7 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -65,6 +65,7 @@ sed -i".tmp1" 's/Version.*$/Version: '"$RELEASE_VERSION"'/g' R/pkg/DESCRIPTION # Set the release version in docs sed -i".tmp1" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$RELEASE_VERSION"'/g' docs/_config.yml sed -i".tmp2" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$RELEASE_VERSION"'/g' docs/_config.yml +sed -i".tmp3" 's/__version__ = .*$/__version__ = "'"$RELEASE_VERSION"'"/' python/pyspark/version.py git commit -a -m "Preparing Spark release $RELEASE_TAG" echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" @@ -74,12 +75,16 @@ git tag $RELEASE_TAG $MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs # Remove -SNAPSHOT before setting the R version as R expects version strings to only have numbers R_NEXT_VERSION=`echo $NEXT_VERSION | sed 's/-SNAPSHOT//g'` -sed -i".tmp2" 's/Version.*$/Version: '"$R_NEXT_VERSION"'/g' R/pkg/DESCRIPTION +sed -i".tmp4" 's/Version.*$/Version: '"$R_NEXT_VERSION"'/g' R/pkg/DESCRIPTION +# Write out the NAME and VERSION to PySpark version info we use dev0 instead of snapshot to closer +# to PEP440. +sed -i".tmp5" 's/__version__ = .*$/__version__ = "'"$RELEASE_VERSION.dev0"'"/' python/pyspark/version.py + # Update docs with next version -sed -i".tmp3" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$NEXT_VERSION"'/g' docs/_config.yml +sed -i".tmp6" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$NEXT_VERSION"'/g' docs/_config.yml # Use R version for short version -sed -i".tmp4" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$R_NEXT_VERSION"'/g' docs/_config.yml +sed -i".tmp7" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$R_NEXT_VERSION"'/g' docs/_config.yml git commit -a -m "Preparing development version $NEXT_VERSION" From df5a3f92731329ca19f28ca52e6511258156268e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 14 Nov 2016 07:29:44 -0800 Subject: [PATCH 95/97] Drop the notice since the script does it now --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 63fbd48a5a75..8aa0a6c3caab 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,6 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT pom Spark Project Parent POM From d753d8094e5483e0da7577a85c0c2ed182de3e34 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 14 Nov 2016 09:38:08 -0800 Subject: [PATCH 96/97] Fix the next version output and update the comment to be more precise --- dev/create-release/release-tag.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index 88a897da9eb7..370a62ce15bc 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -76,9 +76,9 @@ $MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs # Remove -SNAPSHOT before setting the R version as R expects version strings to only have numbers R_NEXT_VERSION=`echo $NEXT_VERSION | sed 's/-SNAPSHOT//g'` sed -i".tmp4" 's/Version.*$/Version: '"$R_NEXT_VERSION"'/g' R/pkg/DESCRIPTION -# 
Write out the NAME and VERSION to PySpark version info we use dev0 instead of snapshot to closer +# Write out the R_NEXT_VERSION to PySpark version info we use dev0 instead of SNAPSHOT to be closer # to PEP440. -sed -i".tmp5" 's/__version__ = .*$/__version__ = "'"$RELEASE_VERSION.dev0"'"/' python/pyspark/version.py +sed -i".tmp5" 's/__version__ = .*$/__version__ = "'"$R_NEXT_VERSION.dev0"'"/' python/pyspark/version.py # Update docs with next version From e1398552469288de3829eb889fec0de2ba568f15 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 15 Nov 2016 07:45:59 -0800 Subject: [PATCH 97/97] Add a global-exclude and add a format to the setup.py for multiple assembly jar scala versions --- python/MANIFEST.in | 1 + python/setup.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 7df219ff735c..bbcce1baa439 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +global-exclude *.py[cod] __pycache__ .DS_Store recursive-include deps/jars *.jar graft deps/bin recursive-include deps/examples *.py diff --git a/python/setup.py b/python/setup.py index 1d526d2f913b..625aea04073f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -60,8 +60,8 @@ # Release mode puts the jars in a jars directory JARS_PATH = os.path.join(SPARK_HOME, "jars") elif len(JARS_PATH) > 1: - print("Assembly jars exist for multiple scalas, please cleanup assembly/target", - file=sys.stderr) + print("Assembly jars exist for multiple scalas ({0}), please cleanup assembly/target".format( + JARS_PATH), file=sys.stderr) sys.exit(-1) elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): print(incorrect_invocation_message, file=sys.stderr)
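
A minimal end-to-end sketch of the packaging flow the patches above converge on, assuming a Spark source checkout with assembly JARs for a single Scala version and a POSIX shell; the virtualenv path below is illustrative only, and the pip options simply mirror the ones used in dev/run-pip-tests, which automates these same steps across the available Python interpreters and both regular and editable installs:

    # Build the Spark JARs first (packaging PySpark requires them)
    ./build/mvn -DskipTests clean package

    # Create the pip-installable source distribution from the python directory
    cd python
    python setup.py sdist

    # Install the sdist into a scratch virtualenv and run the sanity checks
    virtualenv /tmp/pyspark-pip-test
    source /tmp/pyspark-pip-test/bin/activate
    pip install --upgrade --no-cache-dir --force-reinstall dist/pyspark-*.tar.gz
    spark-submit ../dev/pip-sanity-check.py   # via the pip-installed bin/ scripts
    python ../dev/pip-sanity-check.py         # import-based check

    # Or exercise the full test matrix instead
    ../dev/run-pip-tests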