8 changes: 4 additions & 4 deletions docs/streaming-programming-guide.md
@@ -1534,10 +1534,10 @@ See the full [source code]({{site.SPARK_GITHUB_URL}}/blob/v{{site.SPARK_VERSION_
# Lazily instantiated global instance of SparkSession
def getSparkSessionInstance(sparkConf):
if ("sparkSessionSingletonInstance" not in globals()):
globals()["sparkSessionSingletonInstance"] = SparkSession \
.builder \
.config(conf=sparkConf) \
.getOrCreate()
globals()["sparkSessionSingletonInstance"] = (SparkSession
Contributor Author: Replace the backslash syntax with the more elegant (and PEP 8-recommended) parenthesis syntax.

Member: OK. I don't feel qualified to judge that, but I'll take your word for it. However, do you really want to indent this so much?

Contributor Author: My point was about the use of parentheses instead of the backslash, which is recommended by PEP 8. I can keep the indentation.

Member (@HyukjinKwon, Feb 15, 2017): Maybe I am wrong; could you provide the reference?

"recommended by pep8"

Do you mean this passage?

"The preferred way of wrapping long lines is by using Python's implied line continuation inside parentheses, brackets and braces. Long lines can be broken over multiple lines by wrapping expressions in parentheses. These should be used in preference to using a backslash for line continuation."

I know the rule about binary operators follows this, but I guess this case is not disallowed. I am not sure it is worth sweeping through all of them. The parenthesized form looks preferred, but the old one does not break PEP 8; I mean, PEP 8 does not seem to discourage this line break.

.builder
.config(conf=sparkConf)
.getOrCreate())
return globals()["sparkSessionSingletonInstance"]

...
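For readers following the thread above, here is a small standalone sketch (illustrative only, not part of this diff) contrasting the two continuation styles; the app name is just a placeholder.

# Standalone sketch: backslash continuation vs. implied continuation in parentheses.
from pyspark.sql import SparkSession

# Backslash continuation -- the style this change replaces:
spark = SparkSession \
    .builder \
    .appName("ContinuationStyleDemo") \
    .getOrCreate()

# Implied line continuation inside parentheses -- the style PEP 8 prefers
# for wrapping long lines:
spark = (SparkSession
         .builder
         .appName("ContinuationStyleDemo")
         .getOrCreate())

spark.stop()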
24 changes: 13 additions & 11 deletions examples/src/main/python/als.py
@@ -26,8 +26,8 @@
import sys

import numpy as np
from numpy.random import rand
from numpy import matrix
from numpy.random import rand
from pyspark.sql import SparkSession

LAMBDA = 0.01 # regularization
@@ -62,10 +62,10 @@ def update(i, mat, ratings):
example. Please use pyspark.ml.recommendation.ALS for more
conventional use.""", file=sys.stderr)

spark = SparkSession\
.builder\
.appName("PythonALS")\
.getOrCreate()
spark = (SparkSession
Contributor Author: I have not changed all of these initialization lines, since most of the time they do not appear in the documentation.

.builder
.appName("PythonALS")
.getOrCreate())

sc = spark.sparkContext

@@ -87,17 +87,19 @@ def update(i, mat, ratings):
usb = sc.broadcast(us)

for i in range(ITERATIONS):
ms = sc.parallelize(range(M), partitions) \
.map(lambda x: update(x, usb.value, Rb.value)) \
.collect()
ms = (sc
.parallelize(range(M), partitions)
.map(lambda x: update(x, usb.value, Rb.value))
.collect())
# collect() returns a list, so array ends up being
# a 3-d array, we take the first 2 dims for the matrix
ms = matrix(np.array(ms)[:, :, 0])
msb = sc.broadcast(ms)

us = sc.parallelize(range(U), partitions) \
.map(lambda x: update(x, msb.value, Rb.value.T)) \
.collect()
us = (sc
.parallelize(range(U), partitions)
.map(lambda x: update(x, msb.value, Rb.value.T))
.collect())
us = matrix(np.array(us)[:, :, 0])
usb = sc.broadcast(us)

11 changes: 6 additions & 5 deletions examples/src/main/python/avro_inputformat.py
@@ -18,10 +18,11 @@
from __future__ import print_function

import sys

from functools import reduce

from pyspark.sql import SparkSession


"""
Read data file users.avro in local Spark distro:

Expand Down Expand Up @@ -65,10 +66,10 @@

path = sys.argv[1]

spark = SparkSession\
.builder\
.appName("AvroKeyInputFormat")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("AvroKeyInputFormat")
.getOrCreate())

sc = spark.sparkContext

8 changes: 4 additions & 4 deletions examples/src/main/python/kmeans.py
@@ -55,10 +55,10 @@ def closestPoint(p, centers):
as an example! Please refer to examples/src/main/python/ml/kmeans_example.py for an
example on how to use ML's KMeans implementation.""", file=sys.stderr)

spark = SparkSession\
.builder\
.appName("PythonKMeans")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("PythonKMeans")
.getOrCreate())

lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
data = lines.map(parseVector).cache()
16 changes: 9 additions & 7 deletions examples/src/main/python/logistic_regression.py
@@ -55,13 +55,15 @@ def readPointBatch(iterator):
Please refer to examples/src/main/python/ml/logistic_regression_with_elastic_net.py
to see how ML's implementation is used.""", file=sys.stderr)

spark = SparkSession\
.builder\
.appName("PythonLR")\
.getOrCreate()

points = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])\
.mapPartitions(readPointBatch).cache()
spark = (SparkSession
.builder
.appName("PythonLR")
.getOrCreate())
points = (spark.read
.text(sys.argv[1])
.rdd.map(lambda r: r[0])
.mapPartitions(readPointBatch)
.cache())
iterations = int(sys.argv[2])

# Initialize w to a random value
10 changes: 5 additions & 5 deletions examples/src/main/python/ml/aft_survival_regression.py
@@ -18,8 +18,8 @@
from __future__ import print_function

# $example on$
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import AFTSurvivalRegression
# $example off$
from pyspark.sql import SparkSession

@@ -30,10 +30,10 @@
"""

if __name__ == "__main__":
spark = SparkSession \
.builder \
.appName("AFTSurvivalRegressionExample") \
.getOrCreate()
spark = (SparkSession
.builder
.appName("AFTSurvivalRegressionExample")
.getOrCreate())

# $example on$
training = spark.createDataFrame([
17 changes: 9 additions & 8 deletions examples/src/main/python/ml/als_example.py
@@ -18,22 +18,23 @@
from __future__ import print_function

import sys
if sys.version >= '3':
long = int

from pyspark.sql import SparkSession

# $example on$
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# $example off$
from pyspark.sql import SparkSession

if sys.version >= '3':
long = int


if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("ALSExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("ALSExample")
.getOrCreate())

# $example on$
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
11 changes: 6 additions & 5 deletions examples/src/main/python/ml/binarizer_example.py
@@ -17,16 +17,17 @@

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$
from pyspark.sql import SparkSession


if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("BinarizerExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("BinarizerExample")
.getOrCreate())

# $example on$
continuousDataFrame = spark.createDataFrame([
8 changes: 4 additions & 4 deletions examples/src/main/python/ml/bisecting_k_means_example.py
@@ -29,10 +29,10 @@
"""

if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("BisectingKMeansExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("BisectingKMeansExample")
.getOrCreate())

# $example on$
# Loads data.
13 changes: 7 additions & 6 deletions examples/src/main/python/ml/bucketizer_example.py
@@ -17,16 +17,17 @@

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
# $example off$
from pyspark.sql import SparkSession


if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("BucketizerExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("BucketizerExample")
.getOrCreate())

# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
@@ -39,7 +40,7 @@
# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()
# $example off$

11 changes: 6 additions & 5 deletions examples/src/main/python/ml/chisq_selector_example.py
@@ -17,17 +17,18 @@

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession


if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("ChiSqSelectorExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("ChiSqSelectorExample")
.getOrCreate())

# $example on$
df = spark.createDataFrame([
21 changes: 12 additions & 9 deletions examples/src/main/python/ml/count_vectorizer_example.py
@@ -17,23 +17,26 @@

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$
from pyspark.sql import SparkSession


if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("CountVectorizerExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("CountVectorizerExample")
.getOrCreate())

# $example on$
# Input data: Each row is a bag of words with a ID.
df = spark.createDataFrame([
(0, "a b c".split(" ")),
(1, "a b b c a".split(" "))
], ["id", "words"])
df = spark.createDataFrame(
[
(0, "a b c".split(" ")),
(1, "a b b c a".split(" "))
],
Member: Could you double-check whether it really does not follow PEP 8? I have seen the removed syntax more often (e.g., in numpy).

Contributor Author: Indeed, this is a recommendation, not an obligation. I find it looks more like Scala multi-line code, and I prefer it. It is a personal opinion, and I don't think there is a pylint/pep8 check that prevents using it.

["id", "words"])

# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
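As a reference point for the thread above: PEP 8 accepts both argument layouts shown in this hunk, so the choice is stylistic. A small standalone sketch (illustrative only; it assumes an already-created SparkSession named spark):

# Standalone sketch (not from the diff); assumes `spark` is an active SparkSession.
# Layout 1: open the list on the same line as the call (the original style).
df1 = spark.createDataFrame([
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" "))
], ["id", "words"])

# Layout 2: each argument wrapped onto its own indented block (the style proposed here).
df2 = spark.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" "))
    ],
    ["id", "words"])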
38 changes: 20 additions & 18 deletions examples/src/main/python/ml/cross_validator.py
@@ -35,27 +35,29 @@
"""

if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("CrossValidatorExample")\
.getOrCreate()
spark = (SparkSession
.builder
.appName("CrossValidatorExample")
.getOrCreate())

# $example on$
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
(0, "a b c d e spark", 1.0),
(1, "b d", 0.0),
(2, "spark f g h", 1.0),
(3, "hadoop mapreduce", 0.0),
(4, "b spark who", 1.0),
(5, "g d a y", 0.0),
(6, "spark fly", 1.0),
(7, "was mapreduce", 0.0),
(8, "e spark program", 1.0),
(9, "a e c l", 0.0),
(10, "spark compile", 1.0),
(11, "hadoop software", 0.0)
], ["id", "text", "label"])
training = spark.createDataFrame(
[
Member: Is this really a PEP 8 recommendation or just a preference? I think the way it was is more consistent with other Spark code.

Contributor Author: I cannot remember if I did that manually or if it was done by the pep8 tool. I cannot work on it until next week.

Member: It'd be great if we had some references or quotes.

(0, "a b c d e spark", 1.0),
(1, "b d", 0.0),
(2, "spark f g h", 1.0),
(3, "hadoop mapreduce", 0.0),
(4, "b spark who", 1.0),
(5, "g d a y", 0.0),
(6, "spark fly", 1.0),
(7, "was mapreduce", 0.0),
(8, "e spark program", 1.0),
(9, "a e c l", 0.0),
(10, "spark compile", 1.0),
(11, "hadoop software", 0.0)
],
["id", "text", "label"])

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")