From ade614b09f2e9cc0d09f9d30d6cabc44d2000b93 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Mon, 16 May 2016 20:03:44 +0900
Subject: [PATCH 1/4] Make a Python example work with SparkSession

---
 .../main/python/ml/simple_params_example.py   | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
index 2d6d115d54d0..2173c92f974d 100644
--- a/examples/src/main/python/ml/simple_params_example.py
+++ b/examples/src/main/python/ml/simple_params_example.py
@@ -20,11 +20,10 @@
 import pprint
 import sys
 
-from pyspark import SparkContext
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.sql import SQLContext
+from pyspark.sql import Row, SparkSession
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -36,18 +35,20 @@
     if len(sys.argv) > 1:
         print("Usage: simple_params_example", file=sys.stderr)
         exit(1)
-    sc = SparkContext(appName="PythonSimpleParamsExample")
-    sqlContext = SQLContext(sc)
+    spark = SparkSession \
+        .builder \
+        .appName("SimpleParamsExample") \
+        .getOrCreate()
 
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
     # A LabeledPoint is an Object with two fields named label and features
     # and Spark SQL identifies these fields and creates the schema appropriately.
-    training = sc.parallelize([
+    training = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
         LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
         LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
-        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()
+        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))])
 
     # Create a LogisticRegression instance with maxIter = 10.
     # This instance is an Estimator.
@@ -70,7 +71,7 @@
 
     # We may alternatively specify parameters using a parameter map.
     # paramMap overrides all lr parameters set earlier.
-    paramMap = {lr.maxIter: 20, lr.thresholds: [0.45, 0.55], lr.probabilityCol: "myProbability"}
+    paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
 
     # Now learn a new model using the new parameters.
     model2 = lr.fit(training, paramMap)
@@ -78,10 +79,10 @@
     pprint.pprint(model2.extractParamMap())
 
     # prepare test data.
-    test = sc.parallelize([
+    test = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
         LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
-        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()
+        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))])
 
     # Make predictions on test data using the Transformer.transform() method.
     # LogisticRegressionModel.transform will only use the 'features' column.
@@ -95,4 +96,4 @@
         print("features=%s,label=%s -> prob=%s, prediction=%s"
               % (row.features, row.label, row.myProbability, row.prediction))
 
-    sc.stop()
+    spark.stop()
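For readers following the migration: the patch above replaces the SparkContext/SQLContext pair with the unified SparkSession entry point introduced in Spark 2.0, and builds the DataFrame directly instead of detouring through an RDD. A minimal sketch condensed from the patched example (the two-row dataset is only for illustration):

    # Condensed sketch of the SparkSession-based setup used in the patch above.
    from pyspark.ml.classification import LogisticRegression
    from pyspark.mllib.linalg import DenseVector
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("SimpleParamsExample").getOrCreate()

    # createDataFrame() infers the (label, features) schema from the
    # LabeledPoint fields, so the old sc.parallelize([...]).toDF() RDD
    # round-trip is no longer needed.
    training = spark.createDataFrame([
        LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
        LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0]))])

    lr = LogisticRegression(maxIter=10)
    model1 = lr.fit(training)

    spark.stop()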
From fc49d3013436a02d0b10b03f84f59598487e70b4 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Mon, 16 May 2016 20:15:16 +0900
Subject: [PATCH 2/4] Remove unused imports

---
 examples/src/main/python/ml/simple_params_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
index 2173c92f974d..d2986b4813e7 100644
--- a/examples/src/main/python/ml/simple_params_example.py
+++ b/examples/src/main/python/ml/simple_params_example.py
@@ -23,7 +23,7 @@
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.sql import Row, SparkSession
+from pyspark.sql import SparkSession
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.

From bb88635fc65ea53a21baba6a240dc726e8004b92 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Wed, 18 May 2016 13:58:55 +0900
Subject: [PATCH 3/4] Remove sys.argv checking

---
 examples/src/main/python/ml/simple_params_example.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
index d2986b4813e7..c57e59d01b54 100644
--- a/examples/src/main/python/ml/simple_params_example.py
+++ b/examples/src/main/python/ml/simple_params_example.py
@@ -32,9 +32,6 @@
 """
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        print("Usage: simple_params_example", file=sys.stderr)
-        exit(1)
     spark = SparkSession \
         .builder \
         .appName("SimpleParamsExample") \
         .getOrCreate()
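With the argument check gone, the example's remaining moving part is parameter handling. In PySpark, a plain dict mapping Param objects to values can be passed to Estimator.fit(), and those entries take precedence over values set on the estimator itself, which is what the patched example relies on. A minimal sketch, assuming lr and training as defined in the patched example above:

    # Sketch: a params dict passed to fit() overrides the estimator's own settings.
    lr.setMaxIter(10)
    paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}

    # Trains with maxIter=20 (the dict wins over setMaxIter(10)) and writes
    # probabilities to a column named "myProbability" instead of "probability".
    model2 = lr.fit(training, paramMap)
    print(model2.extractParamMap())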
From 9ec58e6368d848b90b94145a1bb1354587898d82 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Thu, 19 May 2016 12:53:46 +0900
Subject: [PATCH 4/4] Update thresholds in Java/Scala examples for consistency

---
 .../org/apache/spark/examples/ml/JavaSimpleParamsExample.java | 2 +-
 .../org/apache/spark/examples/ml/SimpleParamsExample.scala    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index ff1eb07dc605..ca80d0d8bba5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -77,7 +77,7 @@
     ParamMap paramMap = new ParamMap();
     paramMap.put(lr.maxIter().w(20));  // Specify 1 Param.
     paramMap.put(lr.maxIter(), 30);  // This overwrites the original maxIter.
-    double[] thresholds = {0.45, 0.55};
+    double[] thresholds = {0.5, 0.5};
     paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds));  // Specify multiple Params.
 
     // One can also combine ParamMaps.

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index 83bab5c55758..e447435b9d11 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -70,7 +71,7 @@
     // which supports several methods for specifying parameters.
     val paramMap = ParamMap(lr.maxIter -> 20)
     paramMap.put(lr.maxIter, 30)  // Specify 1 Param. This overwrites the original maxIter.
-    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55))  // Specify multiple Params.
+    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5))  // Specify multiple Params.
 
     // One can also combine ParamMaps.
     val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // Change output column name
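The switch from [0.45, 0.55] to [0.5, 0.5] is not cosmetic. For binary logistic regression, Spark documents the scalar threshold and the thresholds vector as two views of one setting, related by threshold = 1 / (1 + thresholds[0] / thresholds[1]), with prediction selecting the class that maximizes p[i] / thresholds[i]. Under that formula, [0.45, 0.55] silently moves the decision cutoff to 0.55, while [0.5, 0.5] keeps it at the default 0.5, so the Python, Java, and Scala examples now behave identically. A quick check (the helper name is illustrative):

    # Convert a binary `thresholds` pair to the equivalent scalar `threshold`,
    # per the formula documented on LogisticRegression.getThreshold.
    def to_threshold(thresholds):
        return 1.0 / (1.0 + thresholds[0] / thresholds[1])

    print(to_threshold([0.45, 0.55]))  # 0.55 -- shifted cutoff
    print(to_threshold([0.5, 0.5]))    # 0.5  -- matches the default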