From 99b7672c16ead7b93ecf36dd47c3f83329e8035c Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Mon, 17 Aug 2015 12:26:38 -0700 Subject: [PATCH 1/5] Docs placeholder and link to it from ML guide --- docs/ml-ann.md | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ docs/ml-guide.md | 1 + 2 files changed, 84 insertions(+) create mode 100644 docs/ml-ann.md diff --git a/docs/ml-ann.md b/docs/ml-ann.md new file mode 100644 index 0000000000000..dd27b40b4969f --- /dev/null +++ b/docs/ml-ann.md @@ -0,0 +1,83 @@ +--- +layout: global +title: Multilayer perceptron classifier - ML +displayTitle: ML - Multilayer perceptron classifier +--- + + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + + +In MLlib, we implement MLP + +**Examples** + +
+ +
+ +{% highlight scala %} + +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import org.apache.spark.mllib.util.MLUtils + +// Load training data +val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + +{% endhighlight %} + +
+ +
+ +{% highlight java %} + +import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; +import org.apache.spark.mllib.util.MLUtils; + +public class Example { + public static void main(String[] args) { + SparkConf conf = new SparkConf() + .setAppName("Multilayer perceptron classifier"); + + SparkContext sc = new SparkContext(conf); + SQLContext sql = new SQLContext(sc); + String path = "sample_libsvm_data.txt"; + + // Load training data + DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); + + } +} +{% endhighlight %} +
+ +
+ +{% highlight python %} +Sorry, Python example not available yet + +{% endhighlight %} + +
+ +
+ +### Optimization + +The optimization. diff --git a/docs/ml-guide.md b/docs/ml-guide.md index c64fff7c0315a..de8fead3529e4 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -179,6 +179,7 @@ There are now several algorithms in the Pipelines API which are not in the lower * [Decision Trees for Classification and Regression](ml-decision-tree.html) * [Ensembles](ml-ensembles.html) * [Linear methods with elastic net regularization](ml-linear-methods.html) +* [Multilayer perceptron classifier](ml-ann.html) # Code Examples From 38927453b5ae131ec6d5edb6b76f122c4c49eaa7 Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Mon, 17 Aug 2015 17:31:41 -0700 Subject: [PATCH 2/5] MLPC description and Scala example --- docs/ml-ann.md | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/docs/ml-ann.md b/docs/ml-ann.md index dd27b40b4969f..2620b65a0ff56 100644 --- a/docs/ml-ann.md +++ b/docs/ml-ann.md @@ -23,7 +23,25 @@ displayTitle: ML - Multilayer perceptron classifier \]` -In MLlib, we implement MLP +Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). +MLPC consists of multiple layers of nodes. +Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to the outputs +by performing a linear combination of the inputs with the node's weights `$\wv$` and bias `$\bv$` and applying an activation function. 
+It can be written in matrix form for MLPC with `$K+1$` layers as follows: +`\[ +\mathrm{y}(\x) = \mathrm{f_K}(...\mathrm{f_2}(\wv_2^T\mathrm{f_1}(\wv_1^T \x+b_1)+b_2)...+b_K) +\]` +Nodes in intermediate layers use the sigmoid (logistic) function: +`\[ +\mathrm{f}(z_i) = \frac{1}{1 + e^{-z_i}} +\]` +Nodes in the output layer use the softmax function: +`\[ +\mathrm{f}(z_i) = \frac{e^{z_i}}{\sum_{k=1}^N e^{z_k}} +\]` +The number of nodes `$N$` in the output layer corresponds to the number of classes. + +MLPC employs backpropagation for learning the model. We use the logistic loss function for optimization and L-BFGS as the optimization routine. **Examples** @@ -35,10 +53,26 @@ In MLlib, we implement MLP import org.apache.spark.ml.classification.MultilayerPerceptronClassifier import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.Row // Load training data -val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() - +val data = MLUtils.loadLibSVMFile(sc, "c:/ulanov/dev/spark/data/mllib/sample_multiclass_classification_data.txt").toDF() +// Split the data into train and test +val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) +val train = splits(0) +val test = splits(1) +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +val layers = Array[Int](4, 5, 4, 3) +// create the trainer and set its parameters +val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(100) +// fit the model to the data +val model = trainer.fit(train) +val result = model.transform(test) +val predictionAndLabels = result.select("prediction", "label") +// compute accuracy on the test set +val accuracy = predictionAndLabels.map{ case Row(p: Double, l: Double) => if (p == l) 1 else 0}.sum / predictionAndLabels.count +println("Accuracy:" + accuracy) {% endhighlight %} @@ -70,14 +104,10 @@ public class Example { 
{% highlight python %} -Sorry, Python example not available yet +Sorry, Python example is not available yet {% endhighlight %}
- -### Optimization - -The optimization. From 46067c52d27071f391eff4696f5c39f1ea6609c9 Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Mon, 17 Aug 2015 18:17:05 -0700 Subject: [PATCH 3/5] Java example --- docs/ml-ann.md | 74 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/docs/ml-ann.md b/docs/ml-ann.md index 2620b65a0ff56..9df8925f4a88e 100644 --- a/docs/ml-ann.md +++ b/docs/ml-ann.md @@ -50,15 +50,15 @@ MLPC employes backpropagation for learning the model. We use logistic loss funct
{% highlight scala %} - import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.Row // Load training data -val data = MLUtils.loadLibSVMFile(sc, "c:/ulanov/dev/spark/data/mllib/sample_multiclass_classification_data.txt").toDF() +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt").toDF() // Split the data into train and test -val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) +val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L) val train = splits(0) val test = splits(1) // specify layers for the neural network: @@ -66,13 +66,13 @@ val test = splits(1) val layers = Array[Int](4, 5, 4, 3) // create the trainer and set its parameters val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(100) -// fit the model to the data +// train the model val model = trainer.fit(train) +// compute precision on the test set val result = model.transform(test) -val predictionAndLabels = result.select("prediction", "label") -// compute accuracy on the test set -val accuracy = predictionAndLabels.map{ case Row(p: Double, l: Double) => if (p == l) 1 else 0}.sum / predictionAndLabels.count -println("Accuracy:" + accuracy) +val predictionAndLabels = result.select("prediction", "label").map { case Row(p: Double, l: Double) => (p, l) } +val metrics = new MulticlassMetrics(predictionAndLabels) +println("Precision:" + metrics.precision) {% endhighlight %}
@@ -81,33 +81,49 @@ println("Accuracy:" + accuracy) {% highlight java %} +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; +import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; - -public class Example { - public static void main(String[] args) { - SparkConf conf = new SparkConf() - .setAppName("Multilayer perceptron classifier"); - - SparkContext sc = new SparkContext(conf); - SQLContext sql = new SQLContext(sc); - String path = "sample_libsvm_data.txt"; - - // Load training data - DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class); - - } +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; + +public class MultilayerPerceptronClassifierTest { + + public static void main( String[] args ) + { + SparkConf conf = new SparkConf().setAppName("Multilayer perceptron classifier example").setMaster("local"); + SparkContext sc = new SparkContext(conf); + SQLContext sql = new SQLContext(sc); + String path = "data/mllib/sample_multiclass_classification_data.txt"; + // Load training data + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + // Split the data into train and test + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 1234L); + DataFrame train = sql.createDataFrame(splits[0], LabeledPoint.class); + DataFrame test = sql.createDataFrame(splits[1], LabeledPoint.class); + // specify layers for the neural network: + // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) + int[] layers = new int[] {4, 5, 4, 3}; + // create the trainer and set its parameters + MultilayerPerceptronClassifier trainer = 
new MultilayerPerceptronClassifier(); + trainer.setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(100); + // train the model + MultilayerPerceptronClassificationModel model = trainer.fit(train); + // compute precision on the test set + DataFrame result = model.transform(test); + DataFrame predictionAndLabels = result.select("prediction", "label"); + MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels); + System.out.println("Precision = " + metrics.precision()); + } } -{% endhighlight %} - -
- -{% highlight python %} -Sorry, Python example is not available yet {% endhighlight %} -
From 192429b8cd46d1e4eeb158717d81343c5bdc43e0 Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Wed, 19 Aug 2015 15:40:51 -0700 Subject: [PATCH 4/5] Addressing reviewers comments --- docs/ml-ann.md | 78 +++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/docs/ml-ann.md b/docs/ml-ann.md index 9df8925f4a88e..e698d3f2d9138 100644 --- a/docs/ml-ann.md +++ b/docs/ml-ann.md @@ -51,7 +51,7 @@ MLPC employes backpropagation for learning the model. We use logistic loss funct {% highlight scala %} import org.apache.spark.ml.classification.MultilayerPerceptronClassifier -import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.Row @@ -65,14 +65,19 @@ val test = splits(1) // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) val layers = Array[Int](4, 5, 4, 3) // create the trainer and set its parameters -val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(100) +val trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100) // train the model val model = trainer.fit(train) // compute precision on the test set val result = model.transform(test) -val predictionAndLabels = result.select("prediction", "label").map { case Row(p: Double, l: Double) => (p, l) } -val metrics = new MulticlassMetrics(predictionAndLabels) -println("Precision:" + metrics.precision) +val predictionAndLabels = result.select("prediction", "label") +val evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision") +println("Precision:" + evaluator.evaluate(predictionAndLabels)) {% endhighlight %} @@ -80,49 +85,38 @@ println("Precision:" + metrics.precision)
{% highlight java %} - -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; - -public class MultilayerPerceptronClassifierTest { - - public static void main( String[] args ) - { - SparkConf conf = new SparkConf().setAppName("Multilayer perceptron classifier example").setMaster("local"); - SparkContext sc = new SparkContext(conf); - SQLContext sql = new SQLContext(sc); - String path = "data/mllib/sample_multiclass_classification_data.txt"; - // Load training data - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - // Split the data into train and test - JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 1234L); - DataFrame train = sql.createDataFrame(splits[0], LabeledPoint.class); - DataFrame test = sql.createDataFrame(splits[1], LabeledPoint.class); - // specify layers for the neural network: - // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) - int[] layers = new int[] {4, 5, 4, 3}; - // create the trainer and set its parameters - MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier(); - trainer.setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(100); - // train the model - MultilayerPerceptronClassificationModel model = trainer.fit(train); - // compute precision on the test set - DataFrame result = model.transform(test); - DataFrame predictionAndLabels = result.select("prediction", "label"); - MulticlassMetrics metrics = new 
MulticlassMetrics(predictionAndLabels); - System.out.println("Precision = " + metrics.precision()); - } -} - +// Load training data +String path = "data/mllib/sample_multiclass_classification_data.txt"; +JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); +DataFrame dataFrame = sqlContext.createDataFrame(data, LabeledPoint.class); +// Split the data into train and test +DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); +DataFrame train = splits[0]; +DataFrame test = splits[1]; +// specify layers for the neural network: +// input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) +int[] layers = new int[] {4, 5, 4, 3}; +// create the trainer and set its parameters +MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier(); +trainer.setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100); +// train the model +MultilayerPerceptronClassificationModel model = trainer.fit(train); +// compute precision on the test set +DataFrame result = model.transform(test); +DataFrame predictionAndLabels = result.select("prediction", "label"); +MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision"); +System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); {% endhighlight %}
From 07ada2bda4d1c9956611a55f9d274b3227483349 Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Thu, 20 Aug 2015 16:35:16 -0700 Subject: [PATCH 5/5] Addressing reviewers comments --- docs/ml-ann.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/ml-ann.md b/docs/ml-ann.md index e698d3f2d9138..d5ddd92af1e96 100644 --- a/docs/ml-ann.md +++ b/docs/ml-ann.md @@ -66,17 +66,17 @@ val test = splits(1) val layers = Array[Int](4, 5, 4, 3) // create the trainer and set its parameters val trainer = new MultilayerPerceptronClassifier() - .setLayers(layers) - .setBlockSize(128) - .setSeed(1234L) - .setMaxIter(100) + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100) // train the model val model = trainer.fit(train) // compute precision on the test set val result = model.transform(test) val predictionAndLabels = result.select("prediction", "label") val evaluator = new MulticlassClassificationEvaluator() - .setMetricName("precision") + .setMetricName("precision") println("Precision:" + evaluator.evaluate(predictionAndLabels)) {% endhighlight %} @@ -104,18 +104,18 @@ DataFrame test = splits[1]; // input layer of size 4 (features), two intermediate of size 5 and 4 and output of size 3 (classes) int[] layers = new int[] {4, 5, 4, 3}; // create the trainer and set its parameters -MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier(); -trainer.setLayers(layers) - .setBlockSize(128) - .setSeed(1234L) - .setMaxIter(100); +MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(128) + .setSeed(1234L) + .setMaxIter(100); // train the model MultilayerPerceptronClassificationModel model = trainer.fit(train); // compute precision on the test set DataFrame result = model.transform(test); DataFrame predictionAndLabels = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new 
MulticlassClassificationEvaluator() - .setMetricName("precision"); + .setMetricName("precision"); System.out.println("Precision = " + evaluator.evaluate(predictionAndLabels)); {% endhighlight %}