Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions data/mllib/sample_multiclass_classification_data.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333
1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667
1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333
1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667
0 1:0.166667 2:-0.416667 3:0.457627 4:0.5
1 1:-0.833333 3:-0.864407 4:-0.916667
2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333
2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08
1 1:-0.5 2:0.75 3:-0.830508 4:-1
0 1:0.611111 3:0.694915 4:0.416667
0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333
1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1
1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667
2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08
2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25
2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333
1 1:-0.944444 3:-0.898305 4:-0.916667
2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667
0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667
2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333
0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667
1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667
2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08
0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667
2 1:0.166667 3:0.186441 4:0.166667
2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08
2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08
0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667
0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333
2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25
2 1:-0.111111 3:0.288136 4:0.416667
2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667
2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333
1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333
0 1:0.166667 2:-0.333333 3:0.559322 4:0.75
0 1:0.111111 2:-0.25 3:0.559322 4:0.416667
0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667
2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667
0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667
0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333
0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333
0 1:0.722222 2:-0.333333 3:0.728813 4:0.5
2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25
2 1:0.5 3:0.254237 4:0.0833333
0 1:0.111111 2:-0.583333 3:0.355932 4:0.5
1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667
2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08
0 1:0.666667 2:-0.25 3:0.79661 4:0.416667
0 1:0.111111 2:0.0833333 3:0.694915 4:1
0 1:0.444444 3:0.59322 4:0.833333
2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25
1 1:-0.833333 2:0.333333 3:-1 4:-0.916667
1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75
2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333
1 1:-1 2:-0.166667 3:-0.966102 4:-1
1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667
2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333
2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333
0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667
1 1:-0.777778 3:-0.79661 4:-0.916667
1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667
0 1:0.222222 2:-0.166667 3:0.627119 4:0.75
1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667
1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75
2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333
1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667
0 1:0.166667 3:0.457627 4:0.833333
2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667
0 1:0.111111 2:0.166667 3:0.559322 4:0.916667
1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333
0 1:0.388889 3:0.661017 4:0.833333
1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667
1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667
1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667
2 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25
2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25
2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667
0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
0 1:0.611111 2:0.333333 3:0.728813 4:1
2 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08
1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667
1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667
0 1:0.611111 2:-0.166667 3:0.627119 4:0.25
0 1:0.888889 2:0.5 3:0.932203 4:0.75
2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667
1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333
0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667
0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333
1 1:-0.611111 3:-0.932203 4:-0.916667
1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667
0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667
2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25
2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25
1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667
0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333
1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667
0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667
0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667
2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333
0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667
0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667
0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333
2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08
2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08
0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667
0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667
2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08
1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75
2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08
0 1:1 2:0.5 3:0.830508 4:0.583333
2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667
2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08
0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333
2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667
2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667
0 1:0.333333 2:0.0833333 3:0.59322 4:1
0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667
1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333
0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667
0 1:0.888889 2:-0.5 3:1 4:0.833333
1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75
2 1:0.111111 2:0.0833333 3:0.254237 4:0.25
0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333
1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667
0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667
2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08
1 1:-0.222222 2:1 3:-0.830508 4:-0.75
1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75
2 1:-0.611111 2:-1 3:-0.152542 4:-0.25
2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333
2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333
1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667
2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333
1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667
1 1:-0.777778 3:-0.898305 4:-0.916667
0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667
0 1:0.222222 3:0.38983 4:0.583333
2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667
2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667
0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333
1 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667
1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667
1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75
129 changes: 129 additions & 0 deletions docs/ml-ensembles.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
---
layout: global
title: Ensembles
displayTitle: <a href="ml-guide.html">ML</a> - Ensembles
---

**Table of Contents**

* This will become a table of contents (this text will be scraped).
{:toc}

An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning)
is a learning algorithm which creates a model composed of a set of other base models.
The Pipelines API supports the following ensemble algorithms: [`OneVsRest`](api/scala/index.html#org.apache.spark.ml.classification.OneVsRest).

## OneVsRest

[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently.

`OneVsRest` is implemented as an `Estimator`. For the base classifier it takes instances of `Classifier` and creates a binary classification problem for each of the k classes. The classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes.

Predictions are made by evaluating each binary classifier; the index of the most confident classifier is output as the label.

### Example

The example below demonstrates how to load the
[Iris dataset](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/iris.scale), parse it as a DataFrame and perform multiclass classification using `OneVsRest`. The confusion matrix and the per-label false positive rate are computed on the test set to measure the algorithm's accuracy.

<div class="codetabs">
<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{Row, SQLContext}

val sqlContext = new SQLContext(sc)
// bring the implicit RDD-to-DataFrame conversions into scope; toDF() below needs them
import sqlContext.implicits._

// parse data into dataframe
val data = MLUtils.loadLibSVMFile(sc,
  "data/mllib/sample_multiclass_classification_data.txt")
// randomly split the rows into training (weight 0.7) and test (weight 0.3) sets
val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3))

// instantiate multiclass learner and train
val ovr = new OneVsRest().setClassifier(new LogisticRegression)

val ovrModel = ovr.fit(train)

// score model on test data
val predictions = ovrModel.transform(test).select("prediction", "label")
// convert each row into a (prediction, label) pair for MulticlassMetrics
val predictionsAndLabels = predictions.map {case Row(p: Double, l: Double) => (p, l)}

// compute confusion matrix
val metrics = new MulticlassMetrics(predictionsAndLabels)
println(metrics.confusionMatrix)

// the Iris DataSet has three classes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you say "Iris" here, can you please say it in the example description too?

// number of distinct labels (0, 1 and 2) in the sample data file
val numClasses = 3

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for space

// print a header, then one "label<TAB>false positive rate" row per class
println("label\tfpr\n")
for (classIndex <- 0 until numClasses) {
  val classLabel = classIndex.toDouble
  val fpr = metrics.falsePositiveRate(classLabel)
  println(s"$classLabel\t$fpr")
}
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
{% highlight java %}

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.OneVsRest;
import org.apache.spark.ml.classification.OneVsRestModel;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

// create the Spark and SQL contexts
SparkConf sparkConf = new SparkConf().setAppName("JavaOneVsRestExample");
JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(sparkContext);

// load the LIBSVM-formatted sample data and wrap it in a DataFrame
String inputPath = "data/mllib/sample_multiclass_classification_data.txt";
RDD<LabeledPoint> labeledPoints = MLUtils.loadLibSVMFile(sparkContext.sc(), inputPath);
DataFrame labeledFrame = sqlContext.createDataFrame(labeledPoints, LabeledPoint.class);

// split into training (weight 0.7) and test (weight 0.3) sets
double[] weights = {0.7, 0.3};
DataFrame[] parts = labeledFrame.randomSplit(weights, 12345);
DataFrame train = parts[0];
DataFrame test = parts[1];

// configure One Vs Rest with logistic regression as the base classifier
LogisticRegression baseClassifier = new LogisticRegression();
OneVsRest ovr = new OneVsRest().setClassifier(baseClassifier);

// fit the multiclass model on the cached training set
OneVsRestModel ovrModel = ovr.fit(train.cache());

// score the test set and keep only the columns needed for evaluation
DataFrame scored = ovrModel.transform(test);
DataFrame predictions = scored.select("prediction", "label");

// build the evaluation metrics from the scored rows
MulticlassMetrics metrics = new MulticlassMetrics(predictions);
Matrix confusionMatrix = metrics.confusionMatrix();

// print the confusion matrix under its heading
System.out.println("Confusion Matrix");
System.out.println(confusionMatrix);

// print a header, then one "label<TAB>false positive rate" row per class
System.out.println();
System.out.println("label\tfpr\n");

// the Iris DataSet has three classes
int numClasses = 3;
for (int index = 0; index < numClasses; index++) {
  double label = index;
  System.out.println(label + "\t" + metrics.falsePositiveRate(label));
}
{% endhighlight %}
</div>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing another </div>

</div>
3 changes: 2 additions & 1 deletion docs/ml-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,12 @@ This is useful if there are two algorithms with the `maxIter` parameter in a `Pi

# Algorithm Guides

There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines.
There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here. These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in Pipelines.

**Pipelines API Algorithm Guides**

* [Feature Extraction, Transformation, and Selection](ml-features.html)
* [Ensembles](ml-ensembles.html)


# Code Examples
Expand Down