From 1290ff83175b9724e3f205cb4ac356d017ddf1ca Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 24 Apr 2017 16:50:37 -0400 Subject: [PATCH 01/11] update v7 --- .../classification/LogisticRegression.scala | 319 +++++++++++++----- .../LogisticRegressionSuite.scala | 99 ++++-- .../ml/regression/LinearRegressionSuite.scala | 2 +- project/MimaExcludes.scala | 20 +- 4 files changed, 337 insertions(+), 103 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 21957d94e2dc..1d09b16a035e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -35,7 +35,7 @@ import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils @@ -882,21 +882,28 @@ class LogisticRegression @Since("1.2.0") ( val model = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, numClasses, isMultinomial)) - // TODO: implement summary model for multinomial case - val m = if (!isMultinomial) { - val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() - val logRegSummary = new BinaryLogisticRegressionTrainingSummary( + + val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel() + val logRegSummary = if (numClasses <= 2) { + new BinaryLogisticRegressionTrainingSummaryImpl( summaryModel.transform(dataset), probabilityColName, + predictionColName, $(labelCol), $(featuresCol), objectiveHistory) - model.setSummary(Some(logRegSummary)) } else { - model + new LogisticRegressionTrainingSummaryImpl( + summaryModel.transform(dataset), + probabilityColName, + predictionColName, + $(labelCol), + $(featuresCol), + objectiveHistory) } - instr.logSuccess(m) - m + model.setSummary(Some(logRegSummary)) + instr.logSuccess(model) + model } @Since("1.4.0") @@ -1018,19 +1025,33 @@ class LogisticRegressionModel private[spark] ( throw new SparkException("No training summary available for this LogisticRegressionModel") } + @Since("2.2.0") + def binarySummary: BinaryLogisticRegressionTrainingSummary = summary match { + case b: BinaryLogisticRegressionTrainingSummary => b + case _ => + throw new RuntimeException("Cannot create a binary summary for a non-binary model" + + s"(numClasses=${numClasses}), use summary instead.") + } + /** - * If the probability column is set returns the current model and probability column, - * otherwise generates a new column and sets it as the probability column on a new copy - * of the current model. 
+ * If the probability and prediction columns are set, this method returns the current model, + * otherwise it generates new columns for them and sets them as columns on a new copy of + * the current model */ - private[classification] def findSummaryModelAndProbabilityCol(): - (LogisticRegressionModel, String) = { - $(probabilityCol) match { - case "" => - val probabilityColName = "probability_" + java.util.UUID.randomUUID.toString - (copy(ParamMap.empty).setProbabilityCol(probabilityColName), probabilityColName) - case p => (this, p) + private[classification] def findSummaryModel(): + (LogisticRegressionModel, String, String) = { + val model = if ($(probabilityCol).isEmpty && $(predictionCol).isEmpty) { + copy(ParamMap.empty) + .setProbabilityCol("probability_" + java.util.UUID.randomUUID.toString) + .setPredictionCol("prediction_" + java.util.UUID.randomUUID.toString) + } else if ($(probabilityCol).isEmpty) { + copy(ParamMap.empty).setProbabilityCol("probability_" + java.util.UUID.randomUUID.toString) + } else if ($(predictionCol).isEmpty) { + copy(ParamMap.empty).setPredictionCol("prediction_" + java.util.UUID.randomUUID.toString) + } else { + this } + (model, model.getProbabilityCol, model.getPredictionCol) } private[classification] @@ -1051,9 +1072,14 @@ class LogisticRegressionModel private[spark] ( @Since("2.0.0") def evaluate(dataset: Dataset[_]): LogisticRegressionSummary = { // Handle possible missing or invalid prediction columns - val (summaryModel, probabilityColName) = findSummaryModelAndProbabilityCol() - new BinaryLogisticRegressionSummary(summaryModel.transform(dataset), - probabilityColName, $(labelCol), $(featuresCol)) + val (summaryModel, probabilityColName, predictionColName) = findSummaryModel() + if (numClasses > 2) { + new LogisticRegressionSummaryImpl(summaryModel.transform(dataset), + probabilityColName, predictionColName, $(labelCol), $(featuresCol)) + } else { + new BinaryLogisticRegressionSummaryImpl(summaryModel.transform(dataset), + probabilityColName, predictionColName, $(labelCol), $(featuresCol)) + } } /** @@ -1324,90 +1350,128 @@ private[ml] class MultiClassSummarizer extends Serializable { } /** - * Abstraction for multinomial Logistic Regression Training results. - * Currently, the training summary ignores the training weights except - * for the objective trace. - */ -sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { - - /** objective function (scaled loss + regularization) at each iteration. */ - def objectiveHistory: Array[Double] - - /** Number of training iterations until termination */ - def totalIterations: Int = objectiveHistory.length - -} - -/** - * Abstraction for Logistic Regression Results for a given model. + * Abstraction for logistic regression results for a given model. */ sealed trait LogisticRegressionSummary extends Serializable { /** * Dataframe output by the model's `transform` method. */ + @Since("2.2.0") def predictions: DataFrame /** Field in "predictions" which gives the probability of each class as a vector. */ + @Since("2.2.0") def probabilityCol: String + /** Field in "predictions" which gives the prediction of each class. */ + @Since("2.2.0") + def predictionCol: String + /** Field in "predictions" which gives the true label of each instance (if available). */ + @Since("2.2.0") def labelCol: String /** Field in "predictions" which gives the features of each instance as a vector. 
*/ + @Since("2.2.0") def featuresCol: String + @transient private val multiclassMetrics = { + new MulticlassMetrics( + predictions.select( + col(predictionCol), + col(labelCol).cast(DoubleType)) + .rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) }) + } + + /** Returns true positive rate for each label. */ + @Since("2.2.0") + def truePositiveRateByLabel: Array[Double] = recallByLabel + + /** Returns false positive rate for each label. */ + @Since("2.2.0") + def falsePositiveRateByLabel: Array[Double] = { + multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label)) + } + + /** Returns precision for each label. */ + @Since("2.2.0") + def precisionByLabel: Array[Double] = { + multiclassMetrics.labels.map(label => multiclassMetrics.precision(label)) + } + + /** Returns recall for each label. */ + @Since("2.2.0") + def recallByLabel: Array[Double] = { + multiclassMetrics.labels.map(label => multiclassMetrics.recall(label)) + } + + /** + * Returns f-measure for each label. + */ + @Since("2.2.0") + def fMeasureByLabel(beta: Double): Array[Double] = { + multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta)) + } + + /** Returns f1-measure for each label. */ + @Since("2.2.0") + def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0) + + /** Returns accuracy. */ + @Since("2.2.0") + def accuracy: Double = multiclassMetrics.accuracy + + /** Returns weighted true positive rate. */ + @Since("2.2.0") + def weightedTruePositiveRate: Double = weightedRecall + + /** Returns weighted false positive rate. */ + @Since("2.2.0") + def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate + + /** Returns weighted averaged recall. */ + @Since("2.2.0") + def weightedRecall: Double = multiclassMetrics.weightedRecall + + /** Returns weighted averaged precision. */ + @Since("2.2.0") + def weightedPrecision: Double = multiclassMetrics.weightedPrecision + + /** + * Returns weighted averaged f-measure. + */ + @Since("2.2.0") + def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta) + + /** Returns weighted averaged f1-measure. */ + @Since("2.2.0") + def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) } /** - * :: Experimental :: - * Logistic regression training results. - * - * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of - * each class as a vector. - * @param labelCol field in "predictions" which gives the true label of each instance. - * @param featuresCol field in "predictions" which gives the features of each instance as a vector. - * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. + * Abstraction for multiclass logistic regression training results. + * Currently, the training summary ignores the training weights except + * for the objective trace. */ -@Experimental -@Since("1.5.0") -class BinaryLogisticRegressionTrainingSummary private[classification] ( - predictions: DataFrame, - probabilityCol: String, - labelCol: String, - featuresCol: String, - @Since("1.5.0") val objectiveHistory: Array[Double]) - extends BinaryLogisticRegressionSummary(predictions, probabilityCol, labelCol, featuresCol) - with LogisticRegressionTrainingSummary { +sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { + + /** objective function (scaled loss + regularization) at each iteration. 
*/ + def objectiveHistory: Array[Double] + + /** Number of training iterations. */ + def totalIterations: Int = objectiveHistory.length } /** - * :: Experimental :: - * Binary Logistic regression results for a given model. - * - * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of - * each class as a vector. - * @param labelCol field in "predictions" which gives the true label of each instance. - * @param featuresCol field in "predictions" which gives the features of each instance as a vector. + * Abstraction for binary logistic regression results for a given model. */ -@Experimental -@Since("1.5.0") -class BinaryLogisticRegressionSummary private[classification] ( - @Since("1.5.0") @transient override val predictions: DataFrame, - @Since("1.5.0") override val probabilityCol: String, - @Since("1.5.0") override val labelCol: String, - @Since("1.6.0") override val featuresCol: String) extends LogisticRegressionSummary { - +sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary { private val sparkSession = predictions.sparkSession import sparkSession.implicits._ - /** - * Returns a BinaryClassificationMetrics object. - */ // TODO: Allow the user to vary the number of bins using a setBins method in // BinaryClassificationMetrics. For now the default is set to 100. @transient private val binaryMetrics = new BinaryClassificationMetrics( @@ -1484,3 +1548,106 @@ class BinaryLogisticRegressionSummary private[classification] ( binaryMetrics.recallByThreshold().toDF("threshold", "recall") } } + +sealed trait BinaryLogisticRegressionTrainingSummary extends BinaryLogisticRegressionSummary + with LogisticRegressionTrainingSummary + +/** + * :: Experimental :: + * Multiclass logistic regression training results. + * + * @param predictions dataframe output by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. + * @param predictionCol field in "predictions" which gives the prediction for a data instance as a + * double. + * @param labelCol field in "predictions" which gives the true label of each instance. + * @param featuresCol field in "predictions" which gives the features of each instance as a vector. + * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. + */ +@Experimental +@Since("2.2.0") +private class LogisticRegressionTrainingSummaryImpl( + override val predictions: DataFrame, + override val probabilityCol: String, + override val predictionCol: String, + override val labelCol: String, + override val featuresCol: String, + @Since("1.5.0") val objectiveHistory: Array[Double]) + extends LogisticRegressionSummaryImpl( + predictions, probabilityCol, predictionCol, labelCol, featuresCol) + with LogisticRegressionTrainingSummary { + +} + +/** + * :: Experimental :: + * Multiclass Logistic regression results for a given model. + * + * @param predictions dataframe output by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. + * @param predictionCol field in "predictions" which gives the prediction for a data instance as a + * double. + * @param labelCol field in "predictions" which gives the true label of each instance. + * @param featuresCol field in "predictions" which gives the features of each instance as a vector. 
+ */ +@Experimental +@Since("2.2.0") +private class LogisticRegressionSummaryImpl( + @Since("2.2.0") @transient override val predictions: DataFrame, + @Since("2.2.0") override val probabilityCol: String, + @Since("2.2.0") override val predictionCol: String, + @Since("2.2.0") override val labelCol: String, + @Since("2.2.0") override val featuresCol: String) + extends LogisticRegressionSummary + +/** + * :: Experimental :: + * Binary logistic regression training results. + * + * @param predictions dataframe output by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. + * @param predictionCol field in "predictions" which gives the prediction for a data instance as a + * double. + * @param labelCol field in "predictions" which gives the true label of each instance. + * @param featuresCol field in "predictions" which gives the features of each instance as a vector. + * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. + */ +@Experimental +@Since("2.2.0") +private class BinaryLogisticRegressionTrainingSummaryImpl( + override val predictions: DataFrame, + override val probabilityCol: String, + override val predictionCol: String, + override val labelCol: String, + override val featuresCol: String, + @Since("1.5.0") override val objectiveHistory: Array[Double]) + extends BinaryLogisticRegressionSummaryImpl( + predictions, probabilityCol, predictionCol, labelCol, featuresCol) + with BinaryLogisticRegressionTrainingSummary + +/** + * :: Experimental :: + * Binary logistic regression results for a given model. + * + * @param predictions dataframe output by the model's `transform` method. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. + * @param predictionCol field in "predictions" which gives the prediction of + * each class as a double. + * @param labelCol field in "predictions" which gives the true label of each instance. + * @param featuresCol field in "predictions" which gives the features of each instance as a vector. + */ +@Experimental +@Since("2.2.0") +private class BinaryLogisticRegressionSummaryImpl( + @Since("2.2.0") @transient override val predictions: DataFrame, + @Since("2.2.0") override val probabilityCol: String, + @Since("2.2.0") override val predictionCol: String, + @Since("2.2.0") override val labelCol: String, + @Since("2.2.0") override val featuresCol: String) + extends LogisticRegressionSummaryImpl( + predictions, probabilityCol, predictionCol, labelCol, featuresCol) + with BinaryLogisticRegressionSummary diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 0570499e7451..cc701dfdeb98 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -2263,51 +2263,98 @@ class LogisticRegressionSuite } test("evaluate on test set") { - // TODO: add for multiclass when model summary becomes available // Evaluate on test set should be same as that of the transformed training data. 
val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(1.0) .setThreshold(0.6) - val model = lr.fit(smallBinaryDataset) - val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] - - val sameSummary = - model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary] - assert(summary.areaUnderROC === sameSummary.areaUnderROC) - assert(summary.roc.collect() === sameSummary.roc.collect()) - assert(summary.pr.collect === sameSummary.pr.collect()) + .setFamily("binomial") + val blorModel = lr.fit(smallBinaryDataset) + val blorSummary = blorModel.binarySummary + + val sameBlorSummary = + blorModel.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary] + assert(blorSummary.areaUnderROC === sameBlorSummary.areaUnderROC) + assert(blorSummary.roc.collect() === sameBlorSummary.roc.collect()) + assert(blorSummary.pr.collect === sameBlorSummary.pr.collect()) assert( - summary.fMeasureByThreshold.collect() === sameSummary.fMeasureByThreshold.collect()) - assert(summary.recallByThreshold.collect() === sameSummary.recallByThreshold.collect()) + blorSummary.fMeasureByThreshold.collect() === sameBlorSummary.fMeasureByThreshold.collect()) assert( - summary.precisionByThreshold.collect() === sameSummary.precisionByThreshold.collect()) + blorSummary.recallByThreshold.collect() === sameBlorSummary.recallByThreshold.collect()) + assert( + blorSummary.precisionByThreshold.collect() === sameBlorSummary.precisionByThreshold.collect()) + + lr.setFamily("multinomial") + val mlorModel = lr.fit(smallMultinomialDataset) + val mlorSummary = mlorModel.summary + + val mlorSameSummary = mlorModel.evaluate(smallMultinomialDataset) + + assert(mlorSummary.truePositiveRateByLabel === mlorSameSummary.truePositiveRateByLabel) + assert(mlorSummary.falsePositiveRateByLabel === mlorSameSummary.falsePositiveRateByLabel) + assert(mlorSummary.precisionByLabel === mlorSameSummary.precisionByLabel) + assert(mlorSummary.recallByLabel === mlorSameSummary.recallByLabel) + assert(mlorSummary.fMeasureByLabel === mlorSameSummary.fMeasureByLabel) + assert(mlorSummary.accuracy === mlorSameSummary.accuracy) + assert(mlorSummary.weightedTruePositiveRate === mlorSameSummary.weightedTruePositiveRate) + assert(mlorSummary.weightedFalsePositiveRate === mlorSameSummary.weightedFalsePositiveRate) + assert(mlorSummary.weightedPrecision === mlorSameSummary.weightedPrecision) + assert(mlorSummary.weightedRecall === mlorSameSummary.weightedRecall) + assert(mlorSummary.weightedFMeasure === mlorSameSummary.weightedFMeasure) } test("evaluate with labels that are not doubles") { // Evaluate a test set with Label that is a numeric type other than Double - val lr = new LogisticRegression() + val blor = new LogisticRegression() .setMaxIter(1) .setRegParam(1.0) - val model = lr.fit(smallBinaryDataset) - val summary = model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary] + .setFamily("binomial") + val blorModel = blor.fit(smallBinaryDataset) + val blorSummary = blorModel.evaluate(smallBinaryDataset) + .asInstanceOf[BinaryLogisticRegressionSummary] + + val blorLongLabelData = smallBinaryDataset.select(col(blorModel.getLabelCol).cast(LongType), + col(blorModel.getFeaturesCol)) + val blorLongSummary = blorModel.evaluate(blorLongLabelData) + .asInstanceOf[BinaryLogisticRegressionSummary] + + assert(blorSummary.areaUnderROC === blorLongSummary.areaUnderROC) + + val mlor = new LogisticRegression() + .setMaxIter(1) + .setRegParam(1.0) + .setFamily("multinomial") + val mlorModel = 
mlor.fit(smallMultinomialDataset) + val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) - val longLabelData = smallBinaryDataset.select(col(model.getLabelCol).cast(LongType), - col(model.getFeaturesCol)) - val longSummary = model.evaluate(longLabelData).asInstanceOf[BinaryLogisticRegressionSummary] + val mlorLongLabelData = smallMultinomialDataset.select( + col(mlorModel.getLabelCol).cast(LongType), + col(mlorModel.getFeaturesCol)) + val mlorLongSummary = mlorModel.evaluate(mlorLongLabelData) - assert(summary.areaUnderROC === longSummary.areaUnderROC) + assert(mlorSummary.accuracy === mlorLongSummary.accuracy) } test("statistics on training data") { // Test that loss is monotonically decreasing. - val lr = new LogisticRegression() + val blor = new LogisticRegression() .setMaxIter(10) .setRegParam(1.0) - .setThreshold(0.6) - val model = lr.fit(smallBinaryDataset) + .setFamily("binomial") + val blorModel = blor.fit(smallBinaryDataset) + assert( + blorModel.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + + val mlor = new LogisticRegression() + .setMaxIter(10) + .setRegParam(1.0) + .setFamily("multinomial") + val mlorModel = mlor.fit(smallMultinomialDataset) assert( - model.summary + mlorModel.summary .objectiveHistory .sliding(2) .forall(x => x(0) >= x(1))) @@ -2392,7 +2439,7 @@ class LogisticRegressionSuite predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => assert(p1 === p2) } - // TODO: check that it converges in a single iteration when model summary is available + assert(model4.summary.totalIterations === 1) } test("binary logistic regression with all labels the same") { @@ -2453,6 +2500,7 @@ class LogisticRegressionSuite assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) assert(pred === 4.0) } + assert(model.summary.totalIterations === 0) // force the model to be trained with only one class val constantZeroData = Seq( @@ -2466,6 +2514,7 @@ class LogisticRegressionSuite assert(prob === Vectors.dense(Array(1.0))) assert(pred === 0.0) } + assert(modelZeroLabel.summary.totalIterations > 0) // ensure that the correct value is predicted when numClasses passed through metadata val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() @@ -2479,7 +2528,7 @@ class LogisticRegressionSuite assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) assert(pred === 4.0) } - // TODO: check num iters is zero when it become available in the model + require(modelWithMetadata.summary.totalIterations === 0) } test("compressed storage for constant label") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index e7bd4eb9e0ad..f470dca7dbd0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -715,7 +715,7 @@ class LinearRegressionSuite assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) // Residuals in [[LinearRegressionResults]] should equal those manually computed - val expectedResiduals = datasetWithDenseFeature.select("features", "label") + datasetWithDenseFeature.select("features", "label") .rdd .map { case Row(features: DenseVector, label: Double) => val prediction = diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9bda917377c2..10a958b7464b 100644 --- 
a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -115,7 +115,25 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseMatrix"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseMatrix"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes") + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes"), + + // [SPARK-17139] Add model summary for MultinomialLogisticRegression + ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary"), + ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.predictionCol"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.truePositiveRateByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.falsePositiveRateByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.precisionByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.recallByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.fMeasureByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.accuracy"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedTruePositiveRate"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFalsePositiveRate"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedRecall"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedPrecision"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFMeasure"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$_setter_$org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics_=") ) ++ Seq( // [SPARK-17019] Expose on-heap and off-heap memory usage in various places ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded.copy"), From 17272033e30a9c8d754d51442b9d206396d60d15 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 24 Apr 2017 16:55:24 -0400 Subject: [PATCH 02/11] fix nits --- .../classification/LogisticRegression.scala | 6 +- 
.../LogisticRegressionSuite.scala | 60 ++++++++++++++++--- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 1d09b16a035e..2f0118a28596 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1576,13 +1576,11 @@ private class LogisticRegressionTrainingSummaryImpl( @Since("1.5.0") val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) - with LogisticRegressionTrainingSummary { - -} + with LogisticRegressionTrainingSummary /** * :: Experimental :: - * Multiclass Logistic regression results for a given model. + * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. * @param probabilityCol field in "predictions" which gives the probability of diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index cc701dfdeb98..a1e32f33ba88 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -199,15 +199,57 @@ class LogisticRegressionSuite } } - test("empty probabilityCol") { - val lr = new LogisticRegression().setProbabilityCol("") - val model = lr.fit(smallBinaryDataset) - assert(model.hasSummary) - // Validate that we re-insert a probability column for evaluation - val fieldNames = model.summary.predictions.schema.fieldNames - assert(smallBinaryDataset.schema.fieldNames.toSet.subsetOf( - fieldNames.toSet)) - assert(fieldNames.exists(s => s.startsWith("probability_"))) + test("empty probabilityCol or predictionCol") { + val lr = new LogisticRegression().setMaxIter(1) + val datasetFieldNames = smallBinaryDataset.schema.fieldNames.toSet + def checkSummarySchema(model: LogisticRegressionModel, columns: Seq[String]): Unit = { + val fieldNames = model.summary.predictions.schema.fieldNames + assert(model.hasSummary) + assert(datasetFieldNames.subsetOf(fieldNames.toSet)) + columns.foreach { c => assert(fieldNames.exists(_.startsWith(c))) } + } + // check that the summary model adds the appropriate columns + Seq(("binomial", smallBinaryDataset), ("multinomial", smallMultinomialDataset)).foreach { + case (family, dataset) => + lr.setFamily(family) + lr.setProbabilityCol("").setPredictionCol("prediction") + val modelNoProb = lr.fit(smallBinaryDataset) + checkSummarySchema(modelNoProb, Seq("probability_")) + + lr.setProbabilityCol("probability").setPredictionCol("") + val modelNoPred = lr.fit(smallBinaryDataset) + checkSummarySchema(modelNoPred, Seq("prediction_")) + + lr.setProbabilityCol("").setPredictionCol("") + val modelNoPredNoProb = lr.fit(smallBinaryDataset) + checkSummarySchema(modelNoPredNoProb, Seq("prediction_", "probability_")) + } + } + + test("check summary types for binary and multiclass") { + val lr = new LogisticRegression() + .setFamily("binomial") + + val blorModel = lr.fit(smallBinaryDataset) + assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + 
assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + + val mlorModel = lr.setFamily("multinomial").fit(smallMultinomialDataset) + assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummaryImpl]) + withClue("cannot get binary summary for multiclass model") { + intercept[RuntimeException] { + mlorModel.binarySummary + } + } + + val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) + assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + + val blorSummary = blorModel.evaluate(smallBinaryDataset) + val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) + assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummaryImpl]) + assert(mlorSummary.isInstanceOf[LogisticRegressionSummaryImpl]) } test("setThreshold, getThreshold") { From a96dc54a5743876939e2773d2ad3fe8ccc9b8cd1 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 8 May 2017 15:29:29 -0400 Subject: [PATCH 03/11] add since tags2 --- .../classification/LogisticRegression.scala | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2f0118a28596..65f49c22d221 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1456,9 +1456,11 @@ sealed trait LogisticRegressionSummary extends Serializable { */ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { + @Since("1.5.0") /** objective function (scaled loss + regularization) at each iteration. */ def objectiveHistory: Array[Double] + @Since("1.5.0") /** Number of training iterations. */ def totalIterations: Int = objectiveHistory.length @@ -1565,15 +1567,13 @@ sealed trait BinaryLogisticRegressionTrainingSummary extends BinaryLogisticRegre * @param featuresCol field in "predictions" which gives the features of each instance as a vector. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ -@Experimental -@Since("2.2.0") private class LogisticRegressionTrainingSummaryImpl( override val predictions: DataFrame, override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, - @Since("1.5.0") val objectiveHistory: Array[Double]) + val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with LogisticRegressionTrainingSummary @@ -1590,14 +1590,12 @@ private class LogisticRegressionTrainingSummaryImpl( * @param labelCol field in "predictions" which gives the true label of each instance. * @param featuresCol field in "predictions" which gives the features of each instance as a vector. 
*/ -@Experimental -@Since("2.2.0") private class LogisticRegressionSummaryImpl( - @Since("2.2.0") @transient override val predictions: DataFrame, - @Since("2.2.0") override val probabilityCol: String, - @Since("2.2.0") override val predictionCol: String, - @Since("2.2.0") override val labelCol: String, - @Since("2.2.0") override val featuresCol: String) + @transient override val predictions: DataFrame, + override val probabilityCol: String, + override val predictionCol: String, + override val labelCol: String, + override val featuresCol: String) extends LogisticRegressionSummary /** @@ -1613,15 +1611,13 @@ private class LogisticRegressionSummaryImpl( * @param featuresCol field in "predictions" which gives the features of each instance as a vector. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ -@Experimental -@Since("2.2.0") private class BinaryLogisticRegressionTrainingSummaryImpl( override val predictions: DataFrame, override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, - @Since("1.5.0") override val objectiveHistory: Array[Double]) + override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with BinaryLogisticRegressionTrainingSummary @@ -1638,14 +1634,12 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( * @param labelCol field in "predictions" which gives the true label of each instance. * @param featuresCol field in "predictions" which gives the features of each instance as a vector. */ -@Experimental -@Since("2.2.0") private class BinaryLogisticRegressionSummaryImpl( - @Since("2.2.0") @transient override val predictions: DataFrame, - @Since("2.2.0") override val probabilityCol: String, - @Since("2.2.0") override val predictionCol: String, - @Since("2.2.0") override val labelCol: String, - @Since("2.2.0") override val featuresCol: String) + @transient override val predictions: DataFrame, + override val probabilityCol: String, + override val predictionCol: String, + override val labelCol: String, + override val featuresCol: String) extends LogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with BinaryLogisticRegressionSummary From 3c4b995a325d277fa46fefec0db74cbcb75e3b36 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 15 May 2017 13:24:17 -0400 Subject: [PATCH 04/11] update since tag to 2.3 --- .../classification/LogisticRegression.scala | 36 ++++++++--------- project/MimaExcludes.scala | 40 +++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 65f49c22d221..f5dbf4e0e74a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1357,23 +1357,23 @@ sealed trait LogisticRegressionSummary extends Serializable { /** * Dataframe output by the model's `transform` method. */ - @Since("2.2.0") + @Since("2.3.0") def predictions: DataFrame /** Field in "predictions" which gives the probability of each class as a vector. */ - @Since("2.2.0") + @Since("2.3.0") def probabilityCol: String /** Field in "predictions" which gives the prediction of each class. 
*/ - @Since("2.2.0") + @Since("2.3.0") def predictionCol: String /** Field in "predictions" which gives the true label of each instance (if available). */ - @Since("2.2.0") + @Since("2.3.0") def labelCol: String /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("2.2.0") + @Since("2.3.0") def featuresCol: String @transient private val multiclassMetrics = { @@ -1385,23 +1385,23 @@ sealed trait LogisticRegressionSummary extends Serializable { } /** Returns true positive rate for each label. */ - @Since("2.2.0") + @Since("2.3.0") def truePositiveRateByLabel: Array[Double] = recallByLabel /** Returns false positive rate for each label. */ - @Since("2.2.0") + @Since("2.3.0") def falsePositiveRateByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label)) } /** Returns precision for each label. */ - @Since("2.2.0") + @Since("2.3.0") def precisionByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.precision(label)) } /** Returns recall for each label. */ - @Since("2.2.0") + @Since("2.3.0") def recallByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.recall(label)) } @@ -1409,43 +1409,43 @@ sealed trait LogisticRegressionSummary extends Serializable { /** * Returns f-measure for each label. */ - @Since("2.2.0") + @Since("2.3.0") def fMeasureByLabel(beta: Double): Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta)) } /** Returns f1-measure for each label. */ - @Since("2.2.0") + @Since("2.3.0") def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0) /** Returns accuracy. */ - @Since("2.2.0") + @Since("2.3.0") def accuracy: Double = multiclassMetrics.accuracy /** Returns weighted true positive rate. */ - @Since("2.2.0") + @Since("2.3.0") def weightedTruePositiveRate: Double = weightedRecall /** Returns weighted false positive rate. */ - @Since("2.2.0") + @Since("2.3.0") def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate /** Returns weighted averaged recall. */ - @Since("2.2.0") + @Since("2.3.0") def weightedRecall: Double = multiclassMetrics.weightedRecall /** Returns weighted averaged precision. */ - @Since("2.2.0") + @Since("2.3.0") def weightedPrecision: Double = multiclassMetrics.weightedPrecision /** * Returns weighted averaged f-measure. */ - @Since("2.2.0") + @Since("2.3.0") def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta) /** Returns weighted averaged f1-measure. 
*/ - @Since("2.2.0") + @Since("2.3.0") def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 10a958b7464b..92dc36e7fee1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -44,7 +44,25 @@ object MimaExcludes { ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.ShuffleReadMetricDistributions.this"), // [SPARK-21276] Update lz4-java to the latest (v1.4.0) - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.io.LZ4BlockInputStream") + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.io.LZ4BlockInputStream"), + + // [SPARK-17139] Add model summary for MultinomialLogisticRegression + ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary"), + ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.predictionCol"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.truePositiveRateByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.falsePositiveRateByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.precisionByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.recallByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.fMeasureByLabel"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.accuracy"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedTruePositiveRate"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFalsePositiveRate"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedRecall"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedPrecision"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFMeasure"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$_setter_$org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics_=") ) // Exclude rules for 2.2.x @@ -115,25 +133,7 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseMatrix"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseMatrix"), - 
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes"), - - // [SPARK-17139] Add model summary for MultinomialLogisticRegression - ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary"), - ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.predictionCol"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.truePositiveRateByLabel"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.falsePositiveRateByLabel"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.precisionByLabel"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.recallByLabel"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.fMeasureByLabel"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.accuracy"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedTruePositiveRate"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFalsePositiveRate"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedRecall"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedPrecision"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightedFMeasure"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$LogisticRegressionSummary$_setter_$org$apache$spark$ml$classification$LogisticRegressionSummary$$multiclassMetrics_=") + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes") ) ++ Seq( // [SPARK-17019] Expose on-heap and off-heap memory usage in various places ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded.copy"), From deddb00671af3e878b4bb390dd305d6a53f5da68 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 17 Aug 2017 10:56:17 +0800 Subject: [PATCH 05/11] update --- .../classification/LogisticRegression.scala | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f5dbf4e0e74a..4aca3ca86f78 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala 
@@ -884,7 +884,7 @@ class LogisticRegression @Since("1.2.0") ( numClasses, isMultinomial)) val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel() - val logRegSummary = if (numClasses <= 2) { + val logRegSummary = if (!isMultinomial) { new BinaryLogisticRegressionTrainingSummaryImpl( summaryModel.transform(dataset), probabilityColName, @@ -1384,45 +1384,50 @@ sealed trait LogisticRegressionSummary extends Serializable { .rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) }) } - /** Returns true positive rate for each label. */ + /** Returns true positive rate for each label (category). */ @Since("2.3.0") def truePositiveRateByLabel: Array[Double] = recallByLabel - /** Returns false positive rate for each label. */ + /** Returns false positive rate for each label (category). */ @Since("2.3.0") def falsePositiveRateByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label)) } - /** Returns precision for each label. */ + /** Returns precision for each label (category). */ @Since("2.3.0") def precisionByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.precision(label)) } - /** Returns recall for each label. */ + /** Returns recall for each label (category). */ @Since("2.3.0") def recallByLabel: Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.recall(label)) } - /** - * Returns f-measure for each label. - */ + /** Returns f-measure for each label (category). */ @Since("2.3.0") def fMeasureByLabel(beta: Double): Array[Double] = { multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta)) } - /** Returns f1-measure for each label. */ + /** Returns f1-measure for each label (category). */ @Since("2.3.0") def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0) - /** Returns accuracy. */ + /** + * Returns accuracy. + * (equals to the total number of correctly classified instances + * out of the total number of instances.) + */ @Since("2.3.0") def accuracy: Double = multiclassMetrics.accuracy - /** Returns weighted true positive rate. */ + /** + * Returns weighted true positive rate. + * (equals to precision, recall and f-measure) + */ @Since("2.3.0") def weightedTruePositiveRate: Double = weightedRecall @@ -1430,7 +1435,10 @@ sealed trait LogisticRegressionSummary extends Serializable { @Since("2.3.0") def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate - /** Returns weighted averaged recall. */ + /** + * Returns weighted averaged recall. + * (equals to precision, recall and f-measure) + */ @Since("2.3.0") def weightedRecall: Double = multiclassMetrics.weightedRecall @@ -1438,9 +1446,7 @@ sealed trait LogisticRegressionSummary extends Serializable { @Since("2.3.0") def weightedPrecision: Double = multiclassMetrics.weightedPrecision - /** - * Returns weighted averaged f-measure. - */ + /** Returns weighted averaged f-measure. */ @Since("2.3.0") def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta) @@ -1456,12 +1462,12 @@ sealed trait LogisticRegressionSummary extends Serializable { */ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { - @Since("1.5.0") /** objective function (scaled loss + regularization) at each iteration. */ + @Since("1.5.0") def objectiveHistory: Array[Double] - @Since("1.5.0") /** Number of training iterations. 
*/ + @Since("1.5.0") def totalIterations: Int = objectiveHistory.length } From 2bce87b10905aa61cc7e85084c519cea28ed8793 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 17 Aug 2017 14:58:16 +0800 Subject: [PATCH 06/11] update --- .../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 4aca3ca86f78..c26b93697022 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -884,7 +884,7 @@ class LogisticRegression @Since("1.2.0") ( numClasses, isMultinomial)) val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel() - val logRegSummary = if (!isMultinomial) { + val logRegSummary = if (!isMultinomial || (isMultinomial && numClasses <= 2)) { new BinaryLogisticRegressionTrainingSummaryImpl( summaryModel.transform(dataset), probabilityColName, From ce95023b44db2d86360144cc69814435a3b2373c Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 17 Aug 2017 20:04:06 +0800 Subject: [PATCH 07/11] update --- .../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c26b93697022..5f4009c86e01 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -884,7 +884,7 @@ class LogisticRegression @Since("1.2.0") ( numClasses, isMultinomial)) val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel() - val logRegSummary = if (!isMultinomial || (isMultinomial && numClasses <= 2)) { + val logRegSummary = if (numClasses <=2) { new BinaryLogisticRegressionTrainingSummaryImpl( summaryModel.transform(dataset), probabilityColName, From b6cde56f18caa85f79b9cd0dc604ae1a46fd4948 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 18 Aug 2017 15:23:41 +0800 Subject: [PATCH 08/11] update --- .../classification/LogisticRegression.scala | 67 +++++++++++-------- .../LogisticRegressionSuite.scala | 21 +++--- 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 5f4009c86e01..56144453e185 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -884,7 +884,7 @@ class LogisticRegression @Since("1.2.0") ( numClasses, isMultinomial)) val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel() - val logRegSummary = if (numClasses <=2) { + val logRegSummary = if (numClasses <= 2) { new BinaryLogisticRegressionTrainingSummaryImpl( summaryModel.transform(dataset), probabilityColName, @@ -1017,15 +1017,19 @@ class LogisticRegressionModel private[spark] ( private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None /** - * Gets summary of model on training set. An exception is - * thrown if `trainingSummary == None`. 
+ * Gets summary of model on training set. An exception is thrown + * if `trainingSummary == None`. */ @Since("1.5.0") def summary: LogisticRegressionTrainingSummary = trainingSummary.getOrElse { throw new SparkException("No training summary available for this LogisticRegressionModel") } - @Since("2.2.0") + /** + * Gets summary of model on training set. An exception is thrown + * if `trainingSummary == None` or it is a multiclass model. + */ + @Since("2.3.0") def binarySummary: BinaryLogisticRegressionTrainingSummary = summary match { case b: BinaryLogisticRegressionTrainingSummary => b case _ => @@ -1357,11 +1361,11 @@ sealed trait LogisticRegressionSummary extends Serializable { /** * Dataframe output by the model's `transform` method. */ - @Since("2.3.0") + @Since("1.5.0") def predictions: DataFrame /** Field in "predictions" which gives the probability of each class as a vector. */ - @Since("2.3.0") + @Since("1.5.0") def probabilityCol: String /** Field in "predictions" which gives the prediction of each class. */ @@ -1369,11 +1373,11 @@ sealed trait LogisticRegressionSummary extends Serializable { def predictionCol: String /** Field in "predictions" which gives the true label of each instance (if available). */ - @Since("2.3.0") + @Since("1.5.0") def labelCol: String /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("2.3.0") + @Since("1.6.0") def featuresCol: String @transient private val multiclassMetrics = { @@ -1384,6 +1388,17 @@ sealed trait LogisticRegressionSummary extends Serializable { .rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) }) } + /** + * Returns the sequence of labels in ascending order + * + * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the + * training set is missing a label, then all of the arrays over labels + * (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the + * expected numClasses. + */ + @Since("2.3.0") + def labels: Array[Double] = multiclassMetrics.labels + /** Returns true positive rate for each label (category). */ @Since("2.3.0") def truePositiveRateByLabel: Array[Double] = recallByLabel @@ -1561,7 +1576,6 @@ sealed trait BinaryLogisticRegressionTrainingSummary extends BinaryLogisticRegre with LogisticRegressionTrainingSummary /** - * :: Experimental :: * Multiclass logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. @@ -1574,18 +1588,17 @@ sealed trait BinaryLogisticRegressionTrainingSummary extends BinaryLogisticRegre * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ private class LogisticRegressionTrainingSummaryImpl( - override val predictions: DataFrame, - override val probabilityCol: String, - override val predictionCol: String, - override val labelCol: String, - override val featuresCol: String, - val objectiveHistory: Array[Double]) + predictions: DataFrame, + probabilityCol: String, + predictionCol: String, + labelCol: String, + featuresCol: String, + override val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with LogisticRegressionTrainingSummary /** - * :: Experimental :: * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. 
@@ -1605,7 +1618,6 @@ private class LogisticRegressionSummaryImpl( extends LogisticRegressionSummary /** - * :: Experimental :: * Binary logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. @@ -1618,18 +1630,17 @@ private class LogisticRegressionSummaryImpl( * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. */ private class BinaryLogisticRegressionTrainingSummaryImpl( - override val predictions: DataFrame, - override val probabilityCol: String, - override val predictionCol: String, - override val labelCol: String, - override val featuresCol: String, + predictions: DataFrame, + probabilityCol: String, + predictionCol: String, + labelCol: String, + featuresCol: String, override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with BinaryLogisticRegressionTrainingSummary /** - * :: Experimental :: * Binary logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. @@ -1641,11 +1652,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( * @param featuresCol field in "predictions" which gives the features of each instance as a vector. */ private class BinaryLogisticRegressionSummaryImpl( - @transient override val predictions: DataFrame, - override val probabilityCol: String, - override val predictionCol: String, - override val labelCol: String, - override val featuresCol: String) + predictions: DataFrame, + probabilityCol: String, + predictionCol: String, + labelCol: String, + featuresCol: String) extends LogisticRegressionSummaryImpl( predictions, probabilityCol, predictionCol, labelCol, featuresCol) with BinaryLogisticRegressionSummary diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a1e32f33ba88..b4cce849e4fa 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -213,15 +213,15 @@ class LogisticRegressionSuite case (family, dataset) => lr.setFamily(family) lr.setProbabilityCol("").setPredictionCol("prediction") - val modelNoProb = lr.fit(smallBinaryDataset) + val modelNoProb = lr.fit(dataset) checkSummarySchema(modelNoProb, Seq("probability_")) lr.setProbabilityCol("probability").setPredictionCol("") - val modelNoPred = lr.fit(smallBinaryDataset) + val modelNoPred = lr.fit(dataset) checkSummarySchema(modelNoPred, Seq("prediction_")) lr.setProbabilityCol("").setPredictionCol("") - val modelNoPredNoProb = lr.fit(smallBinaryDataset) + val modelNoPredNoProb = lr.fit(dataset) checkSummarySchema(modelNoPredNoProb, Seq("prediction_", "probability_")) } } @@ -229,13 +229,14 @@ class LogisticRegressionSuite test("check summary types for binary and multiclass") { val lr = new LogisticRegression() .setFamily("binomial") + .setMaxIter(1) val blorModel = lr.fit(smallBinaryDataset) - assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) - assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) + assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) val mlorModel = 
lr.setFamily("multinomial").fit(smallMultinomialDataset) - assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummaryImpl]) + assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { mlorModel.binarySummary @@ -243,13 +244,13 @@ class LogisticRegressionSuite } val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) - assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) - assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummaryImpl]) + assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) + assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) val blorSummary = blorModel.evaluate(smallBinaryDataset) val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) - assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummaryImpl]) - assert(mlorSummary.isInstanceOf[LogisticRegressionSummaryImpl]) + assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummary]) + assert(mlorSummary.isInstanceOf[LogisticRegressionSummary]) } test("setThreshold, getThreshold") { From 67c57e547b654ec2816fe4f33e067072a05c4d5e Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 22 Aug 2017 07:38:18 +0800 Subject: [PATCH 09/11] update mima --- project/MimaExcludes.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 92dc36e7fee1..eecda26abb7e 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -50,6 +50,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary"), ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.predictionCol"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.labels"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.truePositiveRateByLabel"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.falsePositiveRateByLabel"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.precisionByLabel"), From 0ebc943ea12e88e85f2a331e3f6c729ff4ff9aa7 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 23 Aug 2017 15:05:24 +0800 Subject: [PATCH 10/11] tiny update comment --- .../apache/spark/ml/classification/LogisticRegression.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 56144453e185..86526d6f41ba 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1389,7 +1389,8 @@ sealed trait LogisticRegressionSummary extends Serializable { } /** - * Returns the sequence of labels in ascending order + * Returns the sequence of labels in ascending order. 
This order matches the order used + * in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. * * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the * training set is missing a label, then all of the arrays over labels From 1395de2c5ab85ea76c690b786c55d459180f1b44 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 25 Aug 2017 09:15:06 +0800 Subject: [PATCH 11/11] add experimental tag --- .../ml/classification/LogisticRegression.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 86526d6f41ba..ffe4b52300c7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -22,7 +22,7 @@ import java.util.Locale import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} -import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, LBFGSB => BreezeLBFGSB, OWLQN => BreezeOWLQN} +import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, LBFGSB => BreezeLBFGSB, OWLQN => BreezeOWLQN} import org.apache.hadoop.fs.Path import org.apache.spark.SparkException @@ -1354,8 +1354,10 @@ private[ml] class MultiClassSummarizer extends Serializable { } /** + * :: Experimental :: * Abstraction for logistic regression results for a given model. */ +@Experimental sealed trait LogisticRegressionSummary extends Serializable { /** @@ -1472,10 +1474,12 @@ sealed trait LogisticRegressionSummary extends Serializable { } /** + * :: Experimental :: * Abstraction for multiclass logistic regression training results. * Currently, the training summary ignores the training weights except * for the objective trace. */ +@Experimental sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { /** objective function (scaled loss + regularization) at each iteration. */ @@ -1489,8 +1493,10 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary } /** + * :: Experimental :: * Abstraction for binary logistic regression results for a given model. */ +@Experimental sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary { private val sparkSession = predictions.sparkSession @@ -1573,6 +1579,13 @@ sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary { } } +/** + * :: Experimental :: + * Abstraction for binary logistic regression training results. + * Currently, the training summary ignores the training weights except + * for the objective trace. + */ +@Experimental sealed trait BinaryLogisticRegressionTrainingSummary extends BinaryLogisticRegressionSummary with LogisticRegressionTrainingSummary
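Because the summary hierarchy now ends in sealed `@Experimental` traits rather than concrete classes, downstream code can also branch on the trait type instead of calling `binarySummary` and handling its exception. A minimal sketch, assuming only the members shown in this series plus the pre-existing `areaUnderROC` on the binary summary:

import org.apache.spark.ml.classification.{
  BinaryLogisticRegressionTrainingSummary, LogisticRegressionTrainingSummary}

// Hypothetical helper: describe a training summary without assuming it is binary.
def describe(summary: LogisticRegressionTrainingSummary): String = summary match {
  case b: BinaryLogisticRegressionTrainingSummary =>
    s"binary model, areaUnderROC = ${b.areaUnderROC}"
  case m =>
    s"multiclass model over labels ${m.labels.mkString(", ")}"
}

This mirrors what `binarySummary` does internally, but returns a value for the multiclass case instead of throwing.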