From 19b8de498a4c2a602810738aed0a7ad4088ef172 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 13 Jan 2017 09:24:14 -0800 Subject: [PATCH 01/20] Start working on summary table --- .../GeneralizedLinearRegression.scala | 39 +++++++++++++++++++ .../GeneralizedLinearRegressionSuite.scala | 5 ++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index c600b87bdc64a..f8648b2ae4a44 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -32,6 +32,7 @@ import org.apache.spark.ml.optim._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -1204,6 +1205,15 @@ class GeneralizedLinearRegressionSummary private[regression] ( @Since("2.2.0") lazy val numInstances: Long = predictions.count() + + /** Name of features. */ + @Since("2.2.0") + lazy val featureName: Array[String] = { + val features = AttributeGroup.fromStructField(dataset.schema(model.getFeaturesCol)) + .attributes.get.map(_.name.get) + features + } + /** The numeric rank of the fitted linear model. */ @Since("2.0.0") lazy val rank: Long = if (model.getFitIntercept) { @@ -1458,4 +1468,33 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( "No p-value available for this GeneralizedLinearRegressionModel") } } + + /** + * Summary table with feature name, coefficient standard error, + * tValues and pValues. 
+ * + */ + @Since("2.2.0") + lazy val summaryTable: Seq[_] = { + if (isNormalSolver) { + val features = if (model.getFitIntercept) { + featureName :+ "Intercept" + } else { + featureName + } + val coef = if (model.getFitIntercept) { + model.coefficients.toArray :+ model.intercept + } else { + model.coefficients.toArray + } + val result = for (i <- 0 until coef.length) + yield (features(i), coef(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + result + } else { + throw new UnsupportedOperationException( + "No summary table available for this GeneralizedLinearRegressionModel") + } + } + + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a47bd17f47bb1..a2a36925ffc5d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1350,6 +1350,9 @@ class GeneralizedLinearRegressionSuite assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") + println(summary.featureName) + println(summary.summaryTable(0)) + println(summary.summaryTable(1)) } test("glm summary: tweedie family with weight and offset") { @@ -1492,7 +1495,7 @@ class GeneralizedLinearRegressionSuite } } - test("read/write") { + ignore("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 93139b90a9c714f0f760780e00c9fa7743f7de1d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 00:26:47 -0800 Subject: [PATCH 02/20] convert result to dataframe --- .../GeneralizedLinearRegression.scala | 47 +++++++++++-------- .../GeneralizedLinearRegressionSuite.scala | 5 +- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f8648b2ae4a44..a55df89bd4d97 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -21,7 +21,6 @@ import java.util.Locale import breeze.stats.{distributions => dist} import org.apache.hadoop.fs.Path - import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.internal.Logging @@ -36,7 +35,8 @@ import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.types.{DataType, DoubleType, StringType, StructType, StructField} +import org.apache.spark.sql.SparkSession /** @@ -1206,12 +1206,19 @@ class GeneralizedLinearRegressionSummary private[regression] ( lazy val numInstances: Long = predictions.count() - /** Name of features. */ + /** + * Name of features. If the name cannot be retrieved from attributes, + * use default "V0", "V1", ... 
+ */ @Since("2.2.0") lazy val featureName: Array[String] = { - val features = AttributeGroup.fromStructField(dataset.schema(model.getFeaturesCol)) - .attributes.get.map(_.name.get) - features + val featureAttrs = AttributeGroup.fromStructField( + dataset.schema(model.getFeaturesCol)).attributes + if (featureAttrs == None) { + Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + x)) + } else { + featureAttrs.get.map(_.name.get) + } } /** The numeric rank of the fitted linear model. */ @@ -1475,21 +1482,23 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( * */ @Since("2.2.0") - lazy val summaryTable: Seq[_] = { + lazy val summaryTable: DataFrame = { if (isNormalSolver) { - val features = if (model.getFitIntercept) { - featureName :+ "Intercept" - } else { - featureName - } - val coef = if (model.getFitIntercept) { - model.coefficients.toArray :+ model.intercept - } else { - model.coefficients.toArray + var featureNames = featureName + var coefficients = model.coefficients.toArray + if (model.getFitIntercept) { + featureNames = featureNames :+ "Intercept" + coefficients = coefficients :+ model.intercept } - val result = for (i <- 0 until coef.length) - yield (features(i), coef(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - result + var result = for (i <- 0 until coefficients.length) yield + (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + /*if (model.getFitIntercept) { + result = ((coefficients.length - 1) +: Array.range(0, (coefficients.length - 1))) + .map(result(_)).toSeq + }*/ + val spark = SparkSession.builder().getOrCreate() + import spark.implicits._ + result.toDF("Feature", "Estimate", "StdError", "tValue", "pValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a2a36925ffc5d..a47bd17f47bb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1350,9 +1350,6 @@ class GeneralizedLinearRegressionSuite assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") - println(summary.featureName) - println(summary.summaryTable(0)) - println(summary.summaryTable(1)) } test("glm summary: tweedie family with weight and offset") { @@ -1495,7 +1492,7 @@ class GeneralizedLinearRegressionSuite } } - ignore("read/write") { + test("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 0b50f34cd91416c913ff500f33314ef29833dff0 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 22:59:41 -0800 Subject: [PATCH 03/20] Add test --- .../GeneralizedLinearRegression.scala | 20 ++--- .../GeneralizedLinearRegressionSuite.scala | 82 ++++++++++++++++++- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a55df89bd4d97..12343dd8c3bbc 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1208,14 +1208,14 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * use default "V0", "V1", ... + * use default names "V1", "V2", ... */ @Since("2.2.0") lazy val featureName: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + x)) + Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + (x + 1))) } else { featureAttrs.get.map(_.name.get) } @@ -1477,8 +1477,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient standard error, - * tValues and pValues. + * Summary table with feature name, coefficient, standard error, + * tValue and pValue. * */ @Since("2.2.0") @@ -1486,19 +1486,19 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (isNormalSolver) { var featureNames = featureName var coefficients = model.coefficients.toArray + var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { featureNames = featureNames :+ "Intercept" coefficients = coefficients :+ model.intercept + // reorder so that intercept comes first + idx = (coefficients.length - 1) +: idx } - var result = for (i <- 0 until coefficients.length) yield + val result = for (i <- idx.toSeq) yield (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - /*if (model.getFitIntercept) { - result = ((coefficients.length - 1) +: Array.range(0, (coefficients.length - 1))) - .map(result(_)).toSeq - }*/ + val spark = SparkSession.builder().getOrCreate() import spark.implicits._ - result.toDF("Feature", "Estimate", "StdError", "tValue", "pValue").repartition(1) + result.toDF("Feature", "Estimate", "StdError", "TValue", "PValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a47bd17f47bb1..fb934b53181ac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -18,11 +18,10 @@ package org.apache.spark.ml.regression import scala.util.Random - import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, OffsetInstance} -import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.feature.{LabeledPoint, RFormula} import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} @@ -1524,6 +1523,85 @@ class GeneralizedLinearRegressionSuite .fit(datasetGaussianIdentity.as[LabeledPoint]) } + + test("glm summary: feature name") { + // dataset with no attribute + val dataset1 = Seq( + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), + 
Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), + Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) + ).toDF() + + // dataset with attribute + val datasetTmp = Seq( + (2.0, 1.0, 0.0, 5.0), + (8.0, 2.0, 1.0, 7.0), + (3.0, 3.0, 2.0, 11.0), + (9.0, 4.0, 3.0, 13.0), + (2.0, 5.0, 2.0, 3.0) + ).toDF("y", "w", "x1", "x2") + val formula = new RFormula().setFormula("y ~ x1 + x2") + val dataset2 = formula.fit(datasetTmp).transform(datasetTmp) + + val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) + + val trainer = new GeneralizedLinearRegression() + var idx = 0 + for (dataset <- Seq(dataset1, dataset2)) { + val model = trainer.fit(dataset) + model.summary.featureName + .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + "Feature name mismatch in glm summary") } + idx += 1 + } + } + + test("glm summary: summaryTable") { + val dataset = Seq( + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), + Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) + ).toDF() + + val expectedFeature = Seq(Array("V1", "V2"), + Array("Intercept", "V1", "V2")) + val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), + Vectors.dense(0.7903, 0.2258, 0.4677)) + val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), + Vectors.dense(4.0129, 2.1153, 0.5815)) + val expectedTValue = Seq(Vectors.dense(0.1673, 1.4205), + Vectors.dense(0.1969, 0.1067, 0.8043)) + val expectedPValue = Seq(Vectors.dense(0.8778, 0.2506), + Vectors.dense(0.8621, 0.9247, 0.5056)) + + var idx = 0 + for (fitIntercept <- Seq(false, true)) { + val trainer = new GeneralizedLinearRegression() + .setFamily("gaussian") + .setFitIntercept(fitIntercept) + val model = trainer.fit(dataset) + val summaryTable = model.summary.summaryTable + + summaryTable.select("Feature").rdd.collect.map(_.getString(0)) + .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + "Feature name mismatch in summaryTable") } + assert(Vectors.dense(summaryTable.select("Estimate").rdd.collect.map(_.getDouble(0))) + ~= expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) + ~= expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) + ~= expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) + ~= expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") + + idx += 1 + } + } + test("generalized linear regression: regularization parameter") { /* R code: From af2dbea625b49e0c7219b794c4eb84a946032d25 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 23:27:37 -0800 Subject: [PATCH 04/20] minor cleanup --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 3 +-- .../ml/regression/GeneralizedLinearRegressionSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 12343dd8c3bbc..cd91f7d5838b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1208,7 +1208,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * use default names "V1", "V2", ... + * set default names to "V1", "V2", and so on. */ @Since("2.2.0") lazy val featureName: Array[String] = { @@ -1505,5 +1505,4 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } } - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index fb934b53181ac..41e827d5f5586 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1525,7 +1525,7 @@ class GeneralizedLinearRegressionSuite test("glm summary: feature name") { - // dataset with no attribute + // dataset1 with no attribute val dataset1 = Seq( Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), @@ -1534,7 +1534,7 @@ class GeneralizedLinearRegressionSuite Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) ).toDF() - // dataset with attribute + // dataset2 with attribute val datasetTmp = Seq( (2.0, 1.0, 0.0, 5.0), (8.0, 2.0, 1.0, 7.0), From e2ac2d48b5eaedfb368cad7b1e8b7d38ea255ba8 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 23:34:44 -0800 Subject: [PATCH 05/20] clean up test --- .../GeneralizedLinearRegressionSuite.scala | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 41e827d5f5586..7750fddaea891 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.regression import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, OffsetInstance} @@ -1527,7 +1528,7 @@ class GeneralizedLinearRegressionSuite test("glm summary: feature name") { // dataset1 with no attribute val dataset1 = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), @@ -1547,20 +1548,18 @@ class GeneralizedLinearRegressionSuite val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) - val trainer = new GeneralizedLinearRegression() var idx = 0 for (dataset <- Seq(dataset1, dataset2)) { - val model = trainer.fit(dataset) - model.summary.featureName - .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, - "Feature name mismatch in glm summary") } + val model = new GeneralizedLinearRegression().fit(dataset) + model.summary.featureName.zip(expectedFeature(idx)) + .foreach{ x => assert(x._1 === x._2) } idx += 1 } } test("glm summary: summaryTable") { val dataset = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, 
Vectors.dense(1.0, 7.0)), Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), From eec31b46f135393ed2dead3a8ac469669e86bafc Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 18 Jan 2017 15:14:06 -0800 Subject: [PATCH 06/20] clean up imports --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index cd91f7d5838b0..545b5d0fd83a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -21,21 +21,22 @@ import java.util.Locale import breeze.stats.{distributions => dist} import org.apache.hadoop.fs.Path + import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.PredictorParams +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.optim._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, DoubleType, StringType, StructType, StructField} +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} import org.apache.spark.sql.SparkSession @@ -1504,5 +1505,4 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( "No summary table available for this GeneralizedLinearRegressionModel") } } - } From 602c3bd91ff481da6fb37743d1c007305b3d47a5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 25 Jan 2017 10:17:16 -0800 Subject: [PATCH 07/20] fix style issues --- .../ml/regression/GeneralizedLinearRegression.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 545b5d0fd83a0..7152e603bccb5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1478,10 +1478,9 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient, standard error, - * tValue and pValue. - * - */ + * Summary table with feature name, coefficient, standard error, + * tValue and pValue. 
+ */ @Since("2.2.0") lazy val summaryTable: DataFrame = { if (isNormalSolver) { @@ -1491,7 +1490,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (model.getFitIntercept) { featureNames = featureNames :+ "Intercept" coefficients = coefficients :+ model.intercept - // reorder so that intercept comes first + // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } val result = for (i <- idx.toSeq) yield From 6882be4b9e01502237b5272ce538ed4b2e47458f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 14 Feb 2017 11:35:54 -0800 Subject: [PATCH 08/20] change default name to use feature colname --- .../GeneralizedLinearRegression.scala | 18 ++++++++++------- .../GeneralizedLinearRegressionSuite.scala | 20 +++++++++---------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 7152e603bccb5..982c911b3480b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -471,6 +471,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 + private[regression] val Intercept: String = "Intercept" + /** * Wrapper of family and link combination used in the model. */ @@ -1209,14 +1211,15 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * set default names to "V1", "V2", and so on. + * set default names to feature column name with numbered suffix "_0", "_1", and so on. 
*/ @Since("2.2.0") - lazy val featureName: Array[String] = { + lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + (x + 1))) + Array.tabulate[String](origModel.numFeatures)( + (x: Int) => (model.getFeaturesCol + "_" + x)) } else { featureAttrs.get.map(_.name.get) } @@ -1484,21 +1487,22 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( @Since("2.2.0") lazy val summaryTable: DataFrame = { if (isNormalSolver) { - var featureNames = featureName + var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { - featureNames = featureNames :+ "Intercept" + featureNamesLocal = featureNamesLocal :+ Intercept coefficients = coefficients :+ model.intercept // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } val result = for (i <- idx.toSeq) yield - (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), + tValues(i), pValues(i)) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ - result.toDF("Feature", "Estimate", "StdError", "TValue", "PValue").repartition(1) + result.toDF("Feature", "Coefficient", "StdError", "TValue", "PValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 7750fddaea891..9b1e1fc7a5bf4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1546,12 +1546,12 @@ class GeneralizedLinearRegressionSuite val formula = new RFormula().setFormula("y ~ x1 + x2") val dataset2 = formula.fit(datasetTmp).transform(datasetTmp) - val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) + val expectedFeature = Seq(Array("features_0", "features_1"), Array("x1", "x2")) var idx = 0 for (dataset <- Seq(dataset1, dataset2)) { val model = new GeneralizedLinearRegression().fit(dataset) - model.summary.featureName.zip(expectedFeature(idx)) + model.summary.featureNames.zip(expectedFeature(idx)) .foreach{ x => assert(x._1 === x._2) } idx += 1 } @@ -1566,8 +1566,8 @@ class GeneralizedLinearRegressionSuite Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) ).toDF() - val expectedFeature = Seq(Array("V1", "V2"), - Array("Intercept", "V1", "V2")) + val expectedFeature = Seq(Array("features_0", "features_1"), + Array("Intercept", "features_0", "features_1")) val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), @@ -1585,17 +1585,17 @@ class GeneralizedLinearRegressionSuite val model = trainer.fit(dataset) val summaryTable = model.summary.summaryTable - summaryTable.select("Feature").rdd.collect.map(_.getString(0)) + summaryTable.select("Feature").collect.map(_.getString(0)) .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - 
assert(Vectors.dense(summaryTable.select("Estimate").rdd.collect.map(_.getDouble(0))) - ~= expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("Coefficient").rdd.collect.map(_.getDouble(0))) + ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) - ~= expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) - ~= expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") + ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) - ~= expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") + ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") idx += 1 } From 8405501969155980817b73d466fca81ddc070d7f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 14 Feb 2017 14:08:06 -0800 Subject: [PATCH 09/20] glmTable --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../spark/ml/regression/GeneralizedLinearRegressionSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 982c911b3480b..2263fe2d18fd3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -471,7 +471,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 - private[regression] val Intercept: String = "Intercept" + private[regression] val Intercept: String = "(Intercept)" /** * Wrapper of family and link combination used in the model. 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 9b1e1fc7a5bf4..7969f69148c67 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1567,7 +1567,7 @@ class GeneralizedLinearRegressionSuite ).toDF() val expectedFeature = Seq(Array("features_0", "features_1"), - Array("Intercept", "features_0", "features_1")) + Array("(Intercept)", "features_0", "features_1")) val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), From 10f0f9bcf112a4019a41d87e8a6a841f5938bfaf Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 10:25:34 -0800 Subject: [PATCH 10/20] update R glm wrapper to use summaryTable --- .../GeneralizedLinearRegressionWrapper.scala | 61 ++++++------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index ee1fc9b14ceaa..89185e887e6cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -83,11 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper .setStringIndexerOrderType(stringIndexerOrderType) checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) - // get labels and feature names from output schema - val schema = rFormulaModel.transform(data).schema - val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) + // assemble and fit the pipeline val glr = new GeneralizedLinearRegression() .setFamily(family) @@ -112,44 +108,25 @@ private[r] object GeneralizedLinearRegressionWrapper pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel] val summary = glm.summary - val rFeatures: Array[String] = if (glm.getFitIntercept) { - Array("(Intercept)") ++ features - } else { - features - } + val features = summary.featureNames - val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - val rCoefficientStandardErrors = if (glm.getFitIntercept) { - Array(summary.coefficientStandardErrors.last) ++ - summary.coefficientStandardErrors.dropRight(1) - } else { - summary.coefficientStandardErrors - } - - val rTValues = if (glm.getFitIntercept) { - Array(summary.tValues.last) ++ summary.tValues.dropRight(1) - } else { - summary.tValues - } - - val rPValues = if (glm.getFitIntercept) { - Array(summary.pValues.last) ++ summary.pValues.dropRight(1) - } else { - summary.pValues - } - - if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues - } else { - glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues - } - } else { - if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray - } else { - glm.coefficients.toArray - } + val rFeatures: Array[String] = + summary.summaryTable.select("Feature").collect.map(_.getString(0)) + + var rCoefficients: Array[Double] = + 
summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) + + if (summary.isNormalSolver) { + val rCoefficientStandardErrors = + summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) + + val rTValues = + summary.summaryTable.select("TValue").collect.map(_.getDouble(0)) + + val rPValues = + summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) + + rCoefficients = rCoefficients ++ rCoefficientStandardErrors ++ rTValues ++ rPValues } val rDispersion: Double = summary.dispersion From 3d72cf52666dc2d28334419c3ed2572bebf7f45a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 10:55:34 -0800 Subject: [PATCH 11/20] clean up test --- .../GeneralizedLinearRegressionSuite.scala | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 7969f69148c67..b92253d1cfb82 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1558,6 +1558,21 @@ class GeneralizedLinearRegressionSuite } test("glm summary: summaryTable") { + /* + R code: + + A <- matrix(c(0, 1, 2, 3, 2, 5, 7, 11, 13, 3), 5, 2) + b <- c(2, 8, 3, 9, 2) + df <- as.data.frame(cbind(A, b)) + model <- glm(formula = "b ~ .", data = df) + summary(model) + + Coefficients: + Estimate Std. Error t value Pr(>|t|) + (Intercept) 0.7903 4.0129 0.197 0.862 + V1 0.2258 2.1153 0.107 0.925 + V2 0.4677 0.5815 0.804 0.506 + */ val dataset = Seq( Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), @@ -1588,13 +1603,13 @@ class GeneralizedLinearRegressionSuite summaryTable.select("Feature").collect.map(_.getString(0)) .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(summaryTable.select("Coefficient").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("Coefficient").collect.map(_.getDouble(0))) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("StdError").collect.map(_.getDouble(0))) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("TValue").collect.map(_.getDouble(0))) ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("PValue").collect.map(_.getDouble(0))) ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") idx += 1 From 07a6784d30e6baaa74750a65c9f1ee1d5f98167d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 14:46:03 -0800 Subject: [PATCH 12/20] fix issue in R wrapper --- .../GeneralizedLinearRegressionWrapper.scala | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 89185e887e6cb..efc37cb3b86fe 100644 
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -108,15 +108,13 @@ private[r] object GeneralizedLinearRegressionWrapper pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel] val summary = glm.summary - val features = summary.featureNames - - val rFeatures: Array[String] = - summary.summaryTable.select("Feature").collect.map(_.getString(0)) - - var rCoefficients: Array[Double] = - summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) + val rFeatures: Array[String] = if (glm.getFitIntercept) { + Array("(Intercept)") ++ summary.featureNames + } else { + summary.featureNames + } - if (summary.isNormalSolver) { + val rCoefficients: Array[Double] = if (summary.isNormalSolver) { val rCoefficientStandardErrors = summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) @@ -126,7 +124,14 @@ private[r] object GeneralizedLinearRegressionWrapper val rPValues = summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) - rCoefficients = rCoefficients ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) ++ + rCoefficientStandardErrors ++ rTValues ++ rPValues + } else { + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray + } else { + glm.coefficients.toArray + } } val rDispersion: Double = summary.dispersion From 1c1d3e6e0cb1e39fbf8afe0660b5033809415df9 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 24 Feb 2017 10:35:27 -0800 Subject: [PATCH 13/20] sort import --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2263fe2d18fd3..2a3d236301f8e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -34,11 +34,9 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} -import org.apache.spark.sql.SparkSession - /** * Params for Generalized Linear Regression. 
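A minimal usage sketch (not part of any patch in this series) of the summary API as it stands after patch 13, mirroring the toy data used in the test suite above. It assumes a SparkSession bound to a val named `spark` with `spark.implicits._` in scope; the val names here are hypothetical, and the column names follow the `summaryTable` definition introduced in patch 8. Note that later patches in the series rework this table into `coefficientsWithStatistics` plus a `toString`/`show` on the training summary.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import spark.implicits._

// Toy Gaussian data; the "features" column carries no ML attributes.
val training = Seq(
  (2.0, Vectors.dense(0.0, 5.0)),
  (8.0, Vectors.dense(1.0, 7.0)),
  (3.0, Vectors.dense(2.0, 11.0)),
  (9.0, Vectors.dense(3.0, 13.0)),
  (2.0, Vectors.dense(2.0, 3.0))
).toDF("label", "features")

val model = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .fit(training)

// With no attributes on the features column, names fall back to "<featuresCol>_<index>".
println(model.summary.featureNames.mkString(", "))  // features_0, features_1

// Single-partition DataFrame with columns Feature | Coefficient | StdError | TValue | PValue;
// "(Intercept)" is listed first because fitIntercept defaults to true.
model.summary.summaryTable.show()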
From a16cbee4e86cf044a90015bdd6900b9a22116200 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 14:45:54 -0700 Subject: [PATCH 14/20] use 2D array for summary table --- .../GeneralizedLinearRegressionWrapper.scala | 15 +- .../GeneralizedLinearRegression.scala | 160 ++++++++++++++++-- .../GeneralizedLinearRegressionSuite.scala | 21 +-- 3 files changed, 157 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index efc37cb3b86fe..13cd9b602f011 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,17 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - val rCoefficientStandardErrors = - summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) - - val rTValues = - summary.summaryTable.select("TValue").collect.map(_.getDouble(0)) - - val rPValues = - summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) - - summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues + summary.coefficientMatrix.map(_._2) ++ + summary.coefficientMatrix.map(_._3) ++ + summary.coefficientMatrix.map(_._4) ++ + summary.coefficientMatrix.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2a3d236301f8e..72b3c8d399615 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -20,6 +20,9 @@ package org.apache.spark.ml.regression import java.util.Locale import breeze.stats.{distributions => dist} + +import org.apache.commons.lang3.StringUtils + import org.apache.hadoop.fs.Path import org.apache.spark.SparkException @@ -34,7 +37,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} @@ -1211,8 +1214,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( * Name of features. If the name cannot be retrieved from attributes, * set default names to feature column name with numbered suffix "_0", "_1", and so on. */ - @Since("2.2.0") - lazy val featureNames: Array[String] = { + private[ml] lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { @@ -1479,31 +1481,165 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient, standard error, + * Coefficient matrix with feature name, coefficient, standard error, * tValue and pValue. 
*/ - @Since("2.2.0") - lazy val summaryTable: DataFrame = { + @Since("2.3.0") + lazy val coefficientMatrix: Array[(String, Double, Double, Double, Double)] = { if (isNormalSolver) { var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { - featureNamesLocal = featureNamesLocal :+ Intercept + featureNamesLocal = featureNamesLocal :+ "(Intercept)" coefficients = coefficients :+ model.intercept // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } - val result = for (i <- idx.toSeq) yield + val result = for (i <- idx) yield (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - - val spark = SparkSession.builder().getOrCreate() - import spark.implicits._ - result.toDF("Feature", "Coefficient", "StdError", "TValue", "PValue").repartition(1) + result } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") } } + + private def round(x: Double, digit: Int): String = { + BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() + } + + private[regression] def showString(_numRows: Int, truncate: Int = 20, + numDigits: Int = 3): String = { + val numRows = _numRows.max(1) + val data = coefficientMatrix.take(numRows) + val hasMoreData = coefficientMatrix.size > numRows + + val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") + val numCols = colNames.size + + val rows = colNames +: data.map( row => { + val mrow = for (cell <- row.productIterator) yield { + val str = cell match { + case s: String => s + case n: Double => round(n, numDigits).toString + } + if (truncate > 0 && str.length > truncate) { + // do not show ellipses for strings shorter than 4 characters. + if (truncate < 4) str.substring(0, truncate) + else str.substring(0, truncate - 3) + "..." 
+ } else { + str + } + } + mrow.toArray + }) + + val sb = new StringBuilder + val colWidths = Array.fill(numCols)(3) + + // Compute the width of each column + for (row <- rows) { + for ((cell, i) <- row.zipWithIndex) { + colWidths(i) = math.max(colWidths(i), cell.length) + } + } + + // Create SeparateLine + val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() + + // column names + rows.head.zipWithIndex.map { case (cell, i) => + if (truncate > 0) { + StringUtils.leftPad(cell, colWidths(i)) + } else { + StringUtils.rightPad(cell, colWidths(i)) + } + }.addString(sb, "|", "|", "|\n") + sb.append(sep) + + // data + rows.tail.map { + _.zipWithIndex.map { case (cell, i) => + if (truncate > 0) { + StringUtils.leftPad(cell.toString, colWidths(i)) + } else { + StringUtils.rightPad(cell.toString, colWidths(i)) + } + }.addString(sb, "|", "|", "|\n") + } + + // For Data that has more than "numRows" records + if (hasMoreData) { + sb.append("...\n") + sb.append(sep) + val rowsString = if (numRows == 1) "row" else "rows" + sb.append(s"only showing top $numRows $rowsString\n") + } else { + sb.append(sep) + } + + sb.append("\n") + sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + + round(dispersion, numDigits) + ")") + + sb.append("\n") + val nd = "Null deviance: " + round(nullDeviance, numDigits) + + s" on $degreesOfFreedom degrees of freedom" + val rd = "Residual deviance: " + round(deviance, numDigits) + + s" on $residualDegreeOfFreedom degrees of freedom" + val l = math.max(nd.length, rd.length) + sb.append(StringUtils.leftPad(nd, l)) + sb.append("\n") + sb.append(StringUtils.leftPad(rd, l)) + + if (family.name != "tweedie") { + sb.append("\n") + sb.append(s"AIC: " + round(aic, numDigits)) + } + + sb.toString() + } + + /** + * Displays the summary of a GeneralizedLinearModel fit. + * + * @since 2.3.0 + */ + def show(): Unit = { + val numRows = coefficientMatrix.size + show(numRows, true, 3) + } + + /** + * Displays the top numRows rows of the summary of a GeneralizedLinearModel fit. + * + * @param numRows Number of rows to show + * + * @since 2.3.0 + */ + @Since("2.3.0") + def show(numRows: Int): Unit = { + show(numRows, true, 3) + } + + /** + * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters + * will be truncated, and all cells will be aligned right. + * + * @param numRows Number of rows to show + * @param truncate Whether truncate long strings. If true, strings more than 20 characters will + * be truncated and all cells will be aligned right + * @param numDigits Number of decimal places used to round numerical values. 
+ * + * @since 2.3.0 + */ + // scalastyle:off println + def show(numRows: Int, truncate: Boolean, numDigits: Int): Unit = if (truncate) { + println(showString(numRows, truncate = 20, numDigits)) + } else { + println(showString(numRows, truncate = 0, numDigits)) + } + // scalastyle:on println } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index b92253d1cfb82..e6202f1e4ce37 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1524,7 +1524,6 @@ class GeneralizedLinearRegressionSuite .fit(datasetGaussianIdentity.as[LabeledPoint]) } - test("glm summary: feature name") { // dataset1 with no attribute val dataset1 = Seq( @@ -1557,7 +1556,7 @@ class GeneralizedLinearRegressionSuite } } - test("glm summary: summaryTable") { + test("glm summary: coefficient matrix") { /* R code: @@ -1587,10 +1586,6 @@ class GeneralizedLinearRegressionSuite Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), Vectors.dense(4.0129, 2.1153, 0.5815)) - val expectedTValue = Seq(Vectors.dense(0.1673, 1.4205), - Vectors.dense(0.1969, 0.1067, 0.8043)) - val expectedPValue = Seq(Vectors.dense(0.8778, 0.2506), - Vectors.dense(0.8621, 0.9247, 0.5056)) var idx = 0 for (fitIntercept <- Seq(false, true)) { @@ -1598,20 +1593,14 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val summaryTable = model.summary.summaryTable + val coefficientMatrix = model.summary.coefficientMatrix - summaryTable.select("Feature").collect.map(_.getString(0)) - .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + coefficientMatrix.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(summaryTable.select("Coefficient").collect.map(_.getDouble(0))) + assert(Vectors.dense(coefficientMatrix.map(_._2)) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("StdError").collect.map(_.getDouble(0))) + assert(Vectors.dense(coefficientMatrix.map(_._3)) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("TValue").collect.map(_.getDouble(0))) - ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("PValue").collect.map(_.getDouble(0))) - ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") - idx += 1 } } From 640d56442e6f5d1a14b4a0cb895d6da713b003fd Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 16:37:09 -0700 Subject: [PATCH 15/20] fix import --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 72b3c8d399615..383c7c0ff28a5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -20,9 +20,7 @@ package org.apache.spark.ml.regression import 
java.util.Locale import breeze.stats.{distributions => dist} - import org.apache.commons.lang3.StringUtils - import org.apache.hadoop.fs.Path import org.apache.spark.SparkException From 57f1e5c259d7f237324dd1b3b481b7e82952b53e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 16:45:08 -0700 Subject: [PATCH 16/20] remove intercept --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 383c7c0ff28a5..5164622063a8f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -470,8 +470,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 - private[regression] val Intercept: String = "(Intercept)" - /** * Wrapper of family and link combination used in the model. */ From 167af016b5319b71137102dd69e3b8331616d4ad Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 19 Jul 2017 10:27:57 -0700 Subject: [PATCH 17/20] simplify show method --- .../GeneralizedLinearRegressionWrapper.scala | 8 ++-- .../GeneralizedLinearRegression.scala | 48 ++++--------------- .../GeneralizedLinearRegressionSuite.scala | 8 ++-- 3 files changed, 18 insertions(+), 46 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 13cd9b602f011..1e7ece6bc7199 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,10 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - summary.coefficientMatrix.map(_._2) ++ - summary.coefficientMatrix.map(_._3) ++ - summary.coefficientMatrix.map(_._4) ++ - summary.coefficientMatrix.map(_._5) + summary.coefficientCollection.map(_._2) ++ + summary.coefficientCollection.map(_._3) ++ + summary.coefficientCollection.map(_._4) ++ + summary.coefficientCollection.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 5164622063a8f..9aa439745a73d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1477,11 +1477,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Coefficient matrix with feature name, coefficient, standard error, + * Collection of feature name, coefficient, standard error, * tValue and pValue. 
*/ - @Since("2.3.0") - lazy val coefficientMatrix: Array[(String, Double, Double, Double, Double)] = { + private[ml] lazy val coefficientCollection: Array[(String, Double, Double, Double, Double)] = { if (isNormalSolver) { var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray @@ -1498,7 +1497,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( result } else { throw new UnsupportedOperationException( - "No summary table available for this GeneralizedLinearRegressionModel") + "No summary available for this GeneralizedLinearRegressionModel") } } @@ -1509,8 +1508,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( private[regression] def showString(_numRows: Int, truncate: Int = 20, numDigits: Int = 3): String = { val numRows = _numRows.max(1) - val data = coefficientMatrix.take(numRows) - val hasMoreData = coefficientMatrix.size > numRows + val data = coefficientCollection.take(numRows) + val hasMoreData = coefficientCollection.size > numRows val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") val numCols = colNames.size @@ -1598,44 +1597,17 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( sb.toString() } - /** - * Displays the summary of a GeneralizedLinearModel fit. - * - * @since 2.3.0 - */ - def show(): Unit = { - val numRows = coefficientMatrix.size - show(numRows, true, 3) - } - - /** - * Displays the top numRows rows of the summary of a GeneralizedLinearModel fit. - * - * @param numRows Number of rows to show - * - * @since 2.3.0 - */ - @Since("2.3.0") - def show(numRows: Int): Unit = { - show(numRows, true, 3) - } - /** * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters - * will be truncated, and all cells will be aligned right. - * - * @param numRows Number of rows to show - * @param truncate Whether truncate long strings. If true, strings more than 20 characters will - * be truncated and all cells will be aligned right - * @param numDigits Number of decimal places used to round numerical values. + * will be truncated, and all cells will be aligned right. Numbers are rounded to three + * decimal places. 
* * @since 2.3.0 */ // scalastyle:off println - def show(numRows: Int, truncate: Boolean, numDigits: Int): Unit = if (truncate) { - println(showString(numRows, truncate = 20, numDigits)) - } else { - println(showString(numRows, truncate = 0, numDigits)) + def show(): Unit = { + println(showString(coefficientCollection.size, truncate = 20, 3)) } // scalastyle:on println + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index e6202f1e4ce37..5a6c884ea43f5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1593,13 +1593,13 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val coefficientMatrix = model.summary.coefficientMatrix + val coefficients = model.summary.coefficientCollection - coefficientMatrix.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + coefficients.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(coefficientMatrix.map(_._2)) + assert(Vectors.dense(coefficients.map(_._2)) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(coefficientMatrix.map(_._3)) + assert(Vectors.dense(coefficients.map(_._3)) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") idx += 1 } From 174fc49142f2915c46fc53df4cb024d2e97cc6ca Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 19 Jul 2017 10:30:51 -0700 Subject: [PATCH 18/20] fix align issue --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 9aa439745a73d..392765c20fbe2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1505,7 +1505,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() } - private[regression] def showString(_numRows: Int, truncate: Int = 20, + private[regression] def showString(_numRows: Int, + truncate: Int = 20, numDigits: Int = 3): String = { val numRows = _numRows.max(1) val data = coefficientCollection.take(numRows) From be11106f42e6bcccbb58222a3bc75a18111cdccc Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Jul 2017 12:47:20 +0800 Subject: [PATCH 19/20] Refactor GLR summary toString. 
--- .../GeneralizedLinearRegressionWrapper.scala | 8 +- .../GeneralizedLinearRegression.scala | 193 +++++++----------- .../GeneralizedLinearRegressionSuite.scala | 18 +- 3 files changed, 88 insertions(+), 131 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 1e7ece6bc7199..176a6cf852914 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,10 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - summary.coefficientCollection.map(_._2) ++ - summary.coefficientCollection.map(_._3) ++ - summary.coefficientCollection.map(_._4) ++ - summary.coefficientCollection.map(_._5) + summary.coefficientsWithStatistics.map(_._2) ++ + summary.coefficientsWithStatistics.map(_._3) ++ + summary.coefficientsWithStatistics.map(_._4) ++ + summary.coefficientsWithStatistics.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 392765c20fbe2..f489feb65274c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -142,6 +142,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for offset column name. If this is not set or empty, we treat all instance offsets * as 0.0. The feature specified as offset has a constant coefficient of 1.0. + * * @group param */ @Since("2.3.0") @@ -326,6 +327,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val * * Default is 0.0. * + * * @group setParam */ @Since("2.0.0") @@ -1213,11 +1215,10 @@ class GeneralizedLinearRegressionSummary private[regression] ( private[ml] lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes - if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)( - (x: Int) => (model.getFeaturesCol + "_" + x)) - } else { + if (featureAttrs.isDefined) { featureAttrs.get.map(_.name.get) + } else { + Array.tabulate[String](origModel.numFeatures)((x: Int) => model.getFeaturesCol + "_" + x) } } @@ -1477,138 +1478,94 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Collection of feature name, coefficient, standard error, - * tValue and pValue. + * coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. 
*/ - private[ml] lazy val coefficientCollection: Array[(String, Double, Double, Double, Double)] = { - if (isNormalSolver) { - var featureNamesLocal = featureNames - var coefficients = model.coefficients.toArray - var idx = Array.range(0, coefficients.length) - if (model.getFitIntercept) { - featureNamesLocal = featureNamesLocal :+ "(Intercept)" - coefficients = coefficients :+ model.intercept - // Reorder so that intercept comes first - idx = (coefficients.length - 1) +: idx - } - val result = for (i <- idx) yield - (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), + private[ml] lazy val coefficientsWithStatistics: Array[ + (String, Double, Double, Double, Double)] = { + var featureNamesLocal = featureNames + var coefficientsArray = model.coefficients.toArray + var index = Array.range(0, coefficientsArray.length) + if (model.getFitIntercept) { + featureNamesLocal = featureNamesLocal :+ "(Intercept)" + coefficientsArray = coefficientsArray :+ model.intercept + // Reorder so that intercept comes first + index = (coefficientsArray.length - 1) +: index + } + index.map { i => + (featureNamesLocal(i), coefficientsArray(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - result - } else { - throw new UnsupportedOperationException( - "No summary available for this GeneralizedLinearRegressionModel") } } - private def round(x: Double, digit: Int): String = { - BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() - } + override def toString: String = { + if (isNormalSolver) { - private[regression] def showString(_numRows: Int, - truncate: Int = 20, - numDigits: Int = 3): String = { - val numRows = _numRows.max(1) - val data = coefficientCollection.take(numRows) - val hasMoreData = coefficientCollection.size > numRows - - val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") - val numCols = colNames.size - - val rows = colNames +: data.map( row => { - val mrow = for (cell <- row.productIterator) yield { - val str = cell match { - case s: String => s - case n: Double => round(n, numDigits).toString - } - if (truncate > 0 && str.length > truncate) { - // do not show ellipses for strings shorter than 4 characters. - if (truncate < 4) str.substring(0, truncate) - else str.substring(0, truncate - 3) + "..." - } else { - str - } + def round(x: Double): String = { + BigDecimal(x).setScale(5, BigDecimal.RoundingMode.HALF_UP).toString } - mrow.toArray - }) - val sb = new StringBuilder - val colWidths = Array.fill(numCols)(3) + val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value") - // Compute the width of each column - for (row <- rows) { - for ((cell, i) <- row.zipWithIndex) { - colWidths(i) = math.max(colWidths(i), cell.length) + val data = coefficientsWithStatistics.map { row => + val strRow = row.productIterator.map { cell => + val str = cell match { + case s: String => s + case n: Double => round(n) + } + // Truncate if length > 20 + if (str.length > 20) { + str.substring(0, 17) + "..." 
+ } else { + str + } + } + strRow.toArray } - } - - // Create SeparateLine - val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() - // column names - rows.head.zipWithIndex.map { case (cell, i) => - if (truncate > 0) { - StringUtils.leftPad(cell, colWidths(i)) - } else { - StringUtils.rightPad(cell, colWidths(i)) + // Compute the width of each column + val colWidths = colNames.map(_.length) + data.foreach { strRow => + strRow.zipWithIndex.foreach { case (cell: String, i: Int) => + colWidths(i) = math.max(colWidths(i), cell.length) + } } - }.addString(sb, "|", "|", "|\n") - sb.append(sep) - // data - rows.tail.map { - _.zipWithIndex.map { case (cell, i) => - if (truncate > 0) { + val sb = new StringBuilder + + // Output coefficients with statistics + sb.append("Coefficients:\n") + colNames.zipWithIndex.map { case (colName: String, i: Int) => + StringUtils.leftPad(colName, colWidths(i)) + }.addString(sb, "", " ", "\n") + + data.foreach { case strRow: Array[String] => + strRow.zipWithIndex.map { case (cell: String, i: Int) => StringUtils.leftPad(cell.toString, colWidths(i)) - } else { - StringUtils.rightPad(cell.toString, colWidths(i)) - } - }.addString(sb, "|", "|", "|\n") - } + }.addString(sb, "", " ", "\n") + } - // For Data that has more than "numRows" records - if (hasMoreData) { - sb.append("...\n") - sb.append(sep) - val rowsString = if (numRows == 1) "row" else "rows" - sb.append(s"only showing top $numRows $rowsString\n") - } else { - sb.append(sep) - } + sb.append("\n") + sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + + s"${round(dispersion)})") - sb.append("\n") - sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + - round(dispersion, numDigits) + ")") - - sb.append("\n") - val nd = "Null deviance: " + round(nullDeviance, numDigits) + - s" on $degreesOfFreedom degrees of freedom" - val rd = "Residual deviance: " + round(deviance, numDigits) + - s" on $residualDegreeOfFreedom degrees of freedom" - val l = math.max(nd.length, rd.length) - sb.append(StringUtils.leftPad(nd, l)) - sb.append("\n") - sb.append(StringUtils.leftPad(rd, l)) - - if (family.name != "tweedie") { sb.append("\n") - sb.append(s"AIC: " + round(aic, numDigits)) - } + val nd = s"Null deviance: ${round(nullDeviance)} on $degreesOfFreedom degrees of freedom" + val rd = s"Residual deviance: ${round(deviance)} on $residualDegreeOfFreedom degrees of " + + "freedom" + val l = math.max(nd.length, rd.length) + sb.append(StringUtils.leftPad(nd, l)) + sb.append("\n") + sb.append(StringUtils.leftPad(rd, l)) - sb.toString() - } + if (family.name != "tweedie") { + sb.append("\n") + sb.append(s"AIC: " + round(aic)) + } - /** - * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters - * will be truncated, and all cells will be aligned right. Numbers are rounded to three - * decimal places. 
- * - * @since 2.3.0 - */ - // scalastyle:off println - def show(): Unit = { - println(showString(coefficientCollection.size, truncate = 20, 3)) + sb.toString() + } else { + throw new UnsupportedOperationException( + "No summary available for this GeneralizedLinearRegressionModel") + } } - // scalastyle:on println - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 5a6c884ea43f5..df7dee869d058 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1556,7 +1556,7 @@ class GeneralizedLinearRegressionSuite } } - test("glm summary: coefficient matrix") { + test("glm summary: coefficient with statistics") { /* R code: @@ -1593,14 +1593,14 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val coefficients = model.summary.coefficientCollection - - coefficients.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, - "Feature name mismatch in summaryTable") } - assert(Vectors.dense(coefficients.map(_._2)) - ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(coefficients.map(_._3)) - ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + val coefficientsWithStatistics = model.summary.coefficientsWithStatistics + + coefficientsWithStatistics.map(_._1).zip(expectedFeature(idx)).foreach { x => + assert(x._1 === x._2, "Feature name mismatch in coefficientsWithStatistics") } + assert(Vectors.dense(coefficientsWithStatistics.map(_._2)) ~= expectedEstimate(idx) + absTol 1E-3, "Coefficients mismatch in coefficientsWithStatistics") + assert(Vectors.dense(coefficientsWithStatistics.map(_._3)) ~= expectedStdError(idx) + absTol 1E-3, "Standard error mismatch in coefficientsWithStatistics") idx += 1 } } From 7281b77880898f5cb421467ef82e10ad42a17638 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Wed, 26 Jul 2017 10:27:30 -0700 Subject: [PATCH 20/20] fix style --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f489feb65274c..beca5956a2d94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -327,7 +327,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val * * Default is 0.0. * - * * @group setParam */ @Since("2.0.0") @@ -1478,7 +1477,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. + * Coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. 
*/ private[ml] lazy val coefficientsWithStatistics: Array[ (String, Double, Double, Double, Double)] = { @@ -1501,7 +1500,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (isNormalSolver) { def round(x: Double): String = { - BigDecimal(x).setScale(5, BigDecimal.RoundingMode.HALF_UP).toString + BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString } val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")
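
For context on this final hunk, a self-contained sketch, not part of the patch, of the same right-alignment technique: round each value with BigDecimal.setScale, size each column to the wider of its header and its widest cell, and pad cells with StringUtils.leftPad. The feature names and statistics in the sample rows are invented, and commons-lang3 on the classpath is an assumption (as it already is for Spark itself).

    import org.apache.commons.lang3.StringUtils

    object CoefficientTableSketch {
      // Round to 4 decimal places, mirroring the helper in this hunk.
      private def round(x: Double): String =
        BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString

      def main(args: Array[String]): Unit = {
        val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")
        // Hypothetical rows: (feature name, estimate, std error, t value, p value).
        val rows = Array(
          ("(Intercept)", 2.7355, 1.0036, 2.7257, 0.0127),
          ("V1", 0.5225, 0.2753, 1.8981, 0.0702),
          ("V2", -0.3630, 0.2055, -1.7664, 0.0945))

        // Render every cell as a string first.
        val data = rows.map { row =>
          row.productIterator.map {
            case s: String => s
            case d: Double => round(d)
          }.toArray
        }

        // Column width = max(header width, widest cell) so leftPad right-aligns each column.
        val colWidths = colNames.map(_.length)
        data.foreach(_.zipWithIndex.foreach { case (cell, i) =>
          colWidths(i) = math.max(colWidths(i), cell.length)
        })

        val sb = new StringBuilder("Coefficients:\n")
        colNames.zipWithIndex.map { case (name, i) => StringUtils.leftPad(name, colWidths(i)) }
          .addString(sb, "", " ", "\n")
        data.foreach { row =>
          row.zipWithIndex.map { case (cell, i) => StringUtils.leftPad(cell, colWidths(i)) }
            .addString(sb, "", " ", "\n")
        }
        println(sb.toString())
      }
    }

Right alignment keeps the decimal points of the numeric columns roughly stacked, which is why the patch pads even the header row to the computed column widths.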