From 19b8de498a4c2a602810738aed0a7ad4088ef172 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 13 Jan 2017 09:24:14 -0800 Subject: [PATCH 01/20] Start working on summary table --- .../GeneralizedLinearRegression.scala | 39 +++++++++++++++++++ .../GeneralizedLinearRegressionSuite.scala | 5 ++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index c600b87bdc64a..f8648b2ae4a44 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -32,6 +32,7 @@ import org.apache.spark.ml.optim._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -1204,6 +1205,15 @@ class GeneralizedLinearRegressionSummary private[regression] ( @Since("2.2.0") lazy val numInstances: Long = predictions.count() + + /** Name of features. */ + @Since("2.2.0") + lazy val featureName: Array[String] = { + val features = AttributeGroup.fromStructField(dataset.schema(model.getFeaturesCol)) + .attributes.get.map(_.name.get) + features + } + /** The numeric rank of the fitted linear model. */ @Since("2.0.0") lazy val rank: Long = if (model.getFitIntercept) { @@ -1458,4 +1468,33 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( "No p-value available for this GeneralizedLinearRegressionModel") } } + + /** + * Summary table with feature name, coefficient standard error, + * tValues and pValues. 
+ * + */ + @Since("2.2.0") + lazy val summaryTable: Seq[_] = { + if (isNormalSolver) { + val features = if (model.getFitIntercept) { + featureName :+ "Intercept" + } else { + featureName + } + val coef = if (model.getFitIntercept) { + model.coefficients.toArray :+ model.intercept + } else { + model.coefficients.toArray + } + val result = for (i <- 0 until coef.length) + yield (features(i), coef(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + result + } else { + throw new UnsupportedOperationException( + "No summary table available for this GeneralizedLinearRegressionModel") + } + } + + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a47bd17f47bb1..a2a36925ffc5d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1350,6 +1350,9 @@ class GeneralizedLinearRegressionSuite assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") + println(summary.featureName) + println(summary.summaryTable(0)) + println(summary.summaryTable(1)) } test("glm summary: tweedie family with weight and offset") { @@ -1492,7 +1495,7 @@ class GeneralizedLinearRegressionSuite } } - test("read/write") { + ignore("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 93139b90a9c714f0f760780e00c9fa7743f7de1d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 00:26:47 -0800 Subject: [PATCH 02/20] convert result to dataframe --- .../GeneralizedLinearRegression.scala | 47 +++++++++++-------- .../GeneralizedLinearRegressionSuite.scala | 5 +- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f8648b2ae4a44..a55df89bd4d97 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -21,7 +21,6 @@ import java.util.Locale import breeze.stats.{distributions => dist} import org.apache.hadoop.fs.Path - import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.internal.Logging @@ -36,7 +35,8 @@ import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.types.{DataType, DoubleType, StringType, StructType, StructField} +import org.apache.spark.sql.SparkSession /** @@ -1206,12 +1206,19 @@ class GeneralizedLinearRegressionSummary private[regression] ( lazy val numInstances: Long = predictions.count() - /** Name of features. */ + /** + * Name of features. If the name cannot be retrieved from attributes, + * use default "V0", "V1", ... 
+ */ @Since("2.2.0") lazy val featureName: Array[String] = { - val features = AttributeGroup.fromStructField(dataset.schema(model.getFeaturesCol)) - .attributes.get.map(_.name.get) - features + val featureAttrs = AttributeGroup.fromStructField( + dataset.schema(model.getFeaturesCol)).attributes + if (featureAttrs == None) { + Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + x)) + } else { + featureAttrs.get.map(_.name.get) + } } /** The numeric rank of the fitted linear model. */ @@ -1475,21 +1482,23 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( * */ @Since("2.2.0") - lazy val summaryTable: Seq[_] = { + lazy val summaryTable: DataFrame = { if (isNormalSolver) { - val features = if (model.getFitIntercept) { - featureName :+ "Intercept" - } else { - featureName - } - val coef = if (model.getFitIntercept) { - model.coefficients.toArray :+ model.intercept - } else { - model.coefficients.toArray + var featureNames = featureName + var coefficients = model.coefficients.toArray + if (model.getFitIntercept) { + featureNames = featureNames :+ "Intercept" + coefficients = coefficients :+ model.intercept } - val result = for (i <- 0 until coef.length) - yield (features(i), coef(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - result + var result = for (i <- 0 until coefficients.length) yield + (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + /*if (model.getFitIntercept) { + result = ((coefficients.length - 1) +: Array.range(0, (coefficients.length - 1))) + .map(result(_)).toSeq + }*/ + val spark = SparkSession.builder().getOrCreate() + import spark.implicits._ + result.toDF("Feature", "Estimate", "StdError", "tValue", "pValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a2a36925ffc5d..a47bd17f47bb1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1350,9 +1350,6 @@ class GeneralizedLinearRegressionSuite assert(summary.residualDegreeOfFreedomNull === residualDegreeOfFreedomNullR) assert(summary.aic ~== aicR absTol 1E-3) assert(summary.solver === "irls") - println(summary.featureName) - println(summary.summaryTable(0)) - println(summary.summaryTable(1)) } test("glm summary: tweedie family with weight and offset") { @@ -1495,7 +1492,7 @@ class GeneralizedLinearRegressionSuite } } - ignore("read/write") { + test("read/write") { def checkModelData( model: GeneralizedLinearRegressionModel, model2: GeneralizedLinearRegressionModel): Unit = { From 0b50f34cd91416c913ff500f33314ef29833dff0 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 22:59:41 -0800 Subject: [PATCH 03/20] Add test --- .../GeneralizedLinearRegression.scala | 20 ++--- .../GeneralizedLinearRegressionSuite.scala | 82 ++++++++++++++++++- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index a55df89bd4d97..12343dd8c3bbc 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1208,14 +1208,14 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * use default "V0", "V1", ... + * use default names "V1", "V2", ... */ @Since("2.2.0") lazy val featureName: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + x)) + Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + (x + 1))) } else { featureAttrs.get.map(_.name.get) } @@ -1477,8 +1477,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient standard error, - * tValues and pValues. + * Summary table with feature name, coefficient, standard error, + * tValue and pValue. * */ @Since("2.2.0") @@ -1486,19 +1486,19 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (isNormalSolver) { var featureNames = featureName var coefficients = model.coefficients.toArray + var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { featureNames = featureNames :+ "Intercept" coefficients = coefficients :+ model.intercept + // reorder so that intercept comes first + idx = (coefficients.length - 1) +: idx } - var result = for (i <- 0 until coefficients.length) yield + val result = for (i <- idx.toSeq) yield (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - /*if (model.getFitIntercept) { - result = ((coefficients.length - 1) +: Array.range(0, (coefficients.length - 1))) - .map(result(_)).toSeq - }*/ + val spark = SparkSession.builder().getOrCreate() import spark.implicits._ - result.toDF("Feature", "Estimate", "StdError", "tValue", "pValue").repartition(1) + result.toDF("Feature", "Estimate", "StdError", "TValue", "PValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a47bd17f47bb1..fb934b53181ac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -18,11 +18,10 @@ package org.apache.spark.ml.regression import scala.util.Random - import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, OffsetInstance} -import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.feature.{LabeledPoint, RFormula} import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} @@ -1524,6 +1523,85 @@ class GeneralizedLinearRegressionSuite .fit(datasetGaussianIdentity.as[LabeledPoint]) } + + test("glm summary: feature name") { + // dataset with no attribute + val dataset1 = Seq( + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), + 
Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), + Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) + ).toDF() + + // dataset with attribute + val datasetTmp = Seq( + (2.0, 1.0, 0.0, 5.0), + (8.0, 2.0, 1.0, 7.0), + (3.0, 3.0, 2.0, 11.0), + (9.0, 4.0, 3.0, 13.0), + (2.0, 5.0, 2.0, 3.0) + ).toDF("y", "w", "x1", "x2") + val formula = new RFormula().setFormula("y ~ x1 + x2") + val dataset2 = formula.fit(datasetTmp).transform(datasetTmp) + + val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) + + val trainer = new GeneralizedLinearRegression() + var idx = 0 + for (dataset <- Seq(dataset1, dataset2)) { + val model = trainer.fit(dataset) + model.summary.featureName + .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + "Feature name mismatch in glm summary") } + idx += 1 + } + } + + test("glm summary: summaryTable") { + val dataset = Seq( + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), + Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), + Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), + Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) + ).toDF() + + val expectedFeature = Seq(Array("V1", "V2"), + Array("Intercept", "V1", "V2")) + val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), + Vectors.dense(0.7903, 0.2258, 0.4677)) + val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), + Vectors.dense(4.0129, 2.1153, 0.5815)) + val expectedTValue = Seq(Vectors.dense(0.1673, 1.4205), + Vectors.dense(0.1969, 0.1067, 0.8043)) + val expectedPValue = Seq(Vectors.dense(0.8778, 0.2506), + Vectors.dense(0.8621, 0.9247, 0.5056)) + + var idx = 0 + for (fitIntercept <- Seq(false, true)) { + val trainer = new GeneralizedLinearRegression() + .setFamily("gaussian") + .setFitIntercept(fitIntercept) + val model = trainer.fit(dataset) + val summaryTable = model.summary.summaryTable + + summaryTable.select("Feature").rdd.collect.map(_.getString(0)) + .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + "Feature name mismatch in summaryTable") } + assert(Vectors.dense(summaryTable.select("Estimate").rdd.collect.map(_.getDouble(0))) + ~= expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) + ~= expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) + ~= expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) + ~= expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") + + idx += 1 + } + } + test("generalized linear regression: regularization parameter") { /* R code: From af2dbea625b49e0c7219b794c4eb84a946032d25 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 23:27:37 -0800 Subject: [PATCH 04/20] minor cleanup --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 3 +-- .../ml/regression/GeneralizedLinearRegressionSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 12343dd8c3bbc..cd91f7d5838b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1208,7 +1208,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * use default names "V1", "V2", ... + * set default names to "V1", "V2", and so on. */ @Since("2.2.0") lazy val featureName: Array[String] = { @@ -1505,5 +1505,4 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } } - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index fb934b53181ac..41e827d5f5586 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1525,7 +1525,7 @@ class GeneralizedLinearRegressionSuite test("glm summary: feature name") { - // dataset with no attribute + // dataset1 with no attribute val dataset1 = Seq( Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), @@ -1534,7 +1534,7 @@ class GeneralizedLinearRegressionSuite Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) ).toDF() - // dataset with attribute + // dataset2 with attribute val datasetTmp = Seq( (2.0, 1.0, 0.0, 5.0), (8.0, 2.0, 1.0, 7.0), From e2ac2d48b5eaedfb368cad7b1e8b7d38ea255ba8 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 17 Jan 2017 23:34:44 -0800 Subject: [PATCH 05/20] clean up test --- .../GeneralizedLinearRegressionSuite.scala | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 41e827d5f5586..7750fddaea891 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.regression import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, OffsetInstance} @@ -1527,7 +1528,7 @@ class GeneralizedLinearRegressionSuite test("glm summary: feature name") { // dataset1 with no attribute val dataset1 = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), @@ -1547,20 +1548,18 @@ class GeneralizedLinearRegressionSuite val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) - val trainer = new GeneralizedLinearRegression() var idx = 0 for (dataset <- Seq(dataset1, dataset2)) { - val model = trainer.fit(dataset) - model.summary.featureName - .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, - "Feature name mismatch in glm summary") } + val model = new GeneralizedLinearRegression().fit(dataset) + model.summary.featureName.zip(expectedFeature(idx)) + .foreach{ x => assert(x._1 === x._2) } idx += 1 } } test("glm summary: summaryTable") { val dataset = Seq( - Instance(2.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), + Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, 
Vectors.dense(1.0, 7.0)), Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)), From eec31b46f135393ed2dead3a8ac469669e86bafc Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 18 Jan 2017 15:14:06 -0800 Subject: [PATCH 06/20] clean up imports --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index cd91f7d5838b0..545b5d0fd83a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -21,21 +21,22 @@ import java.util.Locale import breeze.stats.{distributions => dist} import org.apache.hadoop.fs.Path + import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.PredictorParams +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.feature.{Instance, OffsetInstance} import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.optim._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ -import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, DoubleType, StringType, StructType, StructField} +import org.apache.spark.sql.types.{DataType, DoubleType, StructType} import org.apache.spark.sql.SparkSession @@ -1504,5 +1505,4 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( "No summary table available for this GeneralizedLinearRegressionModel") } } - } From 602c3bd91ff481da6fb37743d1c007305b3d47a5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 25 Jan 2017 10:17:16 -0800 Subject: [PATCH 07/20] fix style issues --- .../ml/regression/GeneralizedLinearRegression.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 545b5d0fd83a0..7152e603bccb5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1478,10 +1478,9 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient, standard error, - * tValue and pValue. - * - */ + * Summary table with feature name, coefficient, standard error, + * tValue and pValue. 
+ */ @Since("2.2.0") lazy val summaryTable: DataFrame = { if (isNormalSolver) { @@ -1491,7 +1490,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (model.getFitIntercept) { featureNames = featureNames :+ "Intercept" coefficients = coefficients :+ model.intercept - // reorder so that intercept comes first + // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } val result = for (i <- idx.toSeq) yield From 6882be4b9e01502237b5272ce538ed4b2e47458f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 14 Feb 2017 11:35:54 -0800 Subject: [PATCH 08/20] change default name to use feature colname --- .../GeneralizedLinearRegression.scala | 18 ++++++++++------- .../GeneralizedLinearRegressionSuite.scala | 20 +++++++++---------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 7152e603bccb5..982c911b3480b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -471,6 +471,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 + private[regression] val Intercept: String = "Intercept" + /** * Wrapper of family and link combination used in the model. */ @@ -1209,14 +1211,15 @@ class GeneralizedLinearRegressionSummary private[regression] ( /** * Name of features. If the name cannot be retrieved from attributes, - * set default names to "V1", "V2", and so on. + * set default names to feature column name with numbered suffix "_0", "_1", and so on. 
*/ @Since("2.2.0") - lazy val featureName: Array[String] = { + lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)((x: Int) => ("V" + (x + 1))) + Array.tabulate[String](origModel.numFeatures)( + (x: Int) => (model.getFeaturesCol + "_" + x)) } else { featureAttrs.get.map(_.name.get) } @@ -1484,21 +1487,22 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( @Since("2.2.0") lazy val summaryTable: DataFrame = { if (isNormalSolver) { - var featureNames = featureName + var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { - featureNames = featureNames :+ "Intercept" + featureNamesLocal = featureNamesLocal :+ Intercept coefficients = coefficients :+ model.intercept // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } val result = for (i <- idx.toSeq) yield - (featureNames(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) + (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), + tValues(i), pValues(i)) val spark = SparkSession.builder().getOrCreate() import spark.implicits._ - result.toDF("Feature", "Estimate", "StdError", "TValue", "PValue").repartition(1) + result.toDF("Feature", "Coefficient", "StdError", "TValue", "PValue").repartition(1) } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 7750fddaea891..9b1e1fc7a5bf4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1546,12 +1546,12 @@ class GeneralizedLinearRegressionSuite val formula = new RFormula().setFormula("y ~ x1 + x2") val dataset2 = formula.fit(datasetTmp).transform(datasetTmp) - val expectedFeature = Seq(Array("V1", "V2"), Array("x1", "x2")) + val expectedFeature = Seq(Array("features_0", "features_1"), Array("x1", "x2")) var idx = 0 for (dataset <- Seq(dataset1, dataset2)) { val model = new GeneralizedLinearRegression().fit(dataset) - model.summary.featureName.zip(expectedFeature(idx)) + model.summary.featureNames.zip(expectedFeature(idx)) .foreach{ x => assert(x._1 === x._2) } idx += 1 } @@ -1566,8 +1566,8 @@ class GeneralizedLinearRegressionSuite Instance(2.0, 5.0, Vectors.dense(2.0, 3.0)) ).toDF() - val expectedFeature = Seq(Array("V1", "V2"), - Array("Intercept", "V1", "V2")) + val expectedFeature = Seq(Array("features_0", "features_1"), + Array("Intercept", "features_0", "features_1")) val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), @@ -1585,17 +1585,17 @@ class GeneralizedLinearRegressionSuite val model = trainer.fit(dataset) val summaryTable = model.summary.summaryTable - summaryTable.select("Feature").rdd.collect.map(_.getString(0)) + summaryTable.select("Feature").collect.map(_.getString(0)) .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - 
assert(Vectors.dense(summaryTable.select("Estimate").rdd.collect.map(_.getDouble(0))) - ~= expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") + assert(Vectors.dense(summaryTable.select("Coefficient").rdd.collect.map(_.getDouble(0))) + ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) - ~= expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) - ~= expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") + ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) - ~= expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") + ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") idx += 1 } From 8405501969155980817b73d466fca81ddc070d7f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 14 Feb 2017 14:08:06 -0800 Subject: [PATCH 09/20] glmTable --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../spark/ml/regression/GeneralizedLinearRegressionSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 982c911b3480b..2263fe2d18fd3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -471,7 +471,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 - private[regression] val Intercept: String = "Intercept" + private[regression] val Intercept: String = "(Intercept)" /** * Wrapper of family and link combination used in the model. 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 9b1e1fc7a5bf4..7969f69148c67 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1567,7 +1567,7 @@ class GeneralizedLinearRegressionSuite ).toDF() val expectedFeature = Seq(Array("features_0", "features_1"), - Array("Intercept", "features_0", "features_1")) + Array("(Intercept)", "features_0", "features_1")) val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538), Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), From 10f0f9bcf112a4019a41d87e8a6a841f5938bfaf Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 10:25:34 -0800 Subject: [PATCH 10/20] update R glm wrapper to use summaryTable --- .../GeneralizedLinearRegressionWrapper.scala | 61 ++++++------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index ee1fc9b14ceaa..89185e887e6cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -83,11 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper .setStringIndexerOrderType(stringIndexerOrderType) checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) - // get labels and feature names from output schema - val schema = rFormulaModel.transform(data).schema - val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) + // assemble and fit the pipeline val glr = new GeneralizedLinearRegression() .setFamily(family) @@ -112,44 +108,25 @@ private[r] object GeneralizedLinearRegressionWrapper pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel] val summary = glm.summary - val rFeatures: Array[String] = if (glm.getFitIntercept) { - Array("(Intercept)") ++ features - } else { - features - } + val features = summary.featureNames - val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - val rCoefficientStandardErrors = if (glm.getFitIntercept) { - Array(summary.coefficientStandardErrors.last) ++ - summary.coefficientStandardErrors.dropRight(1) - } else { - summary.coefficientStandardErrors - } - - val rTValues = if (glm.getFitIntercept) { - Array(summary.tValues.last) ++ summary.tValues.dropRight(1) - } else { - summary.tValues - } - - val rPValues = if (glm.getFitIntercept) { - Array(summary.pValues.last) ++ summary.pValues.dropRight(1) - } else { - summary.pValues - } - - if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues - } else { - glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues - } - } else { - if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray - } else { - glm.coefficients.toArray - } + val rFeatures: Array[String] = + summary.summaryTable.select("Feature").collect.map(_.getString(0)) + + var rCoefficients: Array[Double] = + 
summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) + + if (summary.isNormalSolver) { + val rCoefficientStandardErrors = + summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) + + val rTValues = + summary.summaryTable.select("TValue").collect.map(_.getDouble(0)) + + val rPValues = + summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) + + rCoefficients = rCoefficients ++ rCoefficientStandardErrors ++ rTValues ++ rPValues } val rDispersion: Double = summary.dispersion From 3d72cf52666dc2d28334419c3ed2572bebf7f45a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 10:55:34 -0800 Subject: [PATCH 11/20] clean up test --- .../GeneralizedLinearRegressionSuite.scala | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 7969f69148c67..b92253d1cfb82 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1558,6 +1558,21 @@ class GeneralizedLinearRegressionSuite } test("glm summary: summaryTable") { + /* + R code: + + A <- matrix(c(0, 1, 2, 3, 2, 5, 7, 11, 13, 3), 5, 2) + b <- c(2, 8, 3, 9, 2) + df <- as.data.frame(cbind(A, b)) + model <- glm(formula = "b ~ .", data = df) + summary(model) + + Coefficients: + Estimate Std. Error t value Pr(>|t|) + (Intercept) 0.7903 4.0129 0.197 0.862 + V1 0.2258 2.1153 0.107 0.925 + V2 0.4677 0.5815 0.804 0.506 + */ val dataset = Seq( Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)), Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)), @@ -1588,13 +1603,13 @@ class GeneralizedLinearRegressionSuite summaryTable.select("Feature").collect.map(_.getString(0)) .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(summaryTable.select("Coefficient").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("Coefficient").collect.map(_.getDouble(0))) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("StdError").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("StdError").collect.map(_.getDouble(0))) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("TValue").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("TValue").collect.map(_.getDouble(0))) ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("PValue").rdd.collect.map(_.getDouble(0))) + assert(Vectors.dense(summaryTable.select("PValue").collect.map(_.getDouble(0))) ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") idx += 1 From 07a6784d30e6baaa74750a65c9f1ee1d5f98167d Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 17 Feb 2017 14:46:03 -0800 Subject: [PATCH 12/20] fix issue in R wrapper --- .../GeneralizedLinearRegressionWrapper.scala | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 89185e887e6cb..efc37cb3b86fe 100644 
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -108,15 +108,13 @@ private[r] object GeneralizedLinearRegressionWrapper pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel] val summary = glm.summary - val features = summary.featureNames - - val rFeatures: Array[String] = - summary.summaryTable.select("Feature").collect.map(_.getString(0)) - - var rCoefficients: Array[Double] = - summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) + val rFeatures: Array[String] = if (glm.getFitIntercept) { + Array("(Intercept)") ++ summary.featureNames + } else { + summary.featureNames + } - if (summary.isNormalSolver) { + val rCoefficients: Array[Double] = if (summary.isNormalSolver) { val rCoefficientStandardErrors = summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) @@ -126,7 +124,14 @@ private[r] object GeneralizedLinearRegressionWrapper val rPValues = summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) - rCoefficients = rCoefficients ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) ++ + rCoefficientStandardErrors ++ rTValues ++ rPValues + } else { + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray + } else { + glm.coefficients.toArray + } } val rDispersion: Double = summary.dispersion From 1c1d3e6e0cb1e39fbf8afe0660b5033809415df9 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 24 Feb 2017 10:35:27 -0800 Subject: [PATCH 13/20] sort import --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2263fe2d18fd3..2a3d236301f8e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -34,11 +34,9 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} -import org.apache.spark.sql.SparkSession - /** * Params for Generalized Linear Regression. 
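A minimal usage sketch (not part of any patch in this series) of the summary API as it stands after patch 13, mirroring the toy data used in the test suite above. It assumes a SparkSession bound to a val named `spark` with `spark.implicits._` in scope; the val names here are hypothetical, and the column names follow the `summaryTable` definition introduced in patch 8. Note that later patches in the series rework this table into `coefficientsWithStatistics` plus a `toString`/`show` on the training summary.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import spark.implicits._

// Toy Gaussian data; the "features" column carries no ML attributes.
val training = Seq(
  (2.0, Vectors.dense(0.0, 5.0)),
  (8.0, Vectors.dense(1.0, 7.0)),
  (3.0, Vectors.dense(2.0, 11.0)),
  (9.0, Vectors.dense(3.0, 13.0)),
  (2.0, Vectors.dense(2.0, 3.0))
).toDF("label", "features")

val model = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .fit(training)

// With no attributes on the features column, names fall back to "<featuresCol>_<index>".
println(model.summary.featureNames.mkString(", "))  // features_0, features_1

// Single-partition DataFrame with columns Feature | Coefficient | StdError | TValue | PValue;
// "(Intercept)" is listed first because fitIntercept defaults to true.
model.summary.summaryTable.show()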
From a16cbee4e86cf044a90015bdd6900b9a22116200 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 14:45:54 -0700 Subject: [PATCH 14/20] use 2D array for summary table --- .../GeneralizedLinearRegressionWrapper.scala | 15 +- .../GeneralizedLinearRegression.scala | 160 ++++++++++++++++-- .../GeneralizedLinearRegressionSuite.scala | 21 +-- 3 files changed, 157 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index efc37cb3b86fe..13cd9b602f011 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,17 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - val rCoefficientStandardErrors = - summary.summaryTable.select("StdError").collect.map(_.getDouble(0)) - - val rTValues = - summary.summaryTable.select("TValue").collect.map(_.getDouble(0)) - - val rPValues = - summary.summaryTable.select("PValue").collect.map(_.getDouble(0)) - - summary.summaryTable.select("Coefficient").collect.map(_.getDouble(0)) ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues + summary.coefficientMatrix.map(_._2) ++ + summary.coefficientMatrix.map(_._3) ++ + summary.coefficientMatrix.map(_._4) ++ + summary.coefficientMatrix.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2a3d236301f8e..72b3c8d399615 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -20,6 +20,9 @@ package org.apache.spark.ml.regression import java.util.Locale import breeze.stats.{distributions => dist} + +import org.apache.commons.lang3.StringUtils + import org.apache.hadoop.fs.Path import org.apache.spark.SparkException @@ -34,7 +37,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} @@ -1211,8 +1214,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( * Name of features. If the name cannot be retrieved from attributes, * set default names to feature column name with numbered suffix "_0", "_1", and so on. */ - @Since("2.2.0") - lazy val featureNames: Array[String] = { + private[ml] lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes if (featureAttrs == None) { @@ -1479,31 +1481,165 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Summary table with feature name, coefficient, standard error, + * Coefficient matrix with feature name, coefficient, standard error, * tValue and pValue. 
*/ - @Since("2.2.0") - lazy val summaryTable: DataFrame = { + @Since("2.3.0") + lazy val coefficientMatrix: Array[(String, Double, Double, Double, Double)] = { if (isNormalSolver) { var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray var idx = Array.range(0, coefficients.length) if (model.getFitIntercept) { - featureNamesLocal = featureNamesLocal :+ Intercept + featureNamesLocal = featureNamesLocal :+ "(Intercept)" coefficients = coefficients :+ model.intercept // Reorder so that intercept comes first idx = (coefficients.length - 1) +: idx } - val result = for (i <- idx.toSeq) yield + val result = for (i <- idx) yield (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - - val spark = SparkSession.builder().getOrCreate() - import spark.implicits._ - result.toDF("Feature", "Coefficient", "StdError", "TValue", "PValue").repartition(1) + result } else { throw new UnsupportedOperationException( "No summary table available for this GeneralizedLinearRegressionModel") } } + + private def round(x: Double, digit: Int): String = { + BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() + } + + private[regression] def showString(_numRows: Int, truncate: Int = 20, + numDigits: Int = 3): String = { + val numRows = _numRows.max(1) + val data = coefficientMatrix.take(numRows) + val hasMoreData = coefficientMatrix.size > numRows + + val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") + val numCols = colNames.size + + val rows = colNames +: data.map( row => { + val mrow = for (cell <- row.productIterator) yield { + val str = cell match { + case s: String => s + case n: Double => round(n, numDigits).toString + } + if (truncate > 0 && str.length > truncate) { + // do not show ellipses for strings shorter than 4 characters. + if (truncate < 4) str.substring(0, truncate) + else str.substring(0, truncate - 3) + "..." 
+ } else { + str + } + } + mrow.toArray + }) + + val sb = new StringBuilder + val colWidths = Array.fill(numCols)(3) + + // Compute the width of each column + for (row <- rows) { + for ((cell, i) <- row.zipWithIndex) { + colWidths(i) = math.max(colWidths(i), cell.length) + } + } + + // Create SeparateLine + val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() + + // column names + rows.head.zipWithIndex.map { case (cell, i) => + if (truncate > 0) { + StringUtils.leftPad(cell, colWidths(i)) + } else { + StringUtils.rightPad(cell, colWidths(i)) + } + }.addString(sb, "|", "|", "|\n") + sb.append(sep) + + // data + rows.tail.map { + _.zipWithIndex.map { case (cell, i) => + if (truncate > 0) { + StringUtils.leftPad(cell.toString, colWidths(i)) + } else { + StringUtils.rightPad(cell.toString, colWidths(i)) + } + }.addString(sb, "|", "|", "|\n") + } + + // For Data that has more than "numRows" records + if (hasMoreData) { + sb.append("...\n") + sb.append(sep) + val rowsString = if (numRows == 1) "row" else "rows" + sb.append(s"only showing top $numRows $rowsString\n") + } else { + sb.append(sep) + } + + sb.append("\n") + sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + + round(dispersion, numDigits) + ")") + + sb.append("\n") + val nd = "Null deviance: " + round(nullDeviance, numDigits) + + s" on $degreesOfFreedom degrees of freedom" + val rd = "Residual deviance: " + round(deviance, numDigits) + + s" on $residualDegreeOfFreedom degrees of freedom" + val l = math.max(nd.length, rd.length) + sb.append(StringUtils.leftPad(nd, l)) + sb.append("\n") + sb.append(StringUtils.leftPad(rd, l)) + + if (family.name != "tweedie") { + sb.append("\n") + sb.append(s"AIC: " + round(aic, numDigits)) + } + + sb.toString() + } + + /** + * Displays the summary of a GeneralizedLinearModel fit. + * + * @since 2.3.0 + */ + def show(): Unit = { + val numRows = coefficientMatrix.size + show(numRows, true, 3) + } + + /** + * Displays the top numRows rows of the summary of a GeneralizedLinearModel fit. + * + * @param numRows Number of rows to show + * + * @since 2.3.0 + */ + @Since("2.3.0") + def show(numRows: Int): Unit = { + show(numRows, true, 3) + } + + /** + * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters + * will be truncated, and all cells will be aligned right. + * + * @param numRows Number of rows to show + * @param truncate Whether truncate long strings. If true, strings more than 20 characters will + * be truncated and all cells will be aligned right + * @param numDigits Number of decimal places used to round numerical values. 
+ * + * @since 2.3.0 + */ + // scalastyle:off println + def show(numRows: Int, truncate: Boolean, numDigits: Int): Unit = if (truncate) { + println(showString(numRows, truncate = 20, numDigits)) + } else { + println(showString(numRows, truncate = 0, numDigits)) + } + // scalastyle:on println } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index b92253d1cfb82..e6202f1e4ce37 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1524,7 +1524,6 @@ class GeneralizedLinearRegressionSuite .fit(datasetGaussianIdentity.as[LabeledPoint]) } - test("glm summary: feature name") { // dataset1 with no attribute val dataset1 = Seq( @@ -1557,7 +1556,7 @@ class GeneralizedLinearRegressionSuite } } - test("glm summary: summaryTable") { + test("glm summary: coefficient matrix") { /* R code: @@ -1587,10 +1586,6 @@ class GeneralizedLinearRegressionSuite Vectors.dense(0.7903, 0.2258, 0.4677)) val expectedStdError = Seq(Vectors.dense(1.724, 0.3787), Vectors.dense(4.0129, 2.1153, 0.5815)) - val expectedTValue = Seq(Vectors.dense(0.1673, 1.4205), - Vectors.dense(0.1969, 0.1067, 0.8043)) - val expectedPValue = Seq(Vectors.dense(0.8778, 0.2506), - Vectors.dense(0.8621, 0.9247, 0.5056)) var idx = 0 for (fitIntercept <- Seq(false, true)) { @@ -1598,20 +1593,14 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val summaryTable = model.summary.summaryTable + val coefficientMatrix = model.summary.coefficientMatrix - summaryTable.select("Feature").collect.map(_.getString(0)) - .zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + coefficientMatrix.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(summaryTable.select("Coefficient").collect.map(_.getDouble(0))) + assert(Vectors.dense(coefficientMatrix.map(_._2)) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("StdError").collect.map(_.getDouble(0))) + assert(Vectors.dense(coefficientMatrix.map(_._3)) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("TValue").collect.map(_.getDouble(0))) - ~== expectedTValue(idx) absTol 1E-3, "TValue mismatch in summaryTable") - assert(Vectors.dense(summaryTable.select("PValue").collect.map(_.getDouble(0))) - ~== expectedPValue(idx) absTol 1E-3, "PValue mismatch in summaryTable") - idx += 1 } } From 640d56442e6f5d1a14b4a0cb895d6da713b003fd Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 16:37:09 -0700 Subject: [PATCH 15/20] fix import --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 72b3c8d399615..383c7c0ff28a5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -20,9 +20,7 @@ package org.apache.spark.ml.regression import 
java.util.Locale import breeze.stats.{distributions => dist} - import org.apache.commons.lang3.StringUtils - import org.apache.hadoop.fs.Path import org.apache.spark.SparkException From 57f1e5c259d7f237324dd1b3b481b7e82952b53e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 17 Jul 2017 16:45:08 -0700 Subject: [PATCH 16/20] remove intercept --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 383c7c0ff28a5..5164622063a8f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -470,8 +470,6 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine private[regression] val epsilon: Double = 1E-16 - private[regression] val Intercept: String = "(Intercept)" - /** * Wrapper of family and link combination used in the model. */ From 167af016b5319b71137102dd69e3b8331616d4ad Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 19 Jul 2017 10:27:57 -0700 Subject: [PATCH 17/20] simplify show method --- .../GeneralizedLinearRegressionWrapper.scala | 8 ++-- .../GeneralizedLinearRegression.scala | 48 ++++--------------- .../GeneralizedLinearRegressionSuite.scala | 8 ++-- 3 files changed, 18 insertions(+), 46 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 13cd9b602f011..1e7ece6bc7199 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,10 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - summary.coefficientMatrix.map(_._2) ++ - summary.coefficientMatrix.map(_._3) ++ - summary.coefficientMatrix.map(_._4) ++ - summary.coefficientMatrix.map(_._5) + summary.coefficientCollection.map(_._2) ++ + summary.coefficientCollection.map(_._3) ++ + summary.coefficientCollection.map(_._4) ++ + summary.coefficientCollection.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 5164622063a8f..9aa439745a73d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1477,11 +1477,10 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Coefficient matrix with feature name, coefficient, standard error, + * Collection of feature name, coefficient, standard error, * tValue and pValue. 
*/ - @Since("2.3.0") - lazy val coefficientMatrix: Array[(String, Double, Double, Double, Double)] = { + private[ml] lazy val coefficientCollection: Array[(String, Double, Double, Double, Double)] = { if (isNormalSolver) { var featureNamesLocal = featureNames var coefficients = model.coefficients.toArray @@ -1498,7 +1497,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( result } else { throw new UnsupportedOperationException( - "No summary table available for this GeneralizedLinearRegressionModel") + "No summary available for this GeneralizedLinearRegressionModel") } } @@ -1509,8 +1508,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( private[regression] def showString(_numRows: Int, truncate: Int = 20, numDigits: Int = 3): String = { val numRows = _numRows.max(1) - val data = coefficientMatrix.take(numRows) - val hasMoreData = coefficientMatrix.size > numRows + val data = coefficientCollection.take(numRows) + val hasMoreData = coefficientCollection.size > numRows val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") val numCols = colNames.size @@ -1598,44 +1597,17 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( sb.toString() } - /** - * Displays the summary of a GeneralizedLinearModel fit. - * - * @since 2.3.0 - */ - def show(): Unit = { - val numRows = coefficientMatrix.size - show(numRows, true, 3) - } - - /** - * Displays the top numRows rows of the summary of a GeneralizedLinearModel fit. - * - * @param numRows Number of rows to show - * - * @since 2.3.0 - */ - @Since("2.3.0") - def show(numRows: Int): Unit = { - show(numRows, true, 3) - } - /** * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters - * will be truncated, and all cells will be aligned right. - * - * @param numRows Number of rows to show - * @param truncate Whether truncate long strings. If true, strings more than 20 characters will - * be truncated and all cells will be aligned right - * @param numDigits Number of decimal places used to round numerical values. + * will be truncated, and all cells will be aligned right. Numbers are rounded to three + * decimal places. 
* * @since 2.3.0 */ // scalastyle:off println - def show(numRows: Int, truncate: Boolean, numDigits: Int): Unit = if (truncate) { - println(showString(numRows, truncate = 20, numDigits)) - } else { - println(showString(numRows, truncate = 0, numDigits)) + def show(): Unit = { + println(showString(coefficientCollection.size, truncate = 20, 3)) } // scalastyle:on println + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index e6202f1e4ce37..5a6c884ea43f5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1593,13 +1593,13 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val coefficientMatrix = model.summary.coefficientMatrix + val coefficients = model.summary.coefficientCollection - coefficientMatrix.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, + coefficients.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, "Feature name mismatch in summaryTable") } - assert(Vectors.dense(coefficientMatrix.map(_._2)) + assert(Vectors.dense(coefficients.map(_._2)) ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(coefficientMatrix.map(_._3)) + assert(Vectors.dense(coefficients.map(_._3)) ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") idx += 1 } From 174fc49142f2915c46fc53df4cb024d2e97cc6ca Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 19 Jul 2017 10:30:51 -0700 Subject: [PATCH 18/20] fix align issue --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 9aa439745a73d..392765c20fbe2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1505,7 +1505,8 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() } - private[regression] def showString(_numRows: Int, truncate: Int = 20, + private[regression] def showString(_numRows: Int, + truncate: Int = 20, numDigits: Int = 3): String = { val numRows = _numRows.max(1) val data = coefficientCollection.take(numRows) From be11106f42e6bcccbb58222a3bc75a18111cdccc Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Jul 2017 12:47:20 +0800 Subject: [PATCH 19/20] Refactor GLR summary toString. 
--- .../GeneralizedLinearRegressionWrapper.scala | 8 +- .../GeneralizedLinearRegression.scala | 193 +++++++----------- .../GeneralizedLinearRegressionSuite.scala | 18 +- 3 files changed, 88 insertions(+), 131 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 1e7ece6bc7199..176a6cf852914 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -115,10 +115,10 @@ private[r] object GeneralizedLinearRegressionWrapper } val rCoefficients: Array[Double] = if (summary.isNormalSolver) { - summary.coefficientCollection.map(_._2) ++ - summary.coefficientCollection.map(_._3) ++ - summary.coefficientCollection.map(_._4) ++ - summary.coefficientCollection.map(_._5) + summary.coefficientsWithStatistics.map(_._2) ++ + summary.coefficientsWithStatistics.map(_._3) ++ + summary.coefficientsWithStatistics.map(_._4) ++ + summary.coefficientsWithStatistics.map(_._5) } else { if (glm.getFitIntercept) { Array(glm.intercept) ++ glm.coefficients.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 392765c20fbe2..f489feb65274c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -142,6 +142,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam /** * Param for offset column name. If this is not set or empty, we treat all instance offsets * as 0.0. The feature specified as offset has a constant coefficient of 1.0. + * * @group param */ @Since("2.3.0") @@ -326,6 +327,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val * * Default is 0.0. * + * * @group setParam */ @Since("2.0.0") @@ -1213,11 +1215,10 @@ class GeneralizedLinearRegressionSummary private[regression] ( private[ml] lazy val featureNames: Array[String] = { val featureAttrs = AttributeGroup.fromStructField( dataset.schema(model.getFeaturesCol)).attributes - if (featureAttrs == None) { - Array.tabulate[String](origModel.numFeatures)( - (x: Int) => (model.getFeaturesCol + "_" + x)) - } else { + if (featureAttrs.isDefined) { featureAttrs.get.map(_.name.get) + } else { + Array.tabulate[String](origModel.numFeatures)((x: Int) => model.getFeaturesCol + "_" + x) } } @@ -1477,138 +1478,94 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * Collection of feature name, coefficient, standard error, - * tValue and pValue. + * coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. 
*/ - private[ml] lazy val coefficientCollection: Array[(String, Double, Double, Double, Double)] = { - if (isNormalSolver) { - var featureNamesLocal = featureNames - var coefficients = model.coefficients.toArray - var idx = Array.range(0, coefficients.length) - if (model.getFitIntercept) { - featureNamesLocal = featureNamesLocal :+ "(Intercept)" - coefficients = coefficients :+ model.intercept - // Reorder so that intercept comes first - idx = (coefficients.length - 1) +: idx - } - val result = for (i <- idx) yield - (featureNamesLocal(i), coefficients(i), coefficientStandardErrors(i), + private[ml] lazy val coefficientsWithStatistics: Array[ + (String, Double, Double, Double, Double)] = { + var featureNamesLocal = featureNames + var coefficientsArray = model.coefficients.toArray + var index = Array.range(0, coefficientsArray.length) + if (model.getFitIntercept) { + featureNamesLocal = featureNamesLocal :+ "(Intercept)" + coefficientsArray = coefficientsArray :+ model.intercept + // Reorder so that intercept comes first + index = (coefficientsArray.length - 1) +: index + } + index.map { i => + (featureNamesLocal(i), coefficientsArray(i), coefficientStandardErrors(i), tValues(i), pValues(i)) - result - } else { - throw new UnsupportedOperationException( - "No summary available for this GeneralizedLinearRegressionModel") } } - private def round(x: Double, digit: Int): String = { - BigDecimal(x).setScale(digit, BigDecimal.RoundingMode.HALF_UP).toString() - } + override def toString: String = { + if (isNormalSolver) { - private[regression] def showString(_numRows: Int, - truncate: Int = 20, - numDigits: Int = 3): String = { - val numRows = _numRows.max(1) - val data = coefficientCollection.take(numRows) - val hasMoreData = coefficientCollection.size > numRows - - val colNames = Array("Feature", "Estimate", "StdError", "TValue", "PValue") - val numCols = colNames.size - - val rows = colNames +: data.map( row => { - val mrow = for (cell <- row.productIterator) yield { - val str = cell match { - case s: String => s - case n: Double => round(n, numDigits).toString - } - if (truncate > 0 && str.length > truncate) { - // do not show ellipses for strings shorter than 4 characters. - if (truncate < 4) str.substring(0, truncate) - else str.substring(0, truncate - 3) + "..." - } else { - str - } + def round(x: Double): String = { + BigDecimal(x).setScale(5, BigDecimal.RoundingMode.HALF_UP).toString } - mrow.toArray - }) - val sb = new StringBuilder - val colWidths = Array.fill(numCols)(3) + val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value") - // Compute the width of each column - for (row <- rows) { - for ((cell, i) <- row.zipWithIndex) { - colWidths(i) = math.max(colWidths(i), cell.length) + val data = coefficientsWithStatistics.map { row => + val strRow = row.productIterator.map { cell => + val str = cell match { + case s: String => s + case n: Double => round(n) + } + // Truncate if length > 20 + if (str.length > 20) { + str.substring(0, 17) + "..." 
+ } else { + str + } + } + strRow.toArray } - } - - // Create SeparateLine - val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString() - // column names - rows.head.zipWithIndex.map { case (cell, i) => - if (truncate > 0) { - StringUtils.leftPad(cell, colWidths(i)) - } else { - StringUtils.rightPad(cell, colWidths(i)) + // Compute the width of each column + val colWidths = colNames.map(_.length) + data.foreach { strRow => + strRow.zipWithIndex.foreach { case (cell: String, i: Int) => + colWidths(i) = math.max(colWidths(i), cell.length) + } } - }.addString(sb, "|", "|", "|\n") - sb.append(sep) - // data - rows.tail.map { - _.zipWithIndex.map { case (cell, i) => - if (truncate > 0) { + val sb = new StringBuilder + + // Output coefficients with statistics + sb.append("Coefficients:\n") + colNames.zipWithIndex.map { case (colName: String, i: Int) => + StringUtils.leftPad(colName, colWidths(i)) + }.addString(sb, "", " ", "\n") + + data.foreach { case strRow: Array[String] => + strRow.zipWithIndex.map { case (cell: String, i: Int) => StringUtils.leftPad(cell.toString, colWidths(i)) - } else { - StringUtils.rightPad(cell.toString, colWidths(i)) - } - }.addString(sb, "|", "|", "|\n") - } + }.addString(sb, "", " ", "\n") + } - // For Data that has more than "numRows" records - if (hasMoreData) { - sb.append("...\n") - sb.append(sep) - val rowsString = if (numRows == 1) "row" else "rows" - sb.append(s"only showing top $numRows $rowsString\n") - } else { - sb.append(sep) - } + sb.append("\n") + sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + + s"${round(dispersion)})") - sb.append("\n") - sb.append(s"(Dispersion parameter for ${family.name} family taken to be " + - round(dispersion, numDigits) + ")") - - sb.append("\n") - val nd = "Null deviance: " + round(nullDeviance, numDigits) + - s" on $degreesOfFreedom degrees of freedom" - val rd = "Residual deviance: " + round(deviance, numDigits) + - s" on $residualDegreeOfFreedom degrees of freedom" - val l = math.max(nd.length, rd.length) - sb.append(StringUtils.leftPad(nd, l)) - sb.append("\n") - sb.append(StringUtils.leftPad(rd, l)) - - if (family.name != "tweedie") { sb.append("\n") - sb.append(s"AIC: " + round(aic, numDigits)) - } + val nd = s"Null deviance: ${round(nullDeviance)} on $degreesOfFreedom degrees of freedom" + val rd = s"Residual deviance: ${round(deviance)} on $residualDegreeOfFreedom degrees of " + + "freedom" + val l = math.max(nd.length, rd.length) + sb.append(StringUtils.leftPad(nd, l)) + sb.append("\n") + sb.append(StringUtils.leftPad(rd, l)) - sb.toString() - } + if (family.name != "tweedie") { + sb.append("\n") + sb.append(s"AIC: " + round(aic)) + } - /** - * Displays the summary of a GeneralizedLinearModel fit. Strings more than 20 characters - * will be truncated, and all cells will be aligned right. Numbers are rounded to three - * decimal places. 
- * - * @since 2.3.0 - */ - // scalastyle:off println - def show(): Unit = { - println(showString(coefficientCollection.size, truncate = 20, 3)) + sb.toString() + } else { + throw new UnsupportedOperationException( + "No summary available for this GeneralizedLinearRegressionModel") + } } - // scalastyle:on println - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 5a6c884ea43f5..df7dee869d058 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1556,7 +1556,7 @@ class GeneralizedLinearRegressionSuite } } - test("glm summary: coefficient matrix") { + test("glm summary: coefficient with statistics") { /* R code: @@ -1593,14 +1593,14 @@ class GeneralizedLinearRegressionSuite .setFamily("gaussian") .setFitIntercept(fitIntercept) val model = trainer.fit(dataset) - val coefficients = model.summary.coefficientCollection - - coefficients.map(_._1).zip(expectedFeature(idx)).foreach{ x => assert(x._1 === x._2, - "Feature name mismatch in summaryTable") } - assert(Vectors.dense(coefficients.map(_._2)) - ~== expectedEstimate(idx) absTol 1E-3, "Coefficient mismatch in summaryTable") - assert(Vectors.dense(coefficients.map(_._3)) - ~== expectedStdError(idx) absTol 1E-3, "Standard error mismatch in summaryTable") + val coefficientsWithStatistics = model.summary.coefficientsWithStatistics + + coefficientsWithStatistics.map(_._1).zip(expectedFeature(idx)).foreach { x => + assert(x._1 === x._2, "Feature name mismatch in coefficientsWithStatistics") } + assert(Vectors.dense(coefficientsWithStatistics.map(_._2)) ~= expectedEstimate(idx) + absTol 1E-3, "Coefficients mismatch in coefficientsWithStatistics") + assert(Vectors.dense(coefficientsWithStatistics.map(_._3)) ~= expectedStdError(idx) + absTol 1E-3, "Standard error mismatch in coefficientsWithStatistics") idx += 1 } } From 7281b77880898f5cb421467ef82e10ad42a17638 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Wed, 26 Jul 2017 10:27:30 -0700 Subject: [PATCH 20/20] fix style --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index f489feb65274c..beca5956a2d94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -327,7 +327,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val * * Default is 0.0. * - * * @group setParam */ @Since("2.0.0") @@ -1478,7 +1477,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( } /** - * coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. + * Coefficients with statistics: feature name, coefficients, standard error, tValue and pValue. 
*/ private[ml] lazy val coefficientsWithStatistics: Array[ (String, Double, Double, Double, Double)] = { @@ -1501,7 +1500,7 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( if (isNormalSolver) { def round(x: Double): String = { - BigDecimal(x).setScale(5, BigDecimal.RoundingMode.HALF_UP).toString + BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString } val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")
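
For context on this final hunk, a self-contained sketch, not part of the patch, of the same right-alignment technique: round each value with BigDecimal.setScale, size each column to the wider of its header and its widest cell, and pad cells with StringUtils.leftPad. The feature names and statistics in the sample rows are invented, and commons-lang3 on the classpath is an assumption (as it already is for Spark itself).

    import org.apache.commons.lang3.StringUtils

    object CoefficientTableSketch {
      // Round to 4 decimal places, mirroring the helper in this hunk.
      private def round(x: Double): String =
        BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString

      def main(args: Array[String]): Unit = {
        val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")
        // Hypothetical rows: (feature name, estimate, std error, t value, p value).
        val rows = Array(
          ("(Intercept)", 2.7355, 1.0036, 2.7257, 0.0127),
          ("V1", 0.5225, 0.2753, 1.8981, 0.0702),
          ("V2", -0.3630, 0.2055, -1.7664, 0.0945))

        // Render every cell as a string first.
        val data = rows.map { row =>
          row.productIterator.map {
            case s: String => s
            case d: Double => round(d)
          }.toArray
        }

        // Column width = max(header width, widest cell) so leftPad right-aligns each column.
        val colWidths = colNames.map(_.length)
        data.foreach(_.zipWithIndex.foreach { case (cell, i) =>
          colWidths(i) = math.max(colWidths(i), cell.length)
        })

        val sb = new StringBuilder("Coefficients:\n")
        colNames.zipWithIndex.map { case (name, i) => StringUtils.leftPad(name, colWidths(i)) }
          .addString(sb, "", " ", "\n")
        data.foreach { row =>
          row.zipWithIndex.map { case (cell, i) => StringUtils.leftPad(cell, colWidths(i)) }
            .addString(sb, "", " ", "\n")
        }
        println(sb.toString())
      }
    }

Right alignment keeps the decimal points of the numeric columns roughly stacked, which is why the patch pads even the header row to the computed column widths.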