[SPARK-13925] [ML] [SparkR] Expose R-like summary statistics in SparkR::glm for more family and link functions #12393

yanboliang · 2016-04-14T14:00:05Z

Add this S3 function for formatted output of summary(GeneralizedLinearRegressionModel).

mengxr · 2016-04-15T04:46:28Z

12.19313 -> baseSummary$deviance

Here we just test that R's native summary works by checking the value of baseSummary$deviance; we do not compare it with the output of other functions. This is consistent with the other test cases.

mengxr · 2016-04-15T04:40:36Z

~~We can remove the r prefix and just call it features. The whole class is a R wrapper.~~ No worries. I saw why.

-Original file line number
+Diff line change
@@ Expand Up / @@ -292,7 +292,8 @@ export("as.DataFrame", @@
            "tableToDF",
            "tableNames",
            "tables",
-           "uncacheTable")
+           "uncacheTable",
+           "print.summary.GeneralizedLinearRegressionModel")
     export("structField",
            "structField.jobj",
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
                 jobj <- object@jobj
                 features <- callJMethod(jobj, "rFeatures")
                 coefficients <- callJMethod(jobj, "rCoefficients")
-                coefficients <- as.matrix(unlist(coefficients))
-                colnames(coefficients) <- c("Estimate")
+                deviance.resid <- callJMethod(jobj, "rDevianceResiduals")
+                dispersion <- callJMethod(jobj, "rDispersion")
+                null.deviance <- callJMethod(jobj, "rNullDeviance")
+                deviance <- callJMethod(jobj, "rDeviance")
+                df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
+                df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
+                aic <- callJMethod(jobj, "rAic")
+                iter <- callJMethod(jobj, "rNumIterations")
+                family <- callJMethod(jobj, "rFamily")
+                deviance.resid <- dataFrame(deviance.resid)
+                coefficients <- matrix(coefficients, ncol = 4)
+                colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
                 rownames(coefficients) <- unlist(features)
-                return(list(coefficients = coefficients))
+                ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
+                            dispersion = dispersion, null.deviance = null.deviance,
+                            deviance = deviance, df.null = df.null, df.residual = df.residual,
+                            aic = aic, iter = iter, family = family)
+                class(ans) <- "summary.GeneralizedLinearRegressionModel"
+                return(ans)
               })
+    #' Print the summary of GeneralizedLinearRegressionModel
+    #'
+    #' Writes the deviance residual quantiles, the coefficient table, the dispersion
+    #' parameter, the null and residual deviance with their degrees of freedom, the AIC
+    #' and the number of Fisher scoring iterations to the console.
+    #'
+    #' @param x a summary object providing the elements \code{deviance.resid},
+    #'          \code{coefficients}, \code{family}, \code{dispersion}, \code{null.deviance},
+    #'          \code{deviance}, \code{df.null}, \code{df.residual}, \code{aic} and \code{iter}
+    #' @param ... not used by this method
+    #' @return \code{x}, invisibly, with \code{deviance.resid} replaced by the named vector
+    #'         of approximate quantiles that was printed
+    #' @rdname print
+    #' @name print.summary.GeneralizedLinearRegressionModel
+    #' @export
+    print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
+      # Reduce the residuals to five approximate quantiles (relative error 0.01)
+      # and label them the way the printed header expects.
+      x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
+        c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max"))
+      x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
+      cat("\nDeviance Residuals: \n")
+      cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
+      print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)
+      cat("\nCoefficients:\n")
+      print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)
+      # cbind() builds one row per deviance kind (null, residual); apply() with
+      # MARGIN = 1L pastes each row into a single output line.
+      cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
+        ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
+        format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
+        " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"),
+        1L, paste, collapse = " "), sep = "")
+      cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
+        "Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
+      cat("\n")
+      invisible(x)
+    }
     #' Make predictions from a generalized linear model
     #'
     #' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -77,6 +77,55 @@ test_that("glm and predict", { @@
       expect_equal(length(predict(lm(y ~ x))), 15)
     })
+    test_that("glm summary", {
+      # gaussian family (no family argument is passed): compare the SparkR summary with base R's on iris
+      training <- suppressWarnings(createDataFrame(sqlContext, iris))
+      stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+      rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+      coefs <- unlist(stats$coefficients)
+      rCoefs <- unlist(rStats$coefficients)
+      expect_true(all(abs(rCoefs - coefs) < 1e-4))
+      expect_true(all(
+        rownames(stats$coefficients) ==
+        c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+      expect_equal(stats$dispersion, rStats$dispersion)
+      expect_equal(stats$null.deviance, rStats$null.deviance)
+      expect_equal(stats$deviance, rStats$deviance)
+      expect_equal(stats$df.null, rStats$df.null)
+      expect_equal(stats$df.residual, rStats$df.residual)
+      expect_equal(stats$aic, rStats$aic)
+      # binomial family with logit link, fitted on the versicolor and virginica rows only
+      df <- suppressWarnings(createDataFrame(sqlContext, iris))
+      training <- df[df$Species %in% c("versicolor", "virginica"), ]
+      stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
+        family = binomial(link = "logit")))
+      rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+      rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+        family = binomial(link = "logit")))
+      coefs <- unlist(stats$coefficients)
+      rCoefs <- unlist(rStats$coefficients)
+      expect_true(all(abs(rCoefs - coefs) < 1e-4))
+      expect_true(all(
+        rownames(stats$coefficients) ==
+        c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+      expect_equal(stats$dispersion, rStats$dispersion)
+      expect_equal(stats$null.deviance, rStats$null.deviance)
+      expect_equal(stats$deviance, rStats$deviance)
+      expect_equal(stats$df.null, rStats$df.null)
+      expect_equal(stats$df.residual, rStats$df.residual)
+      expect_equal(stats$aic, rStats$aic)
+      # Test summary works on base GLM models: a stats::glm fit must still summarise, with deviance 12.19313
+      baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+      baseSummary <- summary(baseModel)
+      expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+    })
     test_that("kmeans", {
       newIris <- iris
       newIris$Species <- NULL
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
       private val glm: GeneralizedLinearRegressionModel =
         pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]
+      lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
+        Array("(Intercept)") ++ features
+      } else {
+        features
+      }
       lazy val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
-        Array(glm.intercept) ++ glm.coefficients.toArray
+        Array(glm.intercept) ++ glm.coefficients.toArray ++
+          rCoefficientStandardErrors ++ rTValues ++ rPValues
       } else {
-        glm.coefficients.toArray
+        glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
       }
-      lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
-        Array("(Intercept)") ++ features
+      private lazy val rCoefficientStandardErrors = if (glm.getFitIntercept) {
+        Array(glm.summary.coefficientStandardErrors.last) ++
+          glm.summary.coefficientStandardErrors.dropRight(1)
       } else {
-        features
+        glm.summary.coefficientStandardErrors
+      }
+      private lazy val rTValues = if (glm.getFitIntercept) {
+        Array(glm.summary.tValues.last) ++ glm.summary.tValues.dropRight(1)
+      } else {
+        glm.summary.tValues
       }
-      def transform(dataset: DataFrame): DataFrame = {
+      private lazy val rPValues = if (glm.getFitIntercept) {
+        Array(glm.summary.pValues.last) ++ glm.summary.pValues.dropRight(1)
+      } else {
+        glm.summary.pValues
+      }
+      lazy val rDispersion: Double = glm.summary.dispersion
+      lazy val rNullDeviance: Double = glm.summary.nullDeviance
+      lazy val rDeviance: Double = glm.summary.deviance
+      lazy val rResidualDegreeOfFreedomNull: Long = glm.summary.residualDegreeOfFreedomNull
+      lazy val rResidualDegreeOfFreedom: Long = glm.summary.residualDegreeOfFreedom
+      lazy val rAic: Double = glm.summary.aic
+      lazy val rNumIterations: Int = glm.summary.numIterations
+      lazy val rDevianceResiduals: DataFrame = glm.summary.residuals()
+      lazy val rFamily: String = glm.getFamily
+      def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType)
+      def transform(dataset: Dataset[_]): DataFrame = {
         pipeline.transform(dataset).drop(glm.getFeaturesCol)
       }
     }
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SPARK-13925] [ML] [SparkR] Expose R-like summary statistics in SparkR::glm for more family and link functions #12393

Uh oh!

Diff view

Diff view

There are no files selected for viewing

yanboliang Apr 14, 2016

Uh oh!

mengxr Apr 15, 2016

Uh oh!

yanboliang Apr 15, 2016 •

edited

Loading

Uh oh!

mengxr Apr 15, 2016 •

edited

Loading

Uh oh!

[SPARK-13925] [ML] [SparkR] Expose R-like summary statistics in SparkR::glm for more family and link functions #12393

Uh oh!

[SPARK-13925] [ML] [SparkR] Expose R-like summary statistics in SparkR::glm for more family and link functions #12393

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

yanboliang Apr 14, 2016

Choose a reason for hiding this comment

Uh oh!

mengxr Apr 15, 2016

Choose a reason for hiding this comment

Uh oh!

yanboliang Apr 15, 2016 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

mengxr Apr 15, 2016 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

yanboliang Apr 15, 2016 •

edited

Loading

mengxr Apr 15, 2016 •

edited

Loading