Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,8 @@ export("as.DataFrame",
"tableToDF",
"tableNames",
"tables",
"uncacheTable")
"uncacheTable",
"print.summary.GeneralizedLinearRegressionModel")

export("structField",
"structField.jobj",
Expand Down
49 changes: 46 additions & 3 deletions R/pkg/R/mllib.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,55 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
deviance.resid <- callJMethod(jobj, "rDevianceResiduals")
dispersion <- callJMethod(jobj, "rDispersion")
null.deviance <- callJMethod(jobj, "rNullDeviance")
deviance <- callJMethod(jobj, "rDeviance")
df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
aic <- callJMethod(jobj, "rAic")
iter <- callJMethod(jobj, "rNumIterations")
family <- callJMethod(jobj, "rFamily")

deviance.resid <- dataFrame(deviance.resid)
coefficients <- matrix(coefficients, ncol = 4)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
rownames(coefficients) <- unlist(features)
return(list(coefficients = coefficients))
ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
dispersion = dispersion, null.deviance = null.deviance,
deviance = deviance, df.null = df.null, df.residual = df.residual,
aic = aic, iter = iter, family = family)
class(ans) <- "summary.GeneralizedLinearRegressionModel"
return(ans)
})

#' Print the summary of GeneralizedLinearRegressionModel
#'
#' @rdname print
#' @name print.summary.GeneralizedLinearRegressionModel
#' @export
print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add this S3 function for formatted output of summary(GeneralizedLinearRegressionModel).

x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max"))
x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
cat("\nDeviance Residuals: \n")
cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)

cat("\nCoefficients:\n")
print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)

cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
" on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"),
1L, paste, collapse = " "), sep = "")
cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
"Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
cat("\n")
invisible(x)
}

#' Make predictions from a generalized linear model
#'
#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
Expand Down
49 changes: 49 additions & 0 deletions R/pkg/inst/tests/testthat/test_mllib.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,55 @@ test_that("glm and predict", {
expect_equal(length(predict(lm(y ~ x))), 15)
})

test_that("glm summary", {
# gaussian family
training <- suppressWarnings(createDataFrame(sqlContext, iris))
stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))

rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

coefs <- unlist(stats$coefficients)
rCoefs <- unlist(rStats$coefficients)
expect_true(all(abs(rCoefs - coefs) < 1e-4))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
expect_equal(stats$dispersion, rStats$dispersion)
expect_equal(stats$null.deviance, rStats$null.deviance)
expect_equal(stats$deviance, rStats$deviance)
expect_equal(stats$df.null, rStats$df.null)
expect_equal(stats$df.residual, rStats$df.residual)
expect_equal(stats$aic, rStats$aic)

# binomial family
df <- suppressWarnings(createDataFrame(sqlContext, iris))
training <- df[df$Species %in% c("versicolor", "virginica"), ]
stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
family = binomial(link = "logit")))

rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
family = binomial(link = "logit")))

coefs <- unlist(stats$coefficients)
rCoefs <- unlist(rStats$coefficients)
expect_true(all(abs(rCoefs - coefs) < 1e-4))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Sepal_Width")))
expect_equal(stats$dispersion, rStats$dispersion)
expect_equal(stats$null.deviance, rStats$null.deviance)
expect_equal(stats$deviance, rStats$deviance)
expect_equal(stats$df.null, rStats$df.null)
expect_equal(stats$df.residual, rStats$df.residual)
expect_equal(stats$aic, rStats$aic)

# Test summary works on base GLM models
baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
baseSummary <- summary(baseModel)
expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

12.19313 -> baseSummary$deviance

Copy link
Contributor Author

@yanboliang yanboliang Apr 15, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we just test that R's native summary works by checking the value of baseSummary$deviance; we did not compare it with the output of other functions. This is consistent with the other test cases.

})

test_that("kmeans", {
newIris <- iris
newIris$Species <- NULL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,59 @@ private[r] class GeneralizedLinearRegressionWrapper private (
private val glm: GeneralizedLinearRegressionModel =
pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]

lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
Copy link
Contributor

@mengxr mengxr Apr 15, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can remove the r prefix and just call it features. The whole class is an R wrapper. No worries. I saw why.

Array("(Intercept)") ++ features
} else {
features
}

lazy val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
Array(glm.intercept) ++ glm.coefficients.toArray
Array(glm.intercept) ++ glm.coefficients.toArray ++
rCoefficientStandardErrors ++ rTValues ++ rPValues
} else {
glm.coefficients.toArray
glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
}

lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
Array("(Intercept)") ++ features
private lazy val rCoefficientStandardErrors = if (glm.getFitIntercept) {
Array(glm.summary.coefficientStandardErrors.last) ++
glm.summary.coefficientStandardErrors.dropRight(1)
} else {
features
glm.summary.coefficientStandardErrors
}

/**
 * T-values from the model summary, reordered for the R side.
 *
 * When the model fits an intercept, the last element of `glm.summary.tValues` is moved to
 * the front so the ordering lines up with `rFeatures`, which lists "(Intercept)" first.
 * NOTE(review): this presumes the summary stores the intercept statistic last - confirm
 * against GeneralizedLinearRegressionSummary.
 */
private lazy val rTValues = {
  val tStats = glm.summary.tValues
  if (glm.getFitIntercept) tStats.last +: tStats.dropRight(1) else tStats
}

def transform(dataset: DataFrame): DataFrame = {
/**
 * P-values from the model summary, reordered for the R side.
 *
 * When the model fits an intercept, the last element of `glm.summary.pValues` is moved to
 * the front so the ordering lines up with `rFeatures`, which lists "(Intercept)" first.
 * NOTE(review): this presumes the summary stores the intercept statistic last - confirm
 * against GeneralizedLinearRegressionSummary.
 */
private lazy val rPValues = {
  val pStats = glm.summary.pValues
  if (glm.getFitIntercept) pStats.last +: pStats.dropRight(1) else pStats
}

// Read-only accessors that forward individual fields of the fitted model's summary
// (and the model's family). Each one is a plain delegation to `glm`.

/** Dispersion value taken from the model summary. */
lazy val rDispersion: Double = glm.summary.dispersion

/** Null deviance taken from the model summary. */
lazy val rNullDeviance: Double = glm.summary.nullDeviance

/** Deviance taken from the model summary. */
lazy val rDeviance: Double = glm.summary.deviance

/** Residual degrees of freedom for the null model, taken from the model summary. */
lazy val rResidualDegreeOfFreedomNull: Long = glm.summary.residualDegreeOfFreedomNull

/** Residual degrees of freedom, taken from the model summary. */
lazy val rResidualDegreeOfFreedom: Long = glm.summary.residualDegreeOfFreedom

/** AIC taken from the model summary. */
lazy val rAic: Double = glm.summary.aic

/** Number of iterations taken from the model summary. */
lazy val rNumIterations: Int = glm.summary.numIterations

/**
 * Residuals DataFrame obtained from the summary's no-argument `residuals()`.
 * NOTE(review): presumably these are deviance residuals (the summary's default type) - confirm.
 */
lazy val rDevianceResiduals: DataFrame = glm.summary.residuals()

/** Family name as returned by the model's `getFamily`. */
lazy val rFamily: String = glm.getFamily

/**
 * Returns the residuals of the fitted model for the requested residual type.
 *
 * @param residualsType residual type name, passed through unchanged to `glm.summary.residuals`
 * @return the DataFrame produced by the model summary
 */
def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType)

/**
 * Runs the wrapped pipeline on `dataset` and removes the model's features column
 * from the result.
 *
 * @param dataset input data to run through the pipeline
 * @return the pipeline output without the column named by `glm.getFeaturesCol`
 */
def transform(dataset: Dataset[_]): DataFrame = {
  val transformed = pipeline.transform(dataset)
  transformed.drop(glm.getFeaturesCol)
}
}
Expand Down