From a5644ba531a6e0fbf653daa650d80523435b6aa8 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Thu, 25 Aug 2016 16:03:44 -0700 Subject: [PATCH 1/4] add glmnet R part code --- R/pkg/NAMESPACE | 1 + R/pkg/R/generics.R | 5 ++++ R/pkg/R/mllib.R | 75 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 267a38c21530b..af250d428e2be 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -44,6 +44,7 @@ exportMethods("glm", "spark.gaussianMixture", "spark.als", "spark.kstest") + "spark.glmnet") # Job group lifecycle management methods export("setJobGroup", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 67a999da9bc26..b51f77217a731 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1304,6 +1304,11 @@ setGeneric("year", function(x) { standardGeneric("year") }) #' @export setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") }) +#' @rdname spark.glmnet +#' @export +setGeneric("spark.glmnet", function(data, formula, ...) { standardGeneric("spark.glmnet") }) + + #' @param x,y For \code{glm}: logical values indicating whether the response vector #' and model matrix used in the fitting process should be returned as #' components of the returned value. 
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 98db367a856ee..35b97faa5b7f7 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -32,6 +32,13 @@ #' @note GeneralizedLinearRegressionModel since 2.0.0 setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) +#' S4 class that represents a MultinomialLogisticRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala MultinomialLogisticRegressionModelWrapper +#' @export +#' @note GeneralizedLinearRegressionModel since 2.1.0 +setClass("MultinomialLogisticRegressionModel", representation(jobj = "jobj")) + #' S4 class that represents a NaiveBayesModel #' #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper @@ -102,7 +109,7 @@ setClass("KSTest", representation(jobj = "jobj")) #' @rdname write.ml #' @name write.ml #' @export -#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.glm}, \link{glm}, \link{spark.glmnet}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, #' @seealso \link{spark.lda}, \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} #' @seealso \link{read.ml} @@ -115,7 +122,7 @@ NULL #' @rdname predict #' @name predict #' @export -#' @seealso \link{spark.glm}, \link{glm}, +#' @seealso \link{spark.glm}, \link{glm}, \link{spark.glmnet}, #' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, #' @seealso \link{spark.mlp}, \link{spark.naiveBayes}, \link{spark.survreg} NULL @@ -320,6 +327,54 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), predict_internal(object, newData) }) +setMethod("spark.glmnet", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, regParam = 0.0, elasticNetParam = 0.0, tol = 1e-6, maxIter = 100, + fitIntercept = TRUE, standardization = TRUE, thresholds = NULL, weightCol = NULL) { + + formula <- paste0(deparse(formula), collapse = "") + if 
(is.null(weightCol)) { + weightCol <- "" + } + + jobj <- callJStatic("org.apache.spark.ml.r.MultinomialLogisticRegressionModelWrapper", + "fit", formula, data@sdf, as.numeric(regParam), as.numeric(elasticNetParam), + tol, as.integer(maxIter), as.logical(fitIntercept), + as.logical(standardization), as.array(thresholds), as.character(weightCol)) + new("MultinomialLogisticRegressionModel", jobj = jobj) + }) + +# Predicted values based on a MultinomialLogisticRegression model + +#' @param object a fitted MultinomialLogisticRegressionModel +#' @param newData SparkDataFrame for testing +#' @return \code{predict} returns a SparkDataFrame containing predicted values +#' @rdname spark.glmnet +#' @aliases predict,MultinomialLogisticRegressionModel,SparkDataFrame-method +#' @export +#' @note predict(MultinomialLogisticRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "MultinomialLogisticRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Get the summary of a MultinomialLogisticRegression model + +#' @return \code{summary} returns the model's coefficients, intercepts and numClasses +#' @rdname spark.glmnet +#' @aliases summary,MultinomialLogisticRegression-method +#' @export +#' @note summary(MultinomialLogisticRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "MultinomialLogisticRegressionModel"), + function(object) { + jobj <- object@jobj + coefficients <- callJMethod(jobj, "coefficients") + intercepts <- callJMethod(jobj, "intercepts") + numClasses <- callJMethod(jobj, "numClasses") + k <- callJMethod(jobj, "numFeatures") + coefficients <- t(matrix(coefficients, ncol = k)) + list(coefficients = coefficients, intercepts = intercepts, numClasses = numClasses) + }) + # Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), # similarly to R package e1071's predict. 
@@ -826,6 +881,20 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat write_internal(object, path, overwrite) }) +# Saves the multinomial logistic regressionModel to the input path. + +#' @param path the directory where the model is saved. +#' @param overwrite overwrites or not if the output path already exists. Default is FALSE +#' which means throw exception if the output path exists. +#' +#' @rdname spark.glmnet +#' @export +#' @note write.ml(MultinomialLogisticRegressionModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "MultinomialLogisticRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + # Save fitted MLlib model to the input path #' @param path the directory where the model is saved. @@ -922,6 +991,8 @@ read.ml <- function(path) { new("GaussianMixtureModel", jobj = jobj) } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { new("ALSModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultinomialLogisticRegressionModelWrapper")) { + new("MultinomialLogisticRegressionModel", jobj = jobj) } else { stop("Unsupported model: ", jobj) } From eef5acd980578ce95e751ebb2010971886e39290 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Thu, 25 Aug 2016 16:10:50 -0700 Subject: [PATCH 2/4] rename wrapper --- R/pkg/R/mllib.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 35b97faa5b7f7..6c19beadb0b6b 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -34,7 +34,7 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' S4 class that represents a MultinomialLogisticRegressionModel #' -#' @param jobj a Java object reference to the backing Scala MultinomialLogisticRegressionModelWrapper +#' @param jobj a Java object reference to the backing Scala MultinomialLogisticRegressionModel #' @export #' @note 
GeneralizedLinearRegressionModel since 2.1.0 setClass("MultinomialLogisticRegressionModel", representation(jobj = "jobj")) @@ -336,7 +336,7 @@ setMethod("spark.glmnet", signature(data = "SparkDataFrame", formula = "formula" weightCol <- "" } - jobj <- callJStatic("org.apache.spark.ml.r.MultinomialLogisticRegressionModelWrapper", + jobj <- callJStatic("org.apache.spark.ml.r.MultinomialLogisticRegressionWrapper", "fit", formula, data@sdf, as.numeric(regParam), as.numeric(elasticNetParam), tol, as.integer(maxIter), as.logical(fitIntercept), as.logical(standardization), as.array(thresholds), as.character(weightCol)) @@ -991,7 +991,7 @@ read.ml <- function(path) { new("GaussianMixtureModel", jobj = jobj) } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { new("ALSModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultinomialLogisticRegressionModelWrapper")) { + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultinomialLogisticRegressionWrapper")) { new("MultinomialLogisticRegressionModel", jobj = jobj) } else { stop("Unsupported model: ", jobj) From 9f14338baa3bfd32b7b2e07f7cd28b56e4a2eea0 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Fri, 26 Aug 2016 09:40:25 -0700 Subject: [PATCH 3/4] back up change and hold the PR --- ...MultinomialLogisticRegressionWrapper.scala | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala new file mode 100644 index 0000000000000..67e6258c52403 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.r + +import org.apache.hadoop.fs.Path +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.classification.{MultinomialLogisticRegression, MultinomialLogisticRegressionModel} +import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset} + +private[r] class MultinomialLogisticRegressionWrapper private ( + val pipeline: PipelineModel, + val features: Array[String]) extends MLWritable { + + private val multinomialLogisticRegressionModel: MultinomialLogisticRegressionModel = + pipeline.stages(1).asInstanceOf[MultinomialLogisticRegressionModel] + + lazy val coefficients: Matrix = multinomialLogisticRegressionModel.coefficients + + lazy val intercepts: Vector = multinomialLogisticRegressionModel.intercepts + + lazy val numClasses: Int = multinomialLogisticRegressionModel.numClasses + + lazy val numFeatures: Int = multinomialLogisticRegressionModel.numFeatures + + override def write: MLWriter = + new 
MultinomialLogisticRegressionWrapper.MultinomialLogisticRegressionWrapperWriter(this) +} + +private[r] object MultinomialLogisticRegressionWrapper + extends MLReadable[MultinomialLogisticRegressionWrapper] { + def fit(): MultinomialLogisticRegressionWrapper = { + + } + + override def read: MLReader[MultinomialLogisticRegressionWrapper] = + new MultinomialLogisticRegressionWrapperReader + + override def load(path: String): MultinomialLogisticRegressionWrapper = super.load(path) + + class MultinomialLogisticRegressionWrapperWriter(instance: MultinomialLogisticRegressionWrapper) + extends MLWriter { + + override protected def saveImpl(path: String): Unit = ??? + } + + class MultinomialLogisticRegressionWrapperReader + extends MLReader[MultinomialLogisticRegressionWrapper] { + + override def load(path: String): MultinomialLogisticRegressionWrapper = ??? + } +} From ed1a0fb7cdb57e763c6318b30544506719593622 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Fri, 23 Sep 2016 10:33:05 -0700 Subject: [PATCH 4/4] delete unused scala --- R/pkg/NAMESPACE | 4 ++-- ...MultinomialLogisticRegressionWrapper.scala | 74 ------------------- 2 files changed, 2 insertions(+), 76 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index af250d428e2be..38969efabc8ce 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -43,7 +43,7 @@ exportMethods("glm", "spark.isoreg", "spark.gaussianMixture", "spark.als", - "spark.kstest") - "spark.glmnet") + "spark.kstest", + "spark.glmnet") # Job group lifecycle management methods diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala deleted file mode 100644 index 67e6258c52403..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/r/MultinomialLogisticRegressionWrapper.scala +++ /dev/null @@ -1,74 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.r - -import org.apache.hadoop.fs.Path -import org.json4s._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods._ - -import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.AttributeGroup -import org.apache.spark.ml.classification.{MultinomialLogisticRegression, MultinomialLogisticRegressionModel} -import org.apache.spark.ml.feature.RFormula -import org.apache.spark.ml.linalg._ -import org.apache.spark.ml.util._ -import org.apache.spark.sql.{DataFrame, Dataset} - -private[r] class MultinomialLogisticRegressionWrapper private ( - val pipeline: PipelineModel, - val features: Array[String]) extends MLWritable { - - private val multinomialLogisticRegressionModel: MultinomialLogisticRegressionModel = - pipeline.stages(1).asInstanceOf[MultinomialLogisticRegressionModel] - - lazy val coefficients: Matrix = multinomialLogisticRegressionModel.coefficients - - lazy val intercepts: Vector = multinomialLogisticRegressionModel.intercepts - - lazy val numClasses: Int = multinomialLogisticRegressionModel.numClasses - - lazy val numFeatures: Int = multinomialLogisticRegressionModel.numFeatures - - override def 
write: MLWriter = - new MultinomialLogisticRegressionWrapper.MultinomialLogisticRegressionWrapperWriter(this) -} - -private[r] object MultinomialLogisticRegressionWrapper - extends MLReadable[MultinomialLogisticRegressionWrapper] { - def fit(): MultinomialLogisticRegressionWrapper = { - - } - - override def read: MLReader[MultinomialLogisticRegressionWrapper] = - new MultinomialLogisticRegressionWrapperReader - - override def load(path: String): MultinomialLogisticRegressionWrapper = super.load(path) - - class MultinomialLogisticRegressionWrapperWriter(instance: MultinomialLogisticRegressionWrapper) - extends MLWriter { - - override protected def saveImpl(path: String): Unit = ??? - } - - class MultinomialLogisticRegressionWrapperReader - extends MLReader[MultinomialLogisticRegressionWrapper] { - - override def load(path: String): MultinomialLogisticRegressionWrapper = ??? - } -}