From 1ed3ba0c031b6d45fdaa025f3831020417ce164d Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 19 May 2017 19:33:27 +0800 Subject: [PATCH 1/4] Code reorg and cleanup for SparkR linear SVM. --- R/pkg/R/mllib_classification.R | 40 ++++++++----------- .../apache/spark/ml/r/LinearSVCWrapper.scala | 12 +++++- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 4db9cc30fb0c..bf0b8bb6fe31 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -46,15 +46,16 @@ setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj" #' @note NaiveBayesModel since 2.0.0 setClass("NaiveBayesModel", representation(jobj = "jobj")) -#' linear SVM Model +#' Linear SVM Model #' -#' Fits an linear SVM model against a SparkDataFrame. It is a binary classifier, similar to svm in glmnet package +#' Fits a linear SVM model against a SparkDataFrame, similar to svm in e1071 package. +#' Currently only supports binary classification model with linear kernal. #' Users can print, make predictions on the produced model and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param regParam The regularization parameter. +#' @param regParam The regularization parameter. Only supports L2 regularization currently. #' @param maxIter Maximum iteration number. #' @param tol Convergence tolerance of iterations. #' @param standardization Whether to standardize the training features before fitting the model. The coefficients @@ -111,10 +112,10 @@ setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formu new("LinearSVCModel", jobj = jobj) }) -# Predicted values based on an LinearSVCModel model +# Predicted values based on a linear SVM model. #' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on an LinearSVCModel. +#' @return \code{predict} returns the predicted values based on a linear SVM model. #' @rdname spark.svmLinear #' @aliases predict,LinearSVCModel,SparkDataFrame-method #' @export @@ -124,13 +125,12 @@ setMethod("predict", signature(object = "LinearSVCModel"), predict_internal(object, newData) }) -# Get the summary of an LinearSVCModel +# Get the summary of a linear SVM model. -#' @param object an LinearSVCModel fitted by \code{spark.svmLinear}. +#' @param object a linear SVM model fitted by \code{spark.svmLinear}. #' @return \code{summary} returns summary information of the fitted model, which is a list. #' The list includes \code{coefficients} (coefficients of the fitted model), -#' \code{intercept} (intercept of the fitted model), \code{numClasses} (number of classes), -#' \code{numFeatures} (number of features). +#' \code{numClasses} (number of classes), \code{numFeatures} (number of features). #' @rdname spark.svmLinear #' @aliases summary,LinearSVCModel-method #' @export @@ -138,25 +138,17 @@ setMethod("predict", signature(object = "LinearSVCModel"), setMethod("summary", signature(object = "LinearSVCModel"), function(object) { jobj <- object@jobj - features <- callJMethod(jobj, "features") - labels <- callJMethod(jobj, "labels") - coefficients <- callJMethod(jobj, "coefficients") - nCol <- length(coefficients) / length(features) - coefficients <- matrix(unlist(coefficients), ncol = nCol) - intercept <- callJMethod(jobj, "intercept") + features <- callJMethod(jobj, "rFeatures") + coefficients <- callJMethod(jobj, "rCoefficients") + coefficients <- as.matrix(unlist(coefficients)) + colnames(coefficients) <- c("Estimate") + rownames(coefficients) <- unlist(features) numClasses <- callJMethod(jobj, "numClasses") numFeatures <- callJMethod(jobj, "numFeatures") - if (nCol == 1) { - colnames(coefficients) <- c("Estimate") - } else { - colnames(coefficients) <- unlist(labels) - } - rownames(coefficients) <- unlist(features) - list(coefficients = coefficients, intercept = intercept, - numClasses = numClasses, numFeatures = numFeatures) + list(coefficients = coefficients, numClasses = numClasses, numFeatures = numFeatures) }) -# Save fitted LinearSVCModel to the input path +# Save fitted linear SVM model to the input path. #' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LinearSVCWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LinearSVCWrapper.scala index cfd043b66ed9..0dd1f1146fbf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LinearSVCWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LinearSVCWrapper.scala @@ -38,9 +38,17 @@ private[r] class LinearSVCWrapper private ( private val svcModel: LinearSVCModel = pipeline.stages(1).asInstanceOf[LinearSVCModel] - lazy val coefficients: Array[Double] = svcModel.coefficients.toArray + lazy val rFeatures: Array[String] = if (svcModel.getFitIntercept) { + Array("(Intercept)") ++ features + } else { + features + } - lazy val intercept: Double = svcModel.intercept + lazy val rCoefficients: Array[Double] = if (svcModel.getFitIntercept) { + Array(svcModel.intercept) ++ svcModel.coefficients.toArray + } else { + svcModel.coefficients.toArray + } lazy val numClasses: Int = svcModel.numClasses From 39317c1d06361f50fd80f1bcf6eef97c6123070d Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 20 May 2017 00:10:20 +0800 Subject: [PATCH 2/4] Update test case. --- R/pkg/inst/tests/testthat/test_mllib_classification.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index f3eaeb381afc..51a4db17fabd 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -38,9 +38,8 @@ test_that("spark.svmLinear", { expect_true(class(summary$coefficients[, 1]) == "numeric") coefs <- summary$coefficients[, "Estimate"] - expected_coefs <- c(-0.1563083, -0.460648, 0.2276626, 1.055085) + expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085) expect_true(all(abs(coefs - expected_coefs) < 0.1)) - expect_equal(summary$intercept, -0.06004978, tolerance = 1e-2) # Test prediction with string label prediction <- predict(model, training) From 3c14d15c0f97f91b0fe69b843fe038b7fc776f2e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 22 May 2017 17:55:11 +0800 Subject: [PATCH 3/4] Update docs. --- R/pkg/R/mllib_classification.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index bf0b8bb6fe31..7988a568d531 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -112,10 +112,10 @@ setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formu new("LinearSVCModel", jobj = jobj) }) -# Predicted values based on a linear SVM model. +# Predicted values based on a LinearSVCModel model #' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on a linear SVM model. +#' @return \code{predict} returns the predicted values based on a LinearSVCModel. #' @rdname spark.svmLinear #' @aliases predict,LinearSVCModel,SparkDataFrame-method #' @export @@ -125,9 +125,9 @@ setMethod("predict", signature(object = "LinearSVCModel"), predict_internal(object, newData) }) -# Get the summary of a linear SVM model. +# Get the summary of a LinearSVCModel -#' @param object a linear SVM model fitted by \code{spark.svmLinear}. +#' @param object a LinearSVCModel fitted by \code{spark.svmLinear}. #' @return \code{summary} returns summary information of the fitted model, which is a list. #' The list includes \code{coefficients} (coefficients of the fitted model), #' \code{numClasses} (number of classes), \code{numFeatures} (number of features). @@ -148,7 +148,7 @@ setMethod("summary", signature(object = "LinearSVCModel"), list(coefficients = coefficients, numClasses = numClasses, numFeatures = numFeatures) }) -# Save fitted linear SVM model to the input path. +# Save fitted LinearSVCModel to the input path #' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE From 5d9afe06b665464b06705d618a18a8032255fe1d Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 23 May 2017 09:56:48 +0800 Subject: [PATCH 4/4] Fix typo. --- R/pkg/R/mllib_classification.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 7988a568d531..306a9b867653 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -49,7 +49,7 @@ setClass("NaiveBayesModel", representation(jobj = "jobj")) #' Linear SVM Model #' #' Fits a linear SVM model against a SparkDataFrame, similar to svm in e1071 package. -#' Currently only supports binary classification model with linear kernal. +#' Currently only supports binary classification model with linear kernel. #' Users can print, make predictions on the produced model and save the model to the input path. #' #' @param data SparkDataFrame for training.