@@ -69,6 +69,11 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should be good for most cases.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.svmLinear} returns a fitted linear SVM model.
 #' @rdname spark.svmLinear
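The `handleInvalid = c("error", "keep", "skip")` default combined with `match.arg()` (used in each `fit` body below) is a standard base-R idiom: the vector documents the allowed values, and `match.arg()` collapses it to the first element when the caller omits the argument. A minimal standalone sketch, plain base R with no SparkR dependency:

```r
# Illustration of the match.arg() idiom used throughout this patch.
f <- function(handleInvalid = c("error", "keep", "skip")) {
  # With the untouched default, match.arg() returns the first choice.
  match.arg(handleInvalid)
}
f()          # "error" -- the documented default
f("keep")    # "keep"
# f("oops")  # would stop(): 'arg' should be one of "error", "keep", "skip"
```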
@@ -98,7 +103,8 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @note spark.svmLinear since 2.2.0
 setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6, standardization = TRUE,
-                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2) {
+                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")

             if (!is.null(weightCol) && weightCol == "") {
@@ -107,10 +113,12 @@ setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formu
               weightCol <- as.character(weightCol)
             }

+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LinearSVCWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam), as.integer(maxIter),
                                 as.numeric(tol), as.logical(standardization), as.numeric(threshold),
-                                weightCol, as.integer(aggregationDepth))
+                                weightCol, as.integer(aggregationDepth), handleInvalid)
             new("LinearSVCModel", jobj = jobj)
           })

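For reference, a usage sketch built on the Titanic data this file's roxygen examples already use; the data and the choice of `"skip"` are illustrative. With `handleInvalid = "skip"`, rows whose string label or feature values are invalid are filtered out rather than raising an error:

```r
# Sketch: fit a linear SVM on string-typed columns, skipping invalid rows.
sparkR.session()
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
model <- spark.svmLinear(training, Survived ~ ., regParam = 0.01,
                         handleInvalid = "skip")
head(predict(model, training))
```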
@@ -218,6 +226,11 @@ function(object, path, overwrite = FALSE) {
 #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
 #'                                The bound vector size must be equal to 1 for binomial regression, or the number
 #'                                of classes for multinomial regression.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -257,7 +270,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                    tol = 1E-6, family = "auto", standardization = TRUE,
                    thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                    lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
-                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
+                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             row <- 0
             col <- 0
@@ -304,6 +318,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
               upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
             }

+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
@@ -312,7 +328,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                                 weightCol, as.integer(aggregationDepth),
                                 as.integer(row), as.integer(col),
                                 lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
-                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
+                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
+                                handleInvalid)
             new("LogisticRegressionModel", jobj = jobj)
           })

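A corresponding sketch for `spark.logit`, adapted from the iris-based binomial example already in this file; the subsetting and `"keep"` choice are illustrative. With `handleInvalid = "keep"`, unseen string values go into an extra bucket at index numLabels instead of failing:

```r
# Sketch: binomial logistic regression over two iris species.
df <- createDataFrame(iris)
training <- df[df$Species %in% c("versicolor", "virginica"), ]
model <- spark.logit(training, Species ~ ., regParam = 0.5,
                     handleInvalid = "keep")
summary(model)
```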
@@ -394,7 +411,12 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @param stepSize stepSize parameter.
 #' @param seed seed parameter for weights initialization.
 #' @param initialWeights initialWeights parameter for weights initialization, it should be a
-#'        numeric vector.
+#'                       numeric vector.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
 #' @rdname spark.mlp
@@ -426,7 +448,8 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @note spark.mlp since 2.1.0
 setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
-                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
+                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(layers)) {
               stop("layers must be a integer vector with length > 1.")
@@ -441,10 +464,11 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
             if (!is.null(initialWeights)) {
               initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
             }
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
                                 "fit", data@sdf, formula, as.integer(blockSize), as.array(layers),
                                 as.character(solver), as.integer(maxIter), as.numeric(tol),
-                                as.numeric(stepSize), seed, initialWeights)
+                                as.numeric(stepSize), seed, initialWeights, handleInvalid)
             new("MultilayerPerceptronClassificationModel", jobj = jobj)
           })

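A sketch for `spark.mlp`, following the LibSVM-based roxygen example in this file; the data path assumes a Spark source checkout, and passing `handleInvalid = "error"` explicitly just documents the default behavior of failing on invalid rows:

```r
# Sketch: multilayer perceptron on the bundled LibSVM sample data.
df <- read.df("data/mllib/sample_multiclass_classification_data.txt",
              source = "libsvm")
model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3),
                   solver = "l-bfgs", maxIter = 100, handleInvalid = "error")
summary(model)
```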
@@ -514,6 +538,11 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param smoothing smoothing parameter.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}.
 #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model.
 #' @rdname spark.naiveBayes
@@ -543,10 +572,12 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' }
 #' @note spark.naiveBayes since 2.0.0
 setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, smoothing = 1.0) {
+          function(data, formula, smoothing = 1.0,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
-                                formula, data@sdf, smoothing)
+                                formula, data@sdf, smoothing, handleInvalid)
             new("NaiveBayesModel", jobj = jobj)
           })

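Finally, a sketch for `spark.naiveBayes` on the categorical Titanic columns used elsewhere in this file; the `"skip"` choice is illustrative. Because the option controls the string indexing done by the underlying RFormula, its main practical effect shows up when transforming data containing values unseen at fit time:

```r
# Sketch: naive Bayes over string-typed columns. The default "error" would
# fail on unseen labels; "skip" silently drops such rows instead.
t <- as.data.frame(Titanic)
df <- createDataFrame(t)
model <- spark.naiveBayes(df, Survived ~ Class + Sex + Age,
                          handleInvalid = "skip")
head(predict(model, df))
```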