From 7a0230a07d8650ded80b3f255047b7558ba8989c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 14 Apr 2016 16:22:57 -0700 Subject: [PATCH 1/5] update codegen --- .../ml/param/shared/SharedParamsCodeGen.scala | 5 +++- .../ml/param/_shared_params_code_gen.py | 6 ++++- python/pyspark/ml/param/shared.py | 25 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 1d03a5b4f4048..ecc826381133b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -76,7 +76,10 @@ private[shared] object SharedParamsCodeGen { ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0"), ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + - "empty, default value is 'auto'", Some("\"auto\""))) + "empty, default value is 'auto'", Some("\"auto\"")), + ParamDesc[Boolean]("binary", "If true, all non zero results are set to 1. This is useful for " + + "discrete probabilistic models that model binary events rather than integer counts. " + + "Default False.", Some("false"))) val code = genSharedParams(params) val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala" diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index a7615c43bee24..339dd3bb04547 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -148,7 +148,11 @@ def get$Name(self): ("solver", "the solver algorithm for optimization. If this is not set or empty, " + "default value is 'auto'.", "'auto'", "TypeConverters.toString"), ("varianceCol", "column name for the biased sample variance of prediction.", - None, "TypeConverters.toString")] + None, "TypeConverters.toString"), + ("binary", "If True, all non zero results are set to 1. This is useful for discrete " + + "probabilistic models that model binary events rather than integer counts. Default False.", + "False", "TypeConverters.toBoolean") + ] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index c9e975525ce1f..4edaec7c36c72 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -583,6 +583,31 @@ def getVarianceCol(self): return self.getOrDefault(self.varianceCol) +class HasBinary(Params): + """ + Mixin for param binary: If True, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False. + """ + + binary = Param(Params._dummy(), "binary", "If True, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean) + + def __init__(self): + super(HasBinary, self).__init__() + self._setDefault(binary=False) + + def setBinary(self, value): + """ + Sets the value of :py:attr:`binary`. + """ + self._set(binary=value) + return self + + def getBinary(self): + """ + Gets the value of binary or its default value. + """ + return self.getOrDefault(self.binary) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. From ccd28a47c9e96ac9df15a199ed85c42a2bb59787 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 14 Apr 2016 16:37:54 -0700 Subject: [PATCH 2/5] Generate binary param --- .../spark/ml/param/shared/sharedParams.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 64d6af2766ca9..e151b79b8b8ca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -389,4 +389,21 @@ private[ml] trait HasSolver extends Params { /** @group getParam */ final def getSolver: String = $(solver) } + +/** + * Trait for shared param binary (default: false). + */ +private[ml] trait HasBinary extends Params { + + /** + * Param for If true, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.. + * @group param + */ + final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.") + + setDefault(binary, false) + + /** @group getParam */ + final def getBinary: Boolean = $(binary) +} // scalastyle:on From edbd652f85c9a1003ad23a76ef02e789987328c9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 14 Apr 2016 16:41:55 -0700 Subject: [PATCH 3/5] Use shared param --- .../spark/ml/feature/CountVectorizer.scala | 18 +++--------------- .../apache/spark/ml/feature/HashingTF.scala | 18 ++---------------- .../ml/param/shared/SharedParamsCodeGen.scala | 4 ++-- python/pyspark/ml/feature.py | 17 ++++------------- 4 files changed, 11 insertions(+), 46 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 922670a41b6b3..681b31e8bf441 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -22,7 +22,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{Vectors, VectorUDT} import org.apache.spark.rdd.RDD @@ -34,7 +34,8 @@ import org.apache.spark.util.collection.OpenHashMap /** * Params for [[CountVectorizer]] and [[CountVectorizerModel]]. */ -private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol { +private[feature] trait CountVectorizerParams extends Params with HasBinary with HasInputCol + with HasOutputCol { /** * Max size of the vocabulary. @@ -101,19 +102,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getMinTF: Double = $(minTF) - /** - * Binary toggle to control the output vector values. - * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for - * discrete probabilistic models that model binary events rather than integer counts. - * Default: false - * @group param - */ - val binary: BooleanParam = - new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.") - - /** @group getParam */ - def getBinary: Boolean = $(binary) - setDefault(binary -> false) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 467ad7307462a..dd574ca494cdd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamMap, ParamValidators} -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.{ArrayType, StructType} */ @Experimental class HashingTF(override val uid: String) - extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { + extends Transformer with HasBinary with HasInputCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("hashingTF")) @@ -52,17 +52,6 @@ class HashingTF(override val uid: String) val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)", ParamValidators.gt(0)) - /** - * Binary toggle to control term frequency counts. - * If true, all non-zero counts are set to 1. This is useful for discrete probabilistic - * models that model binary events rather than integer counts. - * (default = false) - * @group param - */ - val binary = new BooleanParam(this, "binary", "If true, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events rather " + - "than integer counts") - setDefault(numFeatures -> (1 << 18), binary -> false) /** @group getParam */ @@ -71,9 +60,6 @@ class HashingTF(override val uid: String) /** @group setParam */ def setNumFeatures(value: Int): this.type = set(numFeatures, value) - /** @group getParam */ - def getBinary: Boolean = $(binary) - /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index ecc826381133b..3663896e8ab03 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -77,8 +77,8 @@ private[shared] object SharedParamsCodeGen { "all instance weights as 1.0"), ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + "empty, default value is 'auto'", Some("\"auto\"")), - ParamDesc[Boolean]("binary", "If true, all non zero results are set to 1. This is useful for " + - "discrete probabilistic models that model binary events rather than integer counts. " + + ParamDesc[Boolean]("binary", "If true, all non zero results are set to 1. This is useful " + + "for discrete probabilistic models that model binary events rather than integer counts. " + "Default False.", Some("false"))) val code = genSharedParams(params) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 809a513316f9f..9e85f9d246508 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -200,7 +200,8 @@ def getSplits(self): @inherit_doc -class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class CountVectorizer(JavaEstimator, HasBinary, HasInputCol, HasOutputCol, JavaMLReadable, + JavaMLWritable): """ .. note:: Experimental @@ -256,11 +257,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, vocabSize = Param( Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", typeConverter=TypeConverters.toInt) - binary = Param( - Params._dummy(), "binary", "Binary toggle to control the output vector values." + - " If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" + - " for discrete probabilistic models that model binary events rather than integer counts." + - " Default False", typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, @@ -510,8 +506,8 @@ def getScalingVec(self): @inherit_doc -class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, - JavaMLWritable): +class HashingTF(JavaTransformer, HasBinary, HasInputCol, HasOutputCol, HasNumFeatures, + JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -536,11 +532,6 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java .. versionadded:: 1.3.0 """ - binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events " + - "rather than integer counts. Default False.", - typeConverter=TypeConverters.toBoolean) - @keyword_only def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None): """ From 59366262abecfda4ec0759096fdcdce4717c44b4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 15 Apr 2016 13:52:08 -0700 Subject: [PATCH 4/5] Update the param description --- .../apache/spark/ml/param/shared/SharedParamsCodeGen.scala | 6 +++--- python/pyspark/ml/param/_shared_params_code_gen.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 3663896e8ab03..26630401118be 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -77,9 +77,9 @@ private[shared] object SharedParamsCodeGen { "all instance weights as 1.0"), ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + "empty, default value is 'auto'", Some("\"auto\"")), - ParamDesc[Boolean]("binary", "If true, all non zero results are set to 1. This is useful " + - "for discrete probabilistic models that model binary events rather than integer counts. " + - "Default False.", Some("false"))) + ParamDesc[Boolean]("binary", "If true, all non-zero counts (after any filters are applied) " + + "are set to 1. This is useful for discrete probabilistic models that model binary events " + + "rather than integer counts. Default False.", Some("false"))) val code = genSharedParams(params) val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala" diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 339dd3bb04547..1a3d9aec5088b 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -149,9 +149,9 @@ def get$Name(self): "default value is 'auto'.", "'auto'", "TypeConverters.toString"), ("varianceCol", "column name for the biased sample variance of prediction.", None, "TypeConverters.toString"), - ("binary", "If True, all non zero results are set to 1. This is useful for discrete " + - "probabilistic models that model binary events rather than integer counts. Default False.", - "False", "TypeConverters.toBoolean") + ("binary", "If True, all non-zero counts (after any filters are applied) are set to 1. " + + "This is useful for discrete probabilistic models that model binary events rather than " + + "integer counts. Default False.", "False", "TypeConverters.toBoolean") ] code = [] From b702e7278c840e4d917ea321e96fe28dcef8dee6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 15 Apr 2016 13:53:08 -0700 Subject: [PATCH 5/5] Update docstring for binary param --- .../scala/org/apache/spark/ml/param/shared/sharedParams.scala | 4 ++-- python/pyspark/ml/param/shared.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index e151b79b8b8ca..9370a412ea0a8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -396,10 +396,10 @@ private[ml] trait HasSolver extends Params { private[ml] trait HasBinary extends Params { /** - * Param for If true, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.. + * Param for If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.. * @group param */ - final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.") + final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.") setDefault(binary, false) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 4edaec7c36c72..35983ca7bdaf9 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -585,10 +585,10 @@ def getVarianceCol(self): class HasBinary(Params): """ - Mixin for param binary: If True, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False. + Mixin for param binary: If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False. """ - binary = Param(Params._dummy(), "binary", "If True, all non zero results are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean) + binary = Param(Params._dummy(), "binary", "If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean) def __init__(self): super(HasBinary, self).__init__()