From dbabade554b93bee191b56115a516f39d4e128ea Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 14:03:01 -0800
Subject: [PATCH 01/11] Start working towards implementing the Python
 interface for QuantileDiscretizer

One question (for review): do we want to change the Bucketizer as I've done,
or create a separate wrapper? I think this way is better, but it does
introduce an extra param, so I'm not sure.
---
 python/pyspark/ml/feature.py | 85 +++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b02d41b52ab2..8ff96cddc9ed 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -30,10 +30,10 @@
 __all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
            'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler',
            'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
-           'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
-           'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
-           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
-           'Word2Vec', 'Word2VecModel']
+           'PolynomialExpansion', 'QuantileDiscretizer', 'RegexTokenizer', 'RFormula',
+           'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
+           'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
+           'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel']
 
 
 @inherit_doc
@@ -135,9 +135,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
               "specified will be treated as errors.")
 
     @keyword_only
-    def __init__(self, splits=None, inputCol=None, outputCol=None):
+    def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None):
         """
-        __init__(self, splits=None, inputCol=None, outputCol=None)
+        __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None)
         """
         super(Bucketizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
@@ -970,6 +970,79 @@ def getDegree(self):
         """
         return self.getOrDefault(self.degree)
 
+@inherit_doc
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
+    categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
+    into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
+    covering all real values. This attempts to find numBuckets partitions based on a sample of
+    data, but it may find fewer depending on the data sample values.
+
+    >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
+    >>> qds = QuantileDiscretizer(numBuckets=2,
+    ...     inputCol="values", outputCol="buckets")
+    >>> bucketizer = qds.fit(df).buckets
+    magic
+
+    .. versionadded:: 1.6.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    self.numBuckets = Param(Params._dummy(), "numBuckets",
+                            "Maximum number of buckets (quantiles, or " +
+                            "categories) into which data points are grouped. Must be >= 2.")
+
+    @keyword_only
+    def __init__(self, numBuckets=2, inputCol=None, outputCol=None):
+        """
+        __init__(self, numBuckets=2, inputCol=None, outputCol=None)
+        """
+        super(QuantileDiscretizer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
+        self.numBuckets = Param(self, "numBuckets",
+                                "Maximum number of buckets (quantiles, or " +
+                                "categories) into which data points are grouped. Must be >= 2.")
+        self._setDefault(numBuckets=2)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("1.6.0")
+    def setParams(self, numBuckets=2, inputCol=None, outputCol=None):
+        """
+        setParams(self, numBuckets=2, inputCol=None, outputCol=None)
+        Set the params for the QuantileDiscertizerBase
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    @since("1.6.0")
+    def setNumBuckets(self, value):
+        """
+        Sets the value of :py:attr:`numBuckets`.
+        """
+        self._paramMap[self.numBuckets] = value
+        return self
+
+    @since("1.6.0")
+    def getNumBuckets(self):
+        """
+        Gets the value of numBuckets or its default value.
+        """
+        return self.getOrDefault(self.numBuckets)
+
+    def _create_model(self, java_model):
+        """
+        Private method to convert the java_model to a Python model.
+        """
+        return Bucketizer(splits=java_model.getSplits(),
+                          inputCol=self.getInputCol(),
+                          outputCol=self.getOutputCol(),
+                          _java_model=java_model)
+
 
 @inherit_doc
 @ignore_unicode_prefix
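Note: the `_create_model` hook added above is the only estimator-specific piece; `JavaEstimator.fit()` does the rest. A rough paraphrase of the relevant 1.6-era `pyspark/ml/wrapper.py` internals follows (from memory, so treat it as an illustrative sketch rather than the exact source):

    # Sketch of how JavaEstimator turns a fit JVM object into a Python model.
    def _fit_java(self, dataset):
        self._transfer_params_to_java()           # push Python-side params to the JVM peer
        return self._java_obj.fit(dataset._jdf)   # the Scala QuantileDiscretizer returns a Bucketizer

    def _fit(self, dataset):
        java_model = self._fit_java(dataset)
        return self._create_model(java_model)     # the hook PATCH 01 overrides

Since the trained model on the Scala side is itself a Bucketizer, returning a Python `Bucketizer` here (instead of adding a dedicated QuantileDiscretizerModel class) is the design choice the commit message is asking reviewers about.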
From 1cacd7667ac0fa37b94fcb842e1d1616898279e9 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 17:11:51 -0800
Subject: [PATCH 02/11] OK: remove _java_model before setting the params,
 since it isn't really a param; print out the splits from the trained
 bucketizer

---
 python/pyspark/ml/feature.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8ff96cddc9ed..2d0b936d37e7 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -155,6 +155,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None)
               "provided to cover all Double values; otherwise, values outside the splits " +
               "specified will be treated as errors.")
         kwargs = self.__init__._input_kwargs
+        kwargs.pop("_java_model", None)
         self.setParams(**kwargs)
 
     @keyword_only
@@ -984,14 +985,15 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
     >>> qds = QuantileDiscretizer(numBuckets=2,
     ...     inputCol="values", outputCol="buckets")
-    >>> bucketizer = qds.fit(df).buckets
-    magic
+    >>> bucketizer = qds.fit(df)
+    >>> bucketizer.getSplits()
+    [-inf, 0.4, 1.5, inf]
 
     .. versionadded:: 1.6.0
     """
 
     # a placeholder to make it appear in the generated doc
-    self.numBuckets = Param(Params._dummy(), "numBuckets",
+    numBuckets = Param(Params._dummy(), "numBuckets",
                        "Maximum number of buckets (quantiles, or " +
                        "categories) into which data points are grouped. Must be >= 2.")
 
@@ -1001,7 +1003,8 @@ def __init__(self, numBuckets=2, inputCol=None, outputCol=None):
         __init__(self, numBuckets=2, inputCol=None, outputCol=None)
         """
         super(QuantileDiscretizer, self).__init__()
-        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
+                                            self.uid)
         self.numBuckets = Param(self, "numBuckets",
                                 "Maximum number of buckets (quantiles, or " +
                                 "categories) into which data points are grouped. Must be >= 2.")
@@ -1038,7 +1041,7 @@ def _create_model(self, java_model):
         """
         Private method to convert the java_model to a Python model.
         """
-        return Bucketizer(splits=java_model.getSplits(),
+        return Bucketizer(splits=list(java_model.getSplits()),
                           inputCol=self.getInputCol(),
                           outputCol=self.getOutputCol(),
                           _java_model=java_model)

From cfb255fc903f8283ef3fc55cf52e0fed8634f9bb Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 17:19:06 -0800
Subject: [PATCH 03/11] And make sure the generated model works

---
 python/pyspark/ml/feature.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 2d0b936d37e7..b30e08b4ef96 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -988,6 +988,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> bucketizer = qds.fit(df)
     >>> bucketizer.getSplits()
     [-inf, 0.4, 1.5, inf]
+    >>> bucketed = bucketizer.transform(df).collect()
+    >>> bucketed[0]
+    Row(values=0.1, buckets=0.0)
 
     .. versionadded:: 1.6.0
     """
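Note: the `list(java_model.getSplits())` change in PATCH 02 matters because Py4J hands the Scala `Array[Double]` back to Python as a Java-array proxy object rather than a plain list, while the `splits` param machinery wants ordinary Python floats. A hypothetical session illustrating the shape of the conversion (not taken from the patch itself):

    # What Py4J returns vs. what the Param machinery wants (sketch):
    splits = java_model.getSplits()   # Py4J proxy wrapping the JVM double[]
    py_splits = list(splits)          # materialize as a Python list of floats
    print(py_splits)                  # e.g. [-inf, 0.4, 1.5, inf]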
From 254010184eeee33a4a9f8aeda7b77bfd365b18f3 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 17:23:25 -0800
Subject: [PATCH 04/11] pep8 style fix

---
 python/pyspark/ml/feature.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index b30e08b4ef96..07a05d819e77 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -971,6 +971,7 @@ def getDegree(self):
         """
         return self.getOrDefault(self.degree)
 
+
 @inherit_doc
 class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     """
@@ -997,8 +998,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
 
     # a placeholder to make it appear in the generated doc
     numBuckets = Param(Params._dummy(), "numBuckets",
-                   "Maximum number of buckets (quantiles, or " +
-                   "categories) into which data points are grouped. Must be >= 2.")
+                       "Maximum number of buckets (quantiles, or " +
+                       "categories) into which data points are grouped. Must be >= 2.")
 
     @keyword_only
     def __init__(self, numBuckets=2, inputCol=None, outputCol=None):

From 1145ec420590fc2e2cfc554433d9ba9ebabeb821 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 18:19:18 -0800
Subject: [PATCH 05/11] Floating point funtimes with doctest

---
 python/pyspark/ml/feature.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 07a05d819e77..2c7e6a6473ce 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -987,11 +987,14 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> qds = QuantileDiscretizer(numBuckets=2,
     ...     inputCol="values", outputCol="buckets")
     >>> bucketizer = qds.fit(df)
-    >>> bucketizer.getSplits()
-    [-inf, 0.4, 1.5, inf]
+    >>> splits = bucketizer.getSplits()
+    >>> splits[0]
+    -inf
+    >>> splits[1]*10
+    4
     >>> bucketed = bucketizer.transform(df).collect()
-    >>> bucketed[0]
-    Row(values=0.1, buckets=0.0)
+    >>> bucketed[0].buckets
+    0.0
 
     .. versionadded:: 1.6.0
     """
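Note: the doctest churn in PATCH 05 (and in PATCHes 06, 10, and 11 below) is the usual floating-point/doctest problem: doctest compares the printed output character for character, and float `repr` is not stable across interpreter versions. Python 2.7/3.1 introduced the shortest round-tripping repr, so `repr(0.4)` is `'0.4'` there, but older interpreters print the 17-significant-digit form `'0.40000000000000002'`. A plain-Python illustration, independent of Spark:

    # Doctest compares printed output verbatim, so any of these can break it:
    splits = [float("-inf"), 0.4, 1.5, float("inf")]
    print(repr(splits[1]))        # '0.4' on 2.7+; '0.40000000000000002' on <= 2.6
    print(splits[1] * 10)         # arithmetic can surface representation error
    print("%2.1f" % splits[1])    # explicit formatting prints '0.4' everywhere

which is why the series eventually pins the output with an explicit format string.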
From 2afd197cf74ca9552333ddd7a13bbfe8bd35490c Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 1 Dec 2015 18:47:11 -0800
Subject: [PATCH 06/11] Chop off the extra digits

---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 2c7e6a6473ce..136bcbe29f17 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -990,7 +990,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> splits = bucketizer.getSplits()
     >>> splits[0]
     -inf
-    >>> splits[1]*10
+    >>> int(splits[1]*10)
     4
     >>> bucketed = bucketizer.transform(df).collect()
     >>> bucketed[0].buckets

From 27a40986b3a53caaeff8e70042524c6af36c27b8 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sat, 16 Jan 2016 09:39:30 -0800
Subject: [PATCH 07/11] Add the default (2) to the parameter description

---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index d4697125cf65..bf4930219f9a 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1022,7 +1022,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     # a placeholder to make it appear in the generated doc
     numBuckets = Param(Params._dummy(), "numBuckets",
                        "Maximum number of buckets (quantiles, or " +
-                       "categories) into which data points are grouped. Must be >= 2.")
+                       "categories) into which data points are grouped. Must be >= 2. Default 2.")
 
     @keyword_only
     def __init__(self, numBuckets=2, inputCol=None, outputCol=None):

From 5e18778d04266b0fd63ec70b871404bde83b0c58 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sat, 16 Jan 2016 09:40:51 -0800
Subject: [PATCH 08/11] Updated since tags to 2.0.0 since we didn't make the
 1.6 cut

---
 python/pyspark/ml/feature.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index bf4930219f9a..9f31fc2d9030 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1016,7 +1016,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> bucketed[0].buckets
     0.0
 
-    .. versionadded:: 1.6.0
+    .. versionadded:: 2.0.0
     """
 
     # a placeholder to make it appear in the generated doc
@@ -1040,7 +1040,7 @@ def __init__(self, numBuckets=2, inputCol=None, outputCol=None):
         self.setParams(**kwargs)
 
     @keyword_only
-    @since("1.6.0")
+    @since("2.0.0")
     def setParams(self, numBuckets=2, inputCol=None, outputCol=None):
         """
         setParams(self, numBuckets=2, inputCol=None, outputCol=None)
@@ -1049,7 +1049,7 @@ def setParams(self, numBuckets=2, inputCol=None, outputCol=None):
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
-    @since("1.6.0")
+    @since("2.0.0")
     def setNumBuckets(self, value):
         """
         Sets the value of :py:attr:`numBuckets`.
@@ -1057,7 +1057,7 @@ def setNumBuckets(self, value):
         self._paramMap[self.numBuckets] = value
         return self
 
-    @since("1.6.0")
+    @since("2.0.0")
     def getNumBuckets(self):
         """
         Gets the value of numBuckets or its default value.
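Note: `@since` here is pyspark's own decorator, which stamps a `.. versionadded::` directive onto the wrapped method's docstring; that is why the version bump has to touch every accessor individually. Roughly how it works, paraphrased from memory of the pyspark source (treat the details as approximate):

    import re

    def since(version):
        indent_p = re.compile(r'\n( +)')

        def deco(f):
            # Match the docstring's existing indentation, then append the directive.
            indents = indent_p.findall(f.__doc__)
            indent = ' ' * (min(len(m) for m in indents) if indents else 0)
            f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
            return f
        return deco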
From f21ebefc1e0edc16c2eed8e5474033e8d3baf1ae Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 19 Jan 2016 11:23:48 -0800
Subject: [PATCH 09/11] CR feedback

---
 python/pyspark/ml/feature.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 9f31fc2d9030..8365ba6214e0 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -135,9 +135,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
               "specified will be treated as errors.")
 
     @keyword_only
-    def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None):
+    def __init__(self, splits=None, inputCol=None, outputCol=None):
         """
-        __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None)
+        __init__(self, splits=None, inputCol=None, outputCol=None)
         """
         super(Bucketizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
@@ -155,7 +155,6 @@ def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None)
               "provided to cover all Double values; otherwise, values outside the splits " +
               "specified will be treated as errors.")
         kwargs = self.__init__._input_kwargs
-        kwargs.pop("_java_model", None)
         self.setParams(**kwargs)
 
     @keyword_only
@@ -1012,8 +1011,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     -inf
     >>> int(splits[1]*10)
     4
-    >>> bucketed = bucketizer.transform(df).collect()
-    >>> bucketed[0].buckets
+    >>> bucketed = bucketizer.transform(df).head()
+    >>> bucketed.buckets
     0.0
 
     .. versionadded:: 2.0.0
@@ -1044,7 +1043,7 @@ def __init__(self, numBuckets=2, inputCol=None, outputCol=None):
     def setParams(self, numBuckets=2, inputCol=None, outputCol=None):
         """
         setParams(self, numBuckets=2, inputCol=None, outputCol=None)
-        Set the params for the QuantileDiscertizerBase
+        Set the params for the QuantileDiscretizer
         """
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
@@ -1070,8 +1069,7 @@ def _create_model(self, java_model):
         """
         return Bucketizer(splits=list(java_model.getSplits()),
                           inputCol=self.getInputCol(),
-                          outputCol=self.getOutputCol(),
-                          _java_model=java_model)
+                          outputCol=self.getOutputCol())
 
 
 @inherit_doc

From f9e3086b2fa7eae24f22aa2fd32eb644d829e52b Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 19 Jan 2016 15:01:37 -0800
Subject: [PATCH 10/11] Round to one digit

---
 python/pyspark/ml/feature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8365ba6214e0..837eaf623269 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1009,8 +1009,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> splits = bucketizer.getSplits()
     >>> splits[0]
     -inf
-    >>> int(splits[1]*10)
-    4
+    >>> round(splits[1], 1)
+    0.4
     >>> bucketed = bucketizer.transform(df).head()
     >>> bucketed.buckets
     0.0
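Note: PATCH 10's `round(splits[1], 1)` still depends on the `repr` of the resulting float, which runs into exactly the cross-version instability described earlier. Printing through a format string sidesteps `repr` entirely, which is what the final patch does. A plain-Python illustration:

    x = round(0.4, 1)
    print(repr(x))           # '0.4' on 2.7+/3.1+; '0.40000000000000002' on <= 2.6
    print("%2.1f" % x)       # '0.4' on every version: formatting, not repr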
From 194ec6daa6da519490c33af4aa431f91cc7df88d Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 19 Jan 2016 15:41:31 -0800
Subject: [PATCH 11/11] Consistent formatting for the printed split

---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 837eaf623269..53ae60cdb2c3 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1009,7 +1009,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> splits = bucketizer.getSplits()
     >>> splits[0]
     -inf
-    >>> round(splits[1], 1)
+    >>> print("%2.1f" % round(splits[1], 1))
     0.4
     >>> bucketed = bucketizer.transform(df).head()
     >>> bucketed.buckets
     0.0
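For reference, the API at the end of the series: a QuantileDiscretizer is fit to produce a plain Bucketizer. A consolidated usage sketch, assembled from the doctest above (assumes a running SparkContext and `sqlContext`, as the 1.6/2.0-era pyspark.ml doctests do):

    from pyspark.ml.feature import QuantileDiscretizer

    df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
    qds = QuantileDiscretizer(numBuckets=2, inputCol="values", outputCol="buckets")

    bucketizer = qds.fit(df)             # returns a Bucketizer, not a dedicated model class
    print(bucketizer.getSplits())        # e.g. [-inf, 0.4, 1.5, inf]
    bucketed = bucketizer.transform(df)  # adds the "buckets" column
    print(bucketed.head().buckets)       # 0.0

Because the model is just a Bucketizer rebuilt from the learned splits (after PATCH 09 it no longer carries a reference to the Java model), the same transformer can also be constructed directly from known splits when no fitting is needed.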