From 59e67c81c5f14e41fddfef01f26222a59227a99f Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Tue, 7 Jun 2016 18:18:16 +0800
Subject: [PATCH 1/9] [SPARK-15819][PYSPARK] Add KMeanSummary in KMeans of PySpark

---
 python/pyspark/ml/clustering.py | 80 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b29b5ac70e6f..8989d1f7ff65 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -316,6 +316,72 @@ def computeCost(self, dataset):
         """
         return self._call_java("computeCost", dataset)
 
+    @since("2.0.0")
+    def hasSummary(self):
+        """
+        Return true if there exists summary of model.
+        """
+        return self.summary is not None
+
+    @property
+    @since("2.0.0")
+    def summary(self):
+        """
+        Gets summary of model on training set.
+        """
+        return KMeansSummary(self._call_java("summary"))
+
+
+class KMeansSummary(JavaWrapper):
+    """
+    Summary of KMeans.
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __init__(self, _java_summary):
+        super(KMeansSummary, self).__init__(_java_summary)
+
+    @property
+    @since("2.0.0")
+    def clusterSizes(self):
+        """
+        Size of (number of data points in) each cluster.
+        """
+        return self._call_java("clusterSizes")
+
+    @property
+    @since("2.0.0")
+    def predictions(self):
+        """
+        return the DataFrame of predictions that is produced by KMeansModel.transform().
+        """
+        return self._call_java("predictions")
+
+    @property
+    @since("2.0.0")
+    def predictionCol(self):
+        """
+        Name for column of predicted clusters.
+        """
+        return self._call_java("predictionCol")
+
+    @property
+    @since("2.0.0")
+    def featuresCol(self):
+        """
+        Name for column of features.
+        """
+        return self._call_java("featuresCol")
+
+    @property
+    @since("2.0.0")
+    def k(self):
+        """
+        Number of clusters.
+        """
+        return self._call_java("k")
+
 
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
@@ -330,6 +396,20 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     >>> df = spark.createDataFrame(data, ["features"])
     >>> kmeans = KMeans(k=2, seed=1)
     >>> model = kmeans.fit(df)
+    >>> summary = model.summary
+    >>> summary.k
+    2
+    >>> summary.predictionCol
+    u'prediction'
+    >>> summary.featuresCol
+    u'features'
+    >>> summary.clusterSizes
+    [2, 2]
+    >>> rows = summary.predictions.collect()
+    >>> rows[0].prediction == rows[1].prediction
+    True
+    >>> rows[2].prediction == rows[3].prediction
+    True
     >>> centers = model.clusterCenters()
     >>> len(centers)
     2

From 112316d05284c0a53c8b8a56b001b6369d4d6d15 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Wed, 8 Jun 2016 18:01:46 +0800
Subject: [PATCH 2/9] fix unit test failure

---
 python/pyspark/ml/clustering.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 8989d1f7ff65..253b67b9919b 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -400,9 +400,9 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     >>> summary.k
     2
     >>> summary.predictionCol
-    u'prediction'
+    'prediction'
     >>> summary.featuresCol
-    u'features'
+    'features'
     >>> summary.clusterSizes
     [2, 2]
     >>> rows = summary.predictions.collect()

From 11e5d8ebae5e0031e912941edd56bbe5a06a4f08 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Wed, 8 Jun 2016 18:36:53 +0800
Subject: [PATCH 3/9] fix unit test failure

---
 python/pyspark/ml/clustering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 253b67b9919b..12106c4d4860 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -332,6 +332,7 @@ def summary(self):
         return KMeansSummary(self._call_java("summary"))
 
 
+@ignore_unicode_prefix
 class KMeansSummary(JavaWrapper):
     """
     Summary of KMeans.

From 73b37633b984a26935415ded93f923d028d13d6f Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Wed, 8 Jun 2016 19:26:47 +0800
Subject: [PATCH 4/9] fix code style

---
 python/pyspark/ml/clustering.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 12106c4d4860..40de77dd4536 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -20,6 +20,8 @@
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
 from pyspark.ml.param.shared import *
 from pyspark.ml.common import inherit_doc
+from pyspark.rdd import ignore_unicode_prefix
+
 
 __all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary',
            'KMeans', 'KMeansModel',
@@ -332,7 +334,6 @@ def summary(self):
         return KMeansSummary(self._call_java("summary"))
 
 
-@ignore_unicode_prefix
 class KMeansSummary(JavaWrapper):
     """
     Summary of KMeans.
@@ -355,7 +356,7 @@ def clusterSizes(self):
     @since("2.0.0")
     def predictions(self):
         """
-        return the DataFrame of predictions that is produced by KMeansModel.transform().
+        return the DataFrame of predictions that is produced by KMeansModel.transform()
         """
         return self._call_java("predictions")
 
@@ -384,6 +385,7 @@ def k(self):
         return self._call_java("k")
 
 
+@ignore_unicode_prefix
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
              JavaMLWritable, JavaMLReadable):

From 356885b55fa5f4cabda833a4eec7e7fcbc083248 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Wed, 8 Jun 2016 21:24:23 +0800
Subject: [PATCH 5/9] trigger build

---
 python/pyspark/ml/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 40de77dd4536..1699855718b9 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -356,7 +356,7 @@ def clusterSizes(self):
     @since("2.0.0")
     def predictions(self):
         """
-        return the DataFrame of predictions that is produced by KMeansModel.transform()
+        return the DataFrame of predictions that is produced by KMeansModel.transform().
         """
         return self._call_java("predictions")
 

From 2c78c69ac4a18f6460dab7d1e5eaa81b011fd658 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Thu, 9 Jun 2016 22:50:06 +0800
Subject: [PATCH 6/9] fix test failure

---
 python/pyspark/ml/clustering.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 1699855718b9..35e7482736ad 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -403,9 +403,9 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     >>> summary.k
     2
     >>> summary.predictionCol
-    'prediction'
+    u'prediction'
     >>> summary.featuresCol
-    'features'
+    u'features'
     >>> summary.clusterSizes
     [2, 2]
     >>> rows = summary.predictions.collect()

From ef9436a1ce0354cf0c185413a66bb8443da51644 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Mon, 17 Oct 2016 11:55:43 +0800
Subject: [PATCH 7/9] address comments

---
 python/pyspark/ml/clustering.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 35e7482736ad..eff8bc309010 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -318,7 +318,7 @@ def computeCost(self, dataset):
         """
         return self._call_java("computeCost", dataset)
 
-    @since("2.0.0")
+    @since("2.1.0")
     def hasSummary(self):
         """
         Return true if there exists summary of model.
@@ -326,10 +326,10 @@ def hasSummary(self):
         return self.summary is not None
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def summary(self):
         """
-        Gets summary of model on training set.
+        Gets summary of model on training set. Or raise exception if no summary is present.
         """
         return KMeansSummary(self._call_java("summary"))
 
@@ -338,14 +338,14 @@ class KMeansSummary(JavaWrapper):
     """
     Summary of KMeans.
 
-    .. versionadded:: 2.0.0
+    .. versionadded:: 2.1.0
     """
 
     def __init__(self, _java_summary):
         super(KMeansSummary, self).__init__(_java_summary)
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def clusterSizes(self):
         """
         Size of (number of data points in) each cluster.
@@ -353,7 +353,7 @@ def clusterSizes(self):
         return self._call_java("clusterSizes")
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def predictions(self):
         """
         return the DataFrame of predictions that is produced by KMeansModel.transform().
@@ -361,7 +361,7 @@ def predictions(self):
         return self._call_java("predictions")
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def predictionCol(self):
         """
         Name for column of predicted clusters.
@@ -369,7 +369,7 @@ def predictionCol(self):
         return self._call_java("predictionCol")
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def featuresCol(self):
         """
         Name for column of features.
@@ -377,7 +377,7 @@ def featuresCol(self):
         return self._call_java("featuresCol")
 
     @property
-    @since("2.0.0")
+    @since("2.1.0")
     def k(self):
         """
         Number of clusters.

From b3a8068ce2a53b52223d4969c24baca59315f03a Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Mon, 28 Nov 2016 21:10:33 +0800
Subject: [PATCH 8/9] extends ClusteringSummary

---
 python/pyspark/ml/clustering.py | 46 ++---------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index eff8bc309010..45481b50330b 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -334,55 +334,13 @@ def summary(self):
         return KMeansSummary(self._call_java("summary"))
 
 
-class KMeansSummary(JavaWrapper):
+class KMeansSummary(ClusteringSummary):
     """
     Summary of KMeans.
 
     .. versionadded:: 2.1.0
     """
-
-    def __init__(self, _java_summary):
-        super(KMeansSummary, self).__init__(_java_summary)
-
-    @property
-    @since("2.1.0")
-    def clusterSizes(self):
-        """
-        Size of (number of data points in) each cluster.
-        """
-        return self._call_java("clusterSizes")
-
-    @property
-    @since("2.1.0")
-    def predictions(self):
-        """
-        return the DataFrame of predictions that is produced by KMeansModel.transform().
-        """
-        return self._call_java("predictions")
-
-    @property
-    @since("2.1.0")
-    def predictionCol(self):
-        """
-        Name for column of predicted clusters.
-        """
-        return self._call_java("predictionCol")
-
-    @property
-    @since("2.1.0")
-    def featuresCol(self):
-        """
-        Name for column of features.
-        """
-        return self._call_java("featuresCol")
-
-    @property
-    @since("2.1.0")
-    def k(self):
-        """
-        Number of clusters.
-        """
-        return self._call_java("k")
+    pass
 
 
 @ignore_unicode_prefix

From 032bb9d4917335bfc5ed748df5902768557997c9 Mon Sep 17 00:00:00 2001
From: Jeff Zhang
Date: Tue, 29 Nov 2016 11:15:56 +0800
Subject: [PATCH 9/9] address the comments

---
 python/pyspark/ml/clustering.py | 60 ++++++++++++++++-----------------
 python/pyspark/ml/tests.py      | 15 ++++++++
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 45481b50330b..d05d737a4e9c 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -20,8 +20,6 @@
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
 from pyspark.ml.param.shared import *
 from pyspark.ml.common import inherit_doc
-from pyspark.rdd import ignore_unicode_prefix
-
 
 __all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary',
            'KMeans', 'KMeansModel',
@@ -298,6 +296,17 @@ def probability(self):
         return self._call_java("probability")
 
 
+class KMeansSummary(ClusteringSummary):
+    """
+    .. note:: Experimental
+
+    Summary of KMeans.
+
+    .. versionadded:: 2.1.0
+    """
+    pass
+
+
 class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
     """
     Model fitted by KMeans.
@@ -318,32 +327,28 @@ def computeCost(self, dataset):
         """
         return self._call_java("computeCost", dataset)
 
+    @property
     @since("2.1.0")
     def hasSummary(self):
         """
-        Return true if there exists summary of model.
+        Indicates whether a training summary exists for this model instance.
         """
-        return self.summary is not None
+        return self._call_java("hasSummary")
 
     @property
     @since("2.1.0")
     def summary(self):
         """
-        Gets summary of model on training set. Or raise exception if no summary is present.
+        Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
+        training set. An exception is thrown if no summary exists.
         """
-        return KMeansSummary(self._call_java("summary"))
-
-
-class KMeansSummary(ClusteringSummary):
-    """
-    Summary of KMeans.
-
-    .. versionadded:: 2.1.0
-    """
-    pass
+        if self.hasSummary:
+            return KMeansSummary(self._call_java("summary"))
+        else:
+            raise RuntimeError("No training summary available for this %s" %
+                               self.__class__.__name__)
 
 
-@ignore_unicode_prefix
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
              JavaMLWritable, JavaMLReadable):
@@ -357,20 +362,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     >>> df = spark.createDataFrame(data, ["features"])
     >>> kmeans = KMeans(k=2, seed=1)
     >>> model = kmeans.fit(df)
-    >>> summary = model.summary
-    >>> summary.k
-    2
-    >>> summary.predictionCol
-    u'prediction'
-    >>> summary.featuresCol
-    u'features'
-    >>> summary.clusterSizes
-    [2, 2]
-    >>> rows = summary.predictions.collect()
-    >>> rows[0].prediction == rows[1].prediction
-    True
-    >>> rows[2].prediction == rows[3].prediction
-    True
     >>> centers = model.clusterCenters()
     >>> len(centers)
     2
@@ -382,6 +373,13 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     True
     >>> rows[2].prediction == rows[3].prediction
     True
+    >>> model.hasSummary
+    True
+    >>> summary = model.summary
+    >>> summary.k
+    2
+    >>> summary.clusterSizes
+    [2, 2]
     >>> kmeans_path = temp_path + "/kmeans"
     >>> kmeans.save(kmeans_path)
     >>> kmeans2 = KMeans.load(kmeans_path)
@@ -390,6 +388,8 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     >>> model_path = temp_path + "/kmeans_model"
     >>> model.save(model_path)
     >>> model2 = KMeansModel.load(model_path)
+    >>> model2.hasSummary
+    False
     >>> model.clusterCenters()[0] == model2.clusterCenters()[0]
     array([ True,  True], dtype=bool)
     >>> model.clusterCenters()[1] == model2.clusterCenters()[1]
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index c0f0d4073564..a0c288a0b71a 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -1129,6 +1129,21 @@ def test_bisecting_kmeans_summary(self):
         self.assertEqual(len(s.clusterSizes), 2)
         self.assertEqual(s.k, 2)
 
+    def test_kmeans_summary(self):
+        data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
+                (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
+        df = self.spark.createDataFrame(data, ["features"])
+        kmeans = KMeans(k=2, seed=1)
+        model = kmeans.fit(df)
+        self.assertTrue(model.hasSummary)
+        s = model.summary
+        self.assertTrue(isinstance(s.predictions, DataFrame))
+        self.assertEqual(s.featuresCol, "features")
+        self.assertEqual(s.predictionCol, "prediction")
+        self.assertTrue(isinstance(s.cluster, DataFrame))
+        self.assertEqual(len(s.clusterSizes), 2)
+        self.assertEqual(s.k, 2)
+
 class OneVsRestTests(SparkSessionTestCase):
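A minimal usage sketch of the API this series ends up with, assuming PySpark 2.1+ (where KMeansModel.hasSummary and KMeansModel.summary exist). The SparkSession setup and the toy data are illustrative only; they mirror the doctest and test_kmeans_summary above.

# Usage sketch for the summary API added by this patch series (PySpark >= 2.1).
# The SparkSession and the toy data are illustrative assumptions, mirroring the
# doctest and test_kmeans_summary in the patches above.
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("kmeans-summary-sketch").getOrCreate()

data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = spark.createDataFrame(data, ["features"])

model = KMeans(k=2, seed=1).fit(df)

# hasSummary is True right after fit(); it is False for a model loaded from disk.
if model.hasSummary:
    summary = model.summary            # KMeansSummary (a ClusteringSummary)
    print(summary.k)                   # 2
    print(summary.clusterSizes)        # [2, 2]
    summary.predictions.show()         # DataFrame produced by model.transform()

spark.stop()

Accessing summary on a model without one (for example, a model reloaded via KMeansModel.load) raises a RuntimeError, matching the behaviour implemented in the final commit.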