[SPARK-14272][ML] Add Loglikelihood in GaussianMixtureSummary

zhengruifeng · yanboliang · commit 8ccca9170f98 · 2017-01-19T03:46:37.000-08:00
## What changes were proposed in this pull request? add loglikelihood in GMM.summary ## How was this patch tested? added tests Author: Zheng RuiFeng <ruifengz@foxmail.com> Author: Ruifeng Zheng <ruifengz@foxmail.com> Closes #12064 from zhengruifeng/gmm_metric.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -416,7 +416,7 @@ class GaussianMixture @Since("2.0.0") (
 
     val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
-      $(predictionCol), $(probabilityCol), $(featuresCol), $(k))
+      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -674,6 +674,7 @@ private class ExpectationAggregator(
  *                        in `predictions`.
  * @param featuresCol  Name for column of features in `predictions`.
  * @param k  Number of clusters.
+ * @param logLikelihood  Total log-likelihood for this model on the given data.
  */
 @Since("2.0.0")
 @Experimental
@@ -682,7 +683,9 @@ class GaussianMixtureSummary private[clustering] (
     predictionCol: String,
     @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+    k: Int,
+    @Since("2.2.0") val logLikelihood: Double)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
   /**
    * Probability of each cluster.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.BernoulliCellSampler
 
 /**
- * Helper methods to load, save and pre-process data used in ML Lib.
+ * Helper methods to load, save and pre-process data used in MLLib.
  */
 @Since("0.8.0")
 object MLUtils extends Logging {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -207,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
                 [,1]     [,2]
       [1,] 0.2961543 0.160783
       [2,] 0.1607830 1.008878
+
+      model$loglik
+
+      [1] -46.89499
      */
     val weights = Array(0.5333333, 0.4666667)
     val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351))
@@ -219,6 +223,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val expected = new GaussianMixtureModel("dummy", weights, gaussians)
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
+
+    val llk = actual.summary.logLikelihood
+    assert(llk ~== -46.89499 absTol 1E-5)
   }
 
   test("upper triangular matrix unpacking") {
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
@@ -46,7 +46,10 @@ object MimaExcludes {
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
     // [SPARK-19148][SQL] do not expose the external table concept in Catalog
-    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable")
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
+
+    // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 
   // Exclude rules for 2.1.x
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
@@ -175,6 +175,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     3
     >>> summary.clusterSizes
     [2, 2, 2]
+    >>> summary.logLikelihood
+    8.14636...
     >>> weights = model.weights
     >>> len(weights)
     3
@@ -281,6 +283,14 @@ def probability(self):
         """
         return self._call_java("probability")
 
+    @property
+    @since("2.2.0")
+    def logLikelihood(self):
+        """
+        Total log-likelihood for this model on the given data.
+        """
+        return self._call_java("logLikelihood")
+
 
 class KMeansSummary(ClusteringSummary):
     """