
Commit bde5452

thunterdb authored and srowen committed
[SPARK-17439][SQL] Fixing compression issues with approximate quantiles and adding more tests
This PR builds on #14976 and fixes a correctness bug that would cause the wrong quantile to be returned for small target errors. It adds 8 unit tests that were failing without the fix.

Author: Timothy Hunter <timhunter@databricks.com>
Author: Sean Owen <sowen@cloudera.com>

Closes #15002 from thunterdb/ml-1783.

(cherry picked from commit 180796e)
Signed-off-by: Sean Owen <sowen@cloudera.com>
1 parent c2378a6 commit bde5452
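
For context on what this fix affects, here is a minimal, hypothetical sketch of how the patched code path is reached from the public DataFrame API. It assumes Spark 2.x with a local SparkSession; the object name, the column name "value", and the generated data are illustrative and not part of the commit. A tight relativeError such as 1e-4 is the "small target error" case the commit message describes.

import org.apache.spark.sql.SparkSession

// Hypothetical end-to-end usage; approxQuantile delegates to QuantileSummaries internally.
object ApproxQuantileExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("approx-quantile").getOrCreate()
    import spark.implicits._

    // A simple numeric column; the bug showed up when relativeError was small.
    val df = (1 to 100000).map(_.toDouble).toDF("value")

    val Array(median, p99) = df.stat.approxQuantile("value", Array(0.5, 0.99), 1e-4)
    println(s"median ~= $median, p99 ~= $p99")

    spark.stop()
  }
}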

File tree

2 files changed: +44 -11 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala

Lines changed: 17 additions & 9 deletions
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.stat
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.{ArrayBuffer, ListBuffer}
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
@@ -119,7 +119,7 @@ object StatFunctions extends Logging {
   class QuantileSummaries(
       val compressThreshold: Int,
       val relativeError: Double,
-      val sampled: ArrayBuffer[Stats] = ArrayBuffer.empty,
+      val sampled: Array[Stats] = Array.empty,
       private[stat] var count: Long = 0L,
       val headSampled: ArrayBuffer[Double] = ArrayBuffer.empty) extends Serializable {
 
@@ -134,7 +134,12 @@ object StatFunctions extends Logging {
     def insert(x: Double): QuantileSummaries = {
       headSampled.append(x)
       if (headSampled.size >= defaultHeadSize) {
-        this.withHeadBufferInserted
+        val result = this.withHeadBufferInserted
+        if (result.sampled.length >= compressThreshold) {
+          result.compress()
+        } else {
+          result
+        }
       } else {
         this
       }
@@ -186,7 +191,7 @@ object StatFunctions extends Logging {
         newSamples.append(sampled(sampleIdx))
         sampleIdx += 1
       }
-      new QuantileSummaries(compressThreshold, relativeError, newSamples, currentCount)
+      new QuantileSummaries(compressThreshold, relativeError, newSamples.toArray, currentCount)
     }
 
     /**
@@ -305,10 +310,10 @@ object StatFunctions extends Logging {
 
   private def compressImmut(
       currentSamples: IndexedSeq[Stats],
-      mergeThreshold: Double): ArrayBuffer[Stats] = {
-    val res: ArrayBuffer[Stats] = ArrayBuffer.empty
+      mergeThreshold: Double): Array[Stats] = {
+    val res = ListBuffer.empty[Stats]
     if (currentSamples.isEmpty) {
-      return res
+      return res.toArray
     }
     // Start for the last element, which is always part of the set.
     // The head contains the current new head, that may be merged with the current element.
@@ -331,8 +336,11 @@
     }
     res.prepend(head)
     // If necessary, add the minimum element:
-    res.prepend(currentSamples.head)
-    res
+    val currHead = currentSamples.head
+    if (currHead.value < head.value) {
+      res.prepend(currentSamples.head)
+    }
+    res.toArray
   }
 }
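
The last hunk above is the core of the correctness fix: after compression, the original minimum is re-added only when it is strictly smaller than the surviving head, so the smallest sample can no longer be stored twice and skew the ranks used for low quantiles. A standalone sketch of that guard follows; the Stat case class and object name are simplified stand-ins for Spark's internal Stats type, not the real thing.

import scala.collection.mutable.ListBuffer

object CompressTailSketch {
  // Simplified stand-in for the summary's samples.
  case class Stat(value: Double)

  // Mirrors the patched tail of compressImmut: prepend the surviving head, then add the
  // original minimum only when it is strictly smaller, so it is never duplicated.
  def finishCompression(res: ListBuffer[Stat], head: Stat, minimum: Stat): Array[Stat] = {
    res.prepend(head)
    if (minimum.value < head.value) {
      res.prepend(minimum)
    }
    res.toArray
  }

  def main(args: Array[String]): Unit = {
    // Before the fix, an unconditional prepend would keep two copies of 1.0 here.
    val samples = finishCompression(ListBuffer.empty[Stat], head = Stat(1.0), minimum = Stat(1.0))
    println(samples.toList) // List(Stat(1.0))
  }
}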

sql/core/src/test/scala/org/apache/spark/sql/execution/stat/ApproxQuantileSuite.scala

Lines changed: 27 additions & 2 deletions
@@ -42,6 +42,20 @@ class ApproxQuantileSuite extends SparkFunSuite {
     summary.compress()
   }
 
+  /**
+   * Interleaves compression and insertions.
+   */
+  private def buildCompressSummary(
+      data: Seq[Double],
+      epsi: Double,
+      threshold: Int): QuantileSummaries = {
+    var summary = new QuantileSummaries(threshold, epsi)
+    data.foreach { x =>
+      summary = summary.insert(x).compress()
+    }
+    summary
+  }
+
   private def checkQuantile(quant: Double, data: Seq[Double], summary: QuantileSummaries): Unit = {
     val approx = summary.query(quant)
     // The rank of the approximation.
@@ -56,8 +70,8 @@
 
   for {
     (seq_name, data) <- Seq(increasing, decreasing, random)
-    epsi <- Seq(0.1, 0.0001)
-    compression <- Seq(1000, 10)
+    epsi <- Seq(0.1, 0.0001) // With a significant value and with full precision
+    compression <- Seq(1000, 10) // This interleaves n so that we test without and with compression
   } {
 
     test(s"Extremas with epsi=$epsi and seq=$seq_name, compression=$compression") {
@@ -77,6 +91,17 @@
       checkQuantile(0.1, data, s)
       checkQuantile(0.001, data, s)
     }
+
+    test(s"Some quantile values with epsi=$epsi and seq=$seq_name, compression=$compression " +
+      s"(interleaved)") {
+      val s = buildCompressSummary(data, epsi, compression)
+      assert(s.count == data.size, s"Found count=${s.count} but data size=${data.size}")
+      checkQuantile(0.9999, data, s)
+      checkQuantile(0.9, data, s)
+      checkQuantile(0.5, data, s)
+      checkQuantile(0.1, data, s)
+      checkQuantile(0.001, data, s)
+    }
   }
 
   // Tests for merging procedure
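
The new interleaved tests reuse checkQuantile, which asserts that the rank of the returned value is close enough to the requested quantile. As a rough, standalone illustration of that guarantee (the helper name and the exact slack term are assumptions, not the suite's code):

object RankCheckSketch {
  // Approximate-quantile guarantee being tested: for a query at quantile q over n points,
  // the rank of the returned value should sit within roughly epsi * n of q * n.
  def rankWithinBound(approx: Double, quant: Double, epsi: Double, data: Seq[Double]): Boolean = {
    val n = data.size
    val rank = data.count(_ <= approx)          // rank of the approximation in the data
    math.abs(rank - quant * n) <= epsi * n + 1  // +1 slack for discreteness (an assumption)
  }

  def main(args: Array[String]): Unit = {
    val data = (1 to 1000).map(_.toDouble)
    println(rankWithinBound(approx = 500.0, quant = 0.5, epsi = 0.01, data = data)) // true
  }
}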
