
Commit 4f992fc

Author: jinxing
show distribution with probabilities.
1 parent 6a96c3b commit 4f992fc

File tree

4 files changed (+37 additions, -99 deletions)


core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java

Lines changed: 12 additions & 37 deletions

@@ -47,6 +47,7 @@
 import org.apache.spark.shuffle.IndexShuffleBlockResolver;
 import org.apache.spark.shuffle.ShuffleWriter;
 import org.apache.spark.storage.*;
+import org.apache.spark.util.Distribution;
 import org.apache.spark.util.Utils;
 
 /**
@@ -181,48 +182,22 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
         }
       }
       writeMetrics.incUnderestimatedBlocksSize(underestimatedBlocksSize);
-      if (logger.isDebugEnabled()) {
+      if (logger.isDebugEnabled() && partitionLengths.length > 0) {
         int underestimatedBlocksNum = 0;
-        // Distribution of sizes in MapStatus. The ranges are: [0, 1k), [1k, 10k), [10k, 100k),
-        // [100k, 1m), [1m, 10m), [10m, 100m), [100m, 1g), [1g, 10g), [10g, Long.MaxValue).
-        int[] lenDistribution = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+        // Distribution of sizes in MapStatus.
+        double[] cp = new double[partitionLengths.length];
         for (int i = 0; i < partitionLengths.length; i++) {
-          long len = partitionLengths[i];
-          if (len > mapStatus.getSizeForBlock(i)) {
-            underestimatedBlocksNum++;
-          }
-          if (len >= 0L && len < 1024L) {
-            lenDistribution[0]++;
-          } else if (len >= 1024L && len < 10240L) {
-            lenDistribution[1]++;
-          } else if (len >= 10240L && len < 102400L) {
-            lenDistribution[2]++;
-          } else if (len >= 102400L && len < 1048576L) {
-            lenDistribution[3]++;
-          } else if (len >= 1048576L && len < 10485760L) {
-            lenDistribution[4]++;
-          } else if (len >= 10485760L && len < 104857600L) {
-            lenDistribution[5]++;
-          } else if (len >= 104857600L && len < 1073741824L) {
-            lenDistribution[6]++;
-          } else if (len >= 1073741824L && len < 10737418240L) {
-            lenDistribution[7]++;
-          } else {
-            lenDistribution[8]++;
-          }
-        }
-        String[] ranges = {"[0, 1k)", "[1k, 10k)", "[10k, 100k)", "[100k, 1m)", "[1m, 10m)",
-          "[10m, 100m)", "[100m, 1g)", "[1g, 10g)", ">10g"};
-        String[] rangesAndDistribute = new String[9];
-        for (int j = 0; j < 9; j++) {
-          rangesAndDistribute[j] = ranges[j] + ":" + lenDistribution[j];
+          cp[i] = partitionLengths[i];
         }
+        Distribution distribution = new Distribution(cp, 0, cp.length);
+        double[] probabilities = {0.0, 0.25, 0.5, 0.75, 1.0};
+        String distributionStr = distribution.getQuantiles(probabilities).mkString(", ");
         logger.debug("For task {}.{} in stage {} (TID {}), the block sizes in MapStatus are " +
           "inaccurate (average is {}, {} blocks underestimated, size of underestimated is {})," +
-          " distribution is {}.", taskContext.partitionId(), taskContext.attemptNumber(),
-          taskContext.stageId(), taskContext.taskAttemptId(), hc.getAvgSize(),
-          underestimatedBlocksNum, underestimatedBlocksSize,
-          String.join(", ", rangesAndDistribute));
+          " distribution at the given probabilities(0, 0.25, 0.5, 0.75, 1.0) is {}.",
+          taskContext.partitionId(), taskContext.attemptNumber(), taskContext.stageId(),
+          taskContext.taskAttemptId(), hc.getAvgSize(),
+          underestimatedBlocksNum, underestimatedBlocksSize, distributionStr);
       }
     }
   }
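
For reference, a minimal standalone sketch (in Scala, for brevity) of the quantile summary this writer, and UnsafeShuffleWriter below, now builds. This is not part of the commit: the partition lengths are made-up sample values, and the snippet assumes it is compiled under the org.apache.spark namespace, since Distribution is private[spark].

    import org.apache.spark.util.Distribution

    // Hypothetical block sizes in bytes, for illustration only.
    val partitionLengths: Array[Long] = Array(512L, 4096L, 65536L, 1048576L, 268435456L)

    // The Distribution constructor requires a non-empty range (and sorts the data in place),
    // which is why the writers also guard with partitionLengths.length > 0.
    if (partitionLengths.nonEmpty) {
      val data = partitionLengths.map(_.toDouble)
      val distribution = new Distribution(data, 0, data.length)
      // Quantiles at the min, the quartiles, and the max of the block sizes.
      val probabilities = Array(0.0, 0.25, 0.5, 0.75, 1.0)
      println(distribution.getQuantiles(probabilities).mkString(", "))
      // Prints something like: 512.0, 4096.0, 65536.0, 1048576.0, 2.68435456E8
    }

A single sorted pass over the sizes replaces the nine hand-maintained buckets, and the debug line stays one short string no matter how many partitions there are.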

core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java

Lines changed: 12 additions & 37 deletions

@@ -54,6 +54,7 @@
 import org.apache.spark.storage.BlockManager;
 import org.apache.spark.storage.TimeTrackingOutputStream;
 import org.apache.spark.unsafe.Platform;
+import org.apache.spark.util.Distribution;
 import org.apache.spark.util.Utils;
 
 @Private
@@ -238,48 +239,22 @@ void closeAndWriteOutput() throws IOException {
         }
       }
       writeMetrics.incUnderestimatedBlocksSize(underestimatedBlocksSize);
-      if (logger.isDebugEnabled()) {
+      if (logger.isDebugEnabled() && partitionLengths.length > 0) {
        int underestimatedBlocksNum = 0;
-        // Distribution of sizes in MapStatus. The ranges are: [0, 1k), [1k, 10k), [10k, 100k),
-        // [100k, 1m), [1m, 10m), [10m, 100m), [100m, 1g), [1g, 10g), [10g, Long.MaxValue).
-        int[] lenDistribution = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+        // Distribution of sizes in MapStatus.
+        double[] cp = new double[partitionLengths.length];
         for (int i = 0; i < partitionLengths.length; i++) {
-          long len = partitionLengths[i];
-          if (len > mapStatus.getSizeForBlock(i)) {
-            underestimatedBlocksNum++;
-          }
-          if (len >= 0L && len < 1024L) {
-            lenDistribution[0]++;
-          } else if (len >= 1024L && len < 10240L) {
-            lenDistribution[1]++;
-          } else if (len >= 10240L && len < 102400L) {
-            lenDistribution[2]++;
-          } else if (len >= 102400L && len < 1048576L) {
-            lenDistribution[3]++;
-          } else if (len >= 1048576L && len < 10485760L) {
-            lenDistribution[4]++;
-          } else if (len >= 10485760L && len < 104857600L) {
-            lenDistribution[5]++;
-          } else if (len >= 104857600L && len < 1073741824L) {
-            lenDistribution[6]++;
-          } else if (len >= 1073741824L && len < 10737418240L) {
-            lenDistribution[7]++;
-          } else {
-            lenDistribution[8]++;
-          }
-        }
-        String[] ranges = {"[0, 1k)", "[1k, 10k)", "[10k, 100k)", "[100k, 1m)", "[1m, 10m)",
-          "[10m, 100m)", "[100m, 1g)", "[1g, 10g)", ">10g"};
-        String[] rangesAndDistribute = new String[9];
-        for (int j = 0; j < 9; j++) {
-          rangesAndDistribute[j] = ranges[j] + ":" + lenDistribution[j];
+          cp[i] = partitionLengths[i];
         }
+        Distribution distribution = new Distribution(cp, 0, cp.length);
+        double[] probabilities = {0.0, 0.25, 0.5, 0.75, 1.0};
+        String distributionStr = distribution.getQuantiles(probabilities).mkString(", ");
         logger.debug("For task {}.{} in stage {} (TID {}), the block sizes in MapStatus are " +
           "inaccurate (average is {}, {} blocks underestimated, size of underestimated is {})," +
-          " distribution is {}.", taskContext.partitionId(), taskContext.attemptNumber(),
-          taskContext.stageId(), taskContext.taskAttemptId(), hc.getAvgSize(),
-          underestimatedBlocksNum, underestimatedBlocksSize,
-          String.join(", ", rangesAndDistribute));
+          " distribution at the given probabilities(0, 0.25, 0.5, 0.75, 1.0) is {}.",
+          taskContext.partitionId(), taskContext.attemptNumber(), taskContext.stageId(),
+          taskContext.taskAttemptId(), hc.getAvgSize(),
+          underestimatedBlocksNum, underestimatedBlocksSize, distributionStr);
       }
     }
  }

core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala

Lines changed: 12 additions & 24 deletions

@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.scheduler.{HighlyCompressedMapStatus, MapStatus}
 import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
 import org.apache.spark.storage.ShuffleBlockId
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{Distribution, Utils}
 import org.apache.spark.util.collection.ExternalSorter
 
 private[spark] class SortShuffleWriter[K, V, C](
@@ -78,30 +78,18 @@ private[spark] class SortShuffleWriter[K, V, C](
         val underestimatedLengths = partitionLengths.filter(_ > hc.getAvgSize)
         writeMetrics.incUnderestimatedBlocksSize(underestimatedLengths.sum)
         if (log.isDebugEnabled()) {
-          // Distribution of sizes in MapStatus. The ranges are: [0, 1k), [1k, 10k), [10k, 100k),
-          // [100k, 1m), [1m, 10m), [10m, 100m), [100m, 1g), [1g, 10g), [10g, Long.MaxValue).
-          val lenDistribution = Array[Int](0, 0, 0, 0, 0, 0, 0, 0, 0)
-          partitionLengths.foreach {
-            case len: Long if len >= 0L && len < 1024L => lenDistribution(0) += 1
-            case len: Long if len >= 1024L && len < 10240L => lenDistribution(1) += 1
-            case len: Long if len >= 10240L && len < 102400L => lenDistribution(2) += 1
-            case len: Long if len >= 102400L && len < 1048576L => lenDistribution(3) += 1
-            case len: Long if len >= 1048576L && len < 10485760L => lenDistribution(4) += 1
-            case len: Long if len >= 10485760L && len < 104857600L => lenDistribution(5) += 1
-            case len: Long if len >= 104857600L && len < 1073741824L => lenDistribution(6) += 1
-            case len: Long if len >= 1073741824L && len < 10737418240L => lenDistribution(7) += 1
-            case len => lenDistribution(8) += 1
+          // Distribution of sizes in MapStatus.
+          Distribution(partitionLengths.map(_.toDouble)) match {
+            case Some(distribution) =>
+              val distributionStr = distribution.getQuantiles().mkString(", ")
+              logDebug(s"For task ${context.partitionId()}.${context.attemptNumber()} in stage" +
+                s" ${context.stageId()} (TID ${context.taskAttemptId()}), the block sizes in" +
+                s" MapStatus are inaccurate (average is ${hc.getAvgSize}," +
+                s" ${underestimatedLengths.length} blocks underestimated, sum of sizes is" +
+                s" ${underestimatedLengths.sum}), distribution at the given probabilities" +
+                s" (0, 0.25, 0.5, 0.75, 1.0) is $distributionStr.")
+            case None => // no-op
           }
-          val ranges = List[String]("[0, 1k)", "[1k, 10k)", "[10k, 100k)", "[100k, 1m)",
-            "[1m, 10m)", "[10m, 100m)", "[100m, 1g)", "[1g, 10g)", ">10g")
-          val distributeStr = ranges.zip(lenDistribution).map {
-            case (range, num) => s"$range:$num"
-          }.mkString(", ")
-          logDebug(s"For task ${context.partitionId()}.${context.attemptNumber()} in stage " +
-            s"${context.stageId()} (TID ${context.taskAttemptId()}), " +
-            s"the block sizes in MapStatus are inaccurate (average is ${hc.getAvgSize}, " +
-            s"${underestimatedLengths.length} blocks underestimated, " +
-            s"sum of sizes is ${underestimatedLengths.sum}), distribution is $distributeStr.")
         }
       case _ => // no-op
     }
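
The Scala writer reaches the same summary through Distribution's companion object, which returns None for empty input instead of needing an explicit length check. A rough sketch of that pattern, again with made-up sizes and assuming code that lives under org.apache.spark (Distribution is private[spark]):

    import org.apache.spark.util.Distribution

    // Hypothetical block sizes; an empty array would simply match None below.
    val partitionLengths: Array[Long] = Array(1024L, 10240L, 102400L, 1048576L)

    Distribution(partitionLengths.map(_.toDouble)) match {
      case Some(distribution) =>
        // Called without arguments, getQuantiles uses the default probabilities
        // (0, 0.25, 0.5, 0.75, 1.0), matching what the debug message reports.
        println(distribution.getQuantiles().mkString(", "))
      case None => // nothing to log for an empty map output
    }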

core/src/main/scala/org/apache/spark/util/Distribution.scala

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va
    * given from 0 to 1
    * @param probabilities
    */
-  def getQuantiles(probabilities: Traversable[Double] = defaultProbabilities)
+  def getQuantiles(probabilities: Array[Double] = defaultProbabilities)
     : IndexedSeq[Double] = {
     probabilities.toIndexedSeq.map { p: Double => data(closestIndex(p)) }
   }
