1717
1818package org .apache .spark .sql .execution .columnar
1919
20- import scala .collection .JavaConverters ._
21-
2220import org .apache .commons .lang3 .StringUtils
2321
2422import org .apache .spark .network .util .JavaUtils
@@ -31,7 +29,7 @@ import org.apache.spark.sql.catalyst.plans.logical
3129import org .apache .spark .sql .catalyst .plans .logical .Statistics
3230import org .apache .spark .sql .execution .SparkPlan
3331import org .apache .spark .storage .StorageLevel
34- import org .apache .spark .util .CollectionAccumulator
32+ import org .apache .spark .util .LongAccumulator
3533
3634
3735object InMemoryRelation {
@@ -63,8 +61,7 @@ case class InMemoryRelation(
6361 @ transient child : SparkPlan ,
6462 tableName : Option [String ])(
6563 @ transient var _cachedColumnBuffers : RDD [CachedBatch ] = null ,
66- val batchStats : CollectionAccumulator [InternalRow ] =
67- child.sqlContext.sparkContext.collectionAccumulator[InternalRow ])
64+ val batchStats : LongAccumulator = child.sqlContext.sparkContext.longAccumulator)
6865 extends logical.LeafNode with MultiInstanceRelation {
6966
7067 override protected def innerChildren : Seq [QueryPlan [_]] = Seq (child)
@@ -74,21 +71,12 @@ case class InMemoryRelation(
7471 @ transient val partitionStatistics = new PartitionStatistics (output)
7572
7673 override lazy val statistics : Statistics = {
77- if (batchStats.value.isEmpty ) {
74+ if (batchStats.value == 0L ) {
7875 // Underlying columnar RDD hasn't been materialized, no useful statistics information
7976 // available, return the default statistics.
8077 Statistics (sizeInBytes = child.sqlContext.conf.defaultSizeInBytes)
8178 } else {
82- // Underlying columnar RDD has been materialized, required information has also been
83- // collected via the `batchStats` accumulator.
84- val sizeOfRow : Expression =
85- BindReferences .bindReference(
86- output.map(a => partitionStatistics.forAttribute(a).sizeInBytes).reduce(Add ),
87- partitionStatistics.schema)
88-
89- val sizeInBytes =
90- batchStats.value.asScala.map(row => sizeOfRow.eval(row).asInstanceOf [Long ]).sum
91- Statistics (sizeInBytes = sizeInBytes)
79+ Statistics (sizeInBytes = batchStats.value.longValue)
9280 }
9381 }
9482
@@ -139,10 +127,10 @@ case class InMemoryRelation(
139127 rowCount += 1
140128 }
141129
130+ batchStats.add(totalSize)
131+
142132 val stats = InternalRow .fromSeq(columnBuilders.map(_.columnStats.collectedStatistics)
143133 .flatMap(_.values))
144-
145- batchStats.add(stats)
146134 CachedBatch (rowCount, columnBuilders.map { builder =>
147135 JavaUtils .bufferToArray(builder.build())
148136 }, stats)
0 commit comments