Skip to content

Commit b46c92b

Browse files
committed
Add a test
1 parent 843721b commit b46c92b

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql
19+
20+
import org.apache.commons.math3.stat.inference.ChiSquareTest
21+
22+
import org.apache.spark.sql.internal.SQLConf
23+
import org.apache.spark.sql.test.SharedSQLContext
24+
25+
26+
class ConfigBehaviorSuite extends QueryTest with SharedSQLContext {
27+
28+
import testImplicits._
29+
30+
test("SPARK-22160 spark.sql.execution.rangeExchange.sampleSizePerPartition") {
31+
// In this test, we run a sort and compute the histogram for partition size post shuffle.
32+
// With a high sample count, the partition size should be more evenly distributed, and has a
33+
// low chi-sq test value.
34+
35+
val numPartitions = 4
36+
37+
def computeChiSquareTest(): Double = {
38+
val n = 10000
39+
// Trigger a sort
40+
val data = spark.range(0, n, 1, 1).sort('id)
41+
.selectExpr("SPARK_PARTITION_ID() pid", "id").as[(Int, Long)].collect()
42+
43+
// Compute histogram for the number of records per partition post sort
44+
val dist = data.groupBy(_._1).map(_._2.length.toLong).toArray
45+
assert(dist.length == 4)
46+
47+
new ChiSquareTest().chiSquare(
48+
Array.fill(numPartitions) { n.toDouble / numPartitions },
49+
dist)
50+
}
51+
52+
withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString) {
53+
// The default chi-sq value should be low
54+
assert(computeChiSquareTest() < 100)
55+
56+
withSQLConf(SQLConf.RANGE_EXCHANGE_SAMPLE_SIZE_PER_PARTITION.key -> "1") {
57+
// If we only sample one point, the range boundaries will be pretty bad and the
58+
// chi-sq value would be very high.
59+
assert(computeChiSquareTest() > 1000)
60+
}
61+
}
62+
}
63+
64+
}

0 commit comments

Comments
 (0)