Skip to content

Commit e677394

Browse files
dongjoon-hyuncloud-fan
authored andcommitted
[SPARK-21041][SQL] SparkSession.range should be consistent with SparkContext.range
## What changes were proposed in this pull request? This PR fixes the inconsistency in `SparkSession.range`. **BEFORE** ```scala scala> spark.range(java.lang.Long.MAX_VALUE - 3, java.lang.Long.MIN_VALUE + 2, 1).collect res2: Array[Long] = Array(9223372036854775804, 9223372036854775805, 9223372036854775806) ``` **AFTER** ```scala scala> spark.range(java.lang.Long.MAX_VALUE - 3, java.lang.Long.MIN_VALUE + 2, 1).collect res2: Array[Long] = Array() ``` ## How was this patch tested? Pass the Jenkins with newly added test cases. Author: Dongjoon Hyun <dongjoon@apache.org> Closes #18257 from dongjoon-hyun/SPARK-21041. (cherry picked from commit a92e095) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent a4d78e4 commit e677394

File tree

2 files changed

+18
-3
lines changed

2 files changed

+18
-3
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import scala.concurrent.{ExecutionContext, Future}
2121
import scala.concurrent.duration.Duration
2222

2323
import org.apache.spark.{InterruptibleIterator, SparkException, TaskContext}
24-
import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD}
24+
import org.apache.spark.rdd.{EmptyRDD, PartitionwiseSampledRDD, RDD}
2525
import org.apache.spark.sql.catalyst.InternalRow
2626
import org.apache.spark.sql.catalyst.expressions._
2727
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, ExpressionCanonicalizer}
@@ -347,8 +347,12 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
347347
}
348348

349349
override def inputRDDs(): Seq[RDD[InternalRow]] = {
350-
sqlContext.sparkContext.parallelize(0 until numSlices, numSlices)
351-
.map(i => InternalRow(i)) :: Nil
350+
val rdd = if (start == end || (start < end ^ 0 < step)) {
351+
new EmptyRDD[InternalRow](sqlContext.sparkContext)
352+
} else {
353+
sqlContext.sparkContext.parallelize(0 until numSlices, numSlices).map(i => InternalRow(i))
354+
}
355+
rdd :: Nil
352356
}
353357

354358
protected override def doProduce(ctx: CodegenContext): String = {

sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,17 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall
191191
checkAnswer(sql("SELECT * FROM range(3)"), Row(0) :: Row(1) :: Row(2) :: Nil)
192192
}
193193
}
194+
195+
test("SPARK-21041 SparkSession.range()'s behavior is inconsistent with SparkContext.range()") {
196+
val start = java.lang.Long.MAX_VALUE - 3
197+
val end = java.lang.Long.MIN_VALUE + 2
198+
Seq("false", "true").foreach { value =>
199+
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> value) {
200+
assert(spark.range(start, end, 1).collect.length == 0)
201+
assert(spark.range(start, start, 1).collect.length == 0)
202+
}
203+
}
204+
}
194205
}
195206

196207
object DataFrameRangeSuite {

0 commit comments

Comments
 (0)