
Commit 6ea582e

[SPARK-24181][SQL] Better error message for writing sorted data
## What changes were proposed in this pull request?

The exception message should clearly distinguish sorting and bucketing in `save` and `jdbc` writes. When a user tries to write sorted data using `save` or `insertInto`, it throws an exception whose message says `s"'$operation' does not support bucketing right now"`. We should throw `s"'$operation' does not support sortBy right now"` instead.

## How was this patch tested?

More tests in `DataFrameReaderWriterSuite.scala`

Author: DB Tsai <d_tsai@apple.com>

Closes #21235 from dbtsai/fixException.
1 parent cac9b1d · commit 6ea582e

File tree: 3 files changed (+46, −9 lines)
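
Before the per-file diffs, a minimal runnable sketch of the user-facing change, assuming a local SparkSession (the object name and master URL are mine, not from the commit); the writes and expected messages mirror the new test cases below:

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object SortByMessageDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
    import spark.implicits._
    val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")

    // sortBy without bucketBy: now an AnalysisException (previously an
    // IllegalArgumentException from require):
    //   "sortBy must be used together with bucketBy"
    try df.write.sortBy("j").saveAsTable("tt")
    catch { case e: AnalysisException => println(e.getMessage) }

    // bucketBy plus sortBy with a path-based save now names both operations:
    //   "'save' does not support bucketBy and sortBy right now"
    try df.write.bucketBy(2, "i").sortBy("i").parquet("/tmp/path")
    catch { case e: AnalysisException => println(e.getMessage) }

    spark.stop()
  }
}
```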

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala (8 additions, 4 deletions)

@@ -330,8 +330,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
   }
 
   private def getBucketSpec: Option[BucketSpec] = {
-    if (sortColumnNames.isDefined) {
-      require(numBuckets.isDefined, "sortBy must be used together with bucketBy")
+    if (sortColumnNames.isDefined && numBuckets.isEmpty) {
+      throw new AnalysisException("sortBy must be used together with bucketBy")
     }
 
     numBuckets.map { n =>
@@ -340,8 +340,12 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
   }
 
   private def assertNotBucketed(operation: String): Unit = {
-    if (numBuckets.isDefined || sortColumnNames.isDefined) {
-      throw new AnalysisException(s"'$operation' does not support bucketing right now")
+    if (getBucketSpec.isDefined) {
+      if (sortColumnNames.isEmpty) {
+        throw new AnalysisException(s"'$operation' does not support bucketBy right now")
+      } else {
+        throw new AnalysisException(s"'$operation' does not support bucketBy and sortBy right now")
+      }
     }
   }
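
Read together, the two hunks route every invalid combination to a precise message. A standalone sketch of the new control flow, with `BucketSpec` and `AnalysisException` as simplified stand-ins and the writer state reduced to three plain Options (Spark's real fields are normalized internally):

```scala
case class BucketSpec(numBuckets: Int, bucketColumnNames: Seq[String], sortColumnNames: Seq[String])
class AnalysisException(message: String) extends Exception(message)

class WriterState(
    numBuckets: Option[Int],
    bucketColumnNames: Option[Seq[String]],
    sortColumnNames: Option[Seq[String]]) {

  // sortBy without bucketBy is rejected here, so every caller of getBucketSpec
  // (including assertNotBucketed below) surfaces the sortBy-specific message.
  def getBucketSpec: Option[BucketSpec] = {
    if (sortColumnNames.isDefined && numBuckets.isEmpty) {
      throw new AnalysisException("sortBy must be used together with bucketBy")
    }
    numBuckets.map { n =>
      BucketSpec(n, bucketColumnNames.get, sortColumnNames.getOrElse(Nil))
    }
  }

  // Only reached when bucketing is actually configured; the message then names
  // bucketBy alone or bucketBy and sortBy, matching what the user set.
  def assertNotBucketed(operation: String): Unit = {
    if (getBucketSpec.isDefined) {
      if (sortColumnNames.isEmpty) {
        throw new AnalysisException(s"'$operation' does not support bucketBy right now")
      } else {
        throw new AnalysisException(s"'$operation' does not support bucketBy and sortBy right now")
      }
    }
  }
}
```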

sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala (24 additions, 3 deletions)

@@ -60,7 +60,10 @@ abstract class BucketedWriteSuite extends QueryTest with SQLTestUtils {
 
   test("specify sorting columns without bucketing columns") {
     val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
-    intercept[IllegalArgumentException](df.write.sortBy("j").saveAsTable("tt"))
+    val e = intercept[AnalysisException] {
+      df.write.sortBy("j").saveAsTable("tt")
+    }
+    assert(e.getMessage == "sortBy must be used together with bucketBy;")
   }
 
   test("sorting by non-orderable column") {
@@ -74,7 +77,16 @@ abstract class BucketedWriteSuite extends QueryTest with SQLTestUtils {
     val e = intercept[AnalysisException] {
       df.write.bucketBy(2, "i").parquet("/tmp/path")
     }
-    assert(e.getMessage == "'save' does not support bucketing right now;")
+    assert(e.getMessage == "'save' does not support bucketBy right now;")
+  }
+
+  test("write bucketed and sorted data using save()") {
+    val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
+
+    val e = intercept[AnalysisException] {
+      df.write.bucketBy(2, "i").sortBy("i").parquet("/tmp/path")
+    }
+    assert(e.getMessage == "'save' does not support bucketBy and sortBy right now;")
   }
 
   test("write bucketed data using insertInto()") {
@@ -83,7 +95,16 @@ abstract class BucketedWriteSuite extends QueryTest with SQLTestUtils {
     val e = intercept[AnalysisException] {
       df.write.bucketBy(2, "i").insertInto("tt")
     }
-    assert(e.getMessage == "'insertInto' does not support bucketing right now;")
+    assert(e.getMessage == "'insertInto' does not support bucketBy right now;")
+  }
+
+  test("write bucketed and sorted data using insertInto()") {
+    val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
+
+    val e = intercept[AnalysisException] {
+      df.write.bucketBy(2, "i").sortBy("i").insertInto("tt")
+    }
+    assert(e.getMessage == "'insertInto' does not support bucketBy and sortBy right now;")
   }
 
   private lazy val df = {
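
All of the new suite cases share one intercept-and-compare shape. A hypothetical helper capturing it (the name `assertWriteFails` is mine, not from the suite), assuming ScalaTest's `Assertions` are on the classpath:

```scala
import org.apache.spark.sql.AnalysisException
import org.scalatest.Assertions._

object WriteFailureAssertions {
  // Run a write action and compare the full AnalysisException message.
  // Note the trailing ';' in the expected strings above: AnalysisException
  // appends it when formatting its message, ahead of any line/position info.
  def assertWriteFails(expectedMessage: String)(action: => Unit): Unit = {
    val e = intercept[AnalysisException](action)
    assert(e.getMessage == expectedMessage)
  }
}

// Usage, mirroring one of the tests above:
//   assertWriteFails("'save' does not support bucketBy and sortBy right now;") {
//     df.write.bucketBy(2, "i").sortBy("i").parquet("/tmp/path")
//   }
```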

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala (14 additions, 2 deletions)

@@ -276,7 +276,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
     assert(LastOptions.parameters("doubleOpt") == "6.7")
   }
 
-  test("check jdbc() does not support partitioning or bucketing") {
+  test("check jdbc() does not support partitioning, bucketBy or sortBy") {
     val df = spark.read.text(Utils.createTempDir(namePrefix = "text").getCanonicalPath)
 
     var w = df.write.partitionBy("value")
@@ -287,7 +287,19 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
 
     w = df.write.bucketBy(2, "value")
     e = intercept[AnalysisException](w.jdbc(null, null, null))
-    Seq("jdbc", "bucketing").foreach { s =>
+    Seq("jdbc", "does not support bucketBy right now").foreach { s =>
+      assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT)))
+    }
+
+    w = df.write.sortBy("value")
+    e = intercept[AnalysisException](w.jdbc(null, null, null))
+    Seq("sortBy must be used together with bucketBy").foreach { s =>
+      assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT)))
+    }
+
+    w = df.write.bucketBy(2, "value").sortBy("value")
+    e = intercept[AnalysisException](w.jdbc(null, null, null))
+    Seq("jdbc", "does not support bucketBy and sortBy right now").foreach { s =>
       assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT)))
     }
   }
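
The jdbc() test compares substrings case-insensitively. A small self-contained sketch of that comparison and of why `Locale.ROOT` matters (the Turkish-locale caveat is illustrative, not from the suite):

```scala
import java.util.Locale

object ContainsIgnoreCase {
  // Same comparison the test performs: lowercase both sides with Locale.ROOT
  // so the check does not depend on the JVM's default locale. Under tr-TR,
  // "I".toLowerCase yields the dotless 'ı', which would defeat a naive check.
  def containsIgnoreCase(message: String, fragment: String): Boolean =
    message.toLowerCase(Locale.ROOT).contains(fragment.toLowerCase(Locale.ROOT))

  def main(args: Array[String]): Unit = {
    assert(containsIgnoreCase(
      "'jdbc' does not support bucketBy and sortBy right now;",
      "does not support bucketBy and sortBy right now"))
    println("ok")
  }
}
```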
