From 4584b2db0e39a4a67685110e35a243757a858319 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 16 Aug 2018 12:57:04 +0800 Subject: [PATCH 1/2] Revert "[SPARK-24924][SQL] Add mapping for built-in Avro data source" This reverts commit 58353d7f4baa8102c3d2f4777a5c407f14993306. --- .../org/apache/spark/sql/avro/AvroSuite.scala | 10 +--------- .../sql/execution/datasources/DataSource.scala | 8 ++++++-- .../org/apache/spark/sql/SQLQuerySuite.scala | 16 ++++++++++++++++ .../sql/sources/ResolvedDataSourceSuite.scala | 14 ++++++++++++-- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index c4f4d8efd6df..1ffebaa0aeb3 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -35,7 +35,6 @@ import org.apache.commons.io.FileUtils import org.apache.spark.SparkException import org.apache.spark.sql._ -import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.types._ @@ -76,13 +75,6 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils { }, new GenericDatumReader[Any]()).getSchema.toString(false) } - test("resolve avro data source") { - Seq("avro", "com.databricks.spark.avro").foreach { provider => - assert(DataSource.lookupDataSource(provider, spark.sessionState.conf) === - classOf[org.apache.spark.sql.avro.AvroFileFormat]) - } - } - test("reading from multiple paths") { val df = spark.read.format("avro").load(episodesAvro, episodesAvro) assert(df.count == 16) @@ -503,7 +495,7 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils { // get the same values back. withTempPath { tempDir => val name = "AvroTest" - val namespace = "org.apache.spark.avro" + val namespace = "com.databricks.spark.avro" val parameters = Map("recordName" -> name, "recordNamespace" -> namespace) val avroDir = tempDir + "/namedAvro" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index b1a10fdb6020..0c3d9a4895fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -571,7 +571,6 @@ object DataSource extends Logging { val nativeOrc = classOf[OrcFileFormat].getCanonicalName val socket = classOf[TextSocketSourceProvider].getCanonicalName val rate = classOf[RateStreamProvider].getCanonicalName - val avro = "org.apache.spark.sql.avro.AvroFileFormat" Map( "org.apache.spark.sql.jdbc" -> jdbc, @@ -593,7 +592,6 @@ object DataSource extends Logging { "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm, "org.apache.spark.ml.source.libsvm" -> libsvm, "com.databricks.spark.csv" -> csv, - "com.databricks.spark.avro" -> avro, "org.apache.spark.sql.execution.streaming.TextSocketSourceProvider" -> socket, "org.apache.spark.sql.execution.streaming.RateSourceProvider" -> rate ) @@ -637,6 +635,12 @@ object DataSource extends Logging { "Hive built-in ORC data source must be used with Hive support enabled. " + "Please use the native ORC data source by setting 'spark.sql.orc.impl' to " + "'native'") + } else if (provider1.toLowerCase(Locale.ROOT) == "avro" || + provider1 == "com.databricks.spark.avro") { + throw new AnalysisException( + s"Failed to find data source: ${provider1.toLowerCase(Locale.ROOT)}. " + + "Please find an Avro package at " + + "http://spark.apache.org/third-party-projects.html") } else { throw new ClassNotFoundException( s"Failed to find data source: $provider1. Please find packages at " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 84efd2b7a1dc..780fb6f0353b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1699,6 +1699,22 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } assert(e.message.contains("Hive built-in ORC data source must be used with Hive support")) + e = intercept[AnalysisException] { + sql(s"select id from `com.databricks.spark.avro`.`file_path`") + } + assert(e.message.contains("Failed to find data source: com.databricks.spark.avro.")) + + // data source type is case insensitive + e = intercept[AnalysisException] { + sql(s"select id from Avro.`file_path`") + } + assert(e.message.contains("Failed to find data source: avro.")) + + e = intercept[AnalysisException] { + sql(s"select id from avro.`file_path`") + } + assert(e.message.contains("Failed to find data source: avro.")) + e = intercept[AnalysisException] { sql(s"select id from `org.apache.spark.sql.sources.HadoopFsRelationProvider`.`file_path`") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala index 95460fa70d8f..4adbff5c663b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala @@ -77,9 +77,19 @@ class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext { } test("error message for unknown data sources") { - val error = intercept[ClassNotFoundException] { + val error1 = intercept[AnalysisException] { + getProvidingClass("avro") + } + assert(error1.getMessage.contains("Failed to find data source: avro.")) + + val error2 = intercept[AnalysisException] { + getProvidingClass("com.databricks.spark.avro") + } + assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro.")) + + val error3 = intercept[ClassNotFoundException] { getProvidingClass("asfdwefasdfasdf") } - assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf.")) + assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf.")) } } From 656790e7cc05ea1fd31e768ed1a05c18b7881de6 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 16 Aug 2018 13:42:19 +0800 Subject: [PATCH 2/2] improve error message --- .../apache/spark/sql/execution/datasources/DataSource.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 0c3d9a4895fe..61f6290ef5ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -639,8 +639,8 @@ object DataSource extends Logging { provider1 == "com.databricks.spark.avro") { throw new AnalysisException( s"Failed to find data source: ${provider1.toLowerCase(Locale.ROOT)}. " + - "Please find an Avro package at " + - "http://spark.apache.org/third-party-projects.html") + "AVRO is built-in data source since Spark 2.4. Please deploy the application " + + "as per https://spark.apache.org/docs/latest/avro-data-source.html#deploying") } else { throw new ClassNotFoundException( s"Failed to find data source: $provider1. Please find packages at " +