diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 b/dev/deps/spark-deps-hadoop-2.7-hive-1.2
index f1bf1160f9039..ca9f150d35b81 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-1.2
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-1.2
@@ -200,7 +200,7 @@ stream/2.9.6//stream-2.9.6.jar
 stringtemplate/3.2.1//stringtemplate-3.2.1.jar
 super-csv/2.2.0//super-csv-2.2.0.jar
 threeten-extra/1.5.0//threeten-extra-1.5.0.jar
-univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar
+univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar
 xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar
 xercesImpl/2.12.0//xercesImpl-2.12.0.jar
 xml-apis/1.4.01//xml-apis-1.4.01.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
index 758e32086024f..706f89be794a6 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -213,7 +213,7 @@ stream/2.9.6//stream-2.9.6.jar
 super-csv/2.2.0//super-csv-2.2.0.jar
 threeten-extra/1.5.0//threeten-extra-1.5.0.jar
 transaction-api/1.1//transaction-api-1.1.jar
-univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar
+univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar
 velocity/1.5//velocity-1.5.jar
 xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar
 xercesImpl/2.12.0//xercesImpl-2.12.0.jar
diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
index 0b8a068bff626..1156cb6e27ec1 100644
--- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -229,7 +229,7 @@ super-csv/2.2.0//super-csv-2.2.0.jar
 threeten-extra/1.5.0//threeten-extra-1.5.0.jar
 token-provider/1.0.1//token-provider-1.0.1.jar
 transaction-api/1.1//transaction-api-1.1.jar
-univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar
+univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar
 velocity/1.5//velocity-1.5.jar
 woodstox-core/5.0.3//woodstox-core-5.0.3.jar
 xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar
diff --git a/pom.xml b/pom.xml
index c5722fae7392a..e3d71518d92b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2348,7 +2348,7 @@
       <dependency>
         <groupId>com.univocity</groupId>
         <artifactId>univocity-parsers</artifactId>
-        <version>2.8.3</version>
+        <version>2.9.0</version>
       </dependency>
       <dependency>
         <groupId>org.apache.hive</groupId>
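The Scala changes below lean on the comment-processing switch available in univocity 2.9.0 (`setCommentProcessingEnabled`, used in the `CSVOptions` hunk further down). A rough standalone sketch of that library call, outside Spark; the sample input and the library's default `#` comment character are illustrative assumptions, not part of this patch:

```scala
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}

object CommentProcessingSketch {
  def main(args: Array[String]): Unit = {
    val settings = new CsvParserSettings()
    // With comment processing switched off, a line that merely starts with the
    // comment character is returned as data instead of being skipped.
    settings.setCommentProcessingEnabled(false)
    val parser = new CsvParser(settings)
    val row = parser.parseLine("#not,a,comment")
    assert(row.toSeq == Seq("#not", "a", "comment"))
  }
}
```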
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala
index 3e83c1dcb4758..efe4188377142 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVExprUtils.scala
@@ -25,8 +25,13 @@ object CSVExprUtils {
    * This is currently being used in CSV reading path and CSV schema inference.
    */
   def filterCommentAndEmpty(iter: Iterator[String], options: CSVOptions): Iterator[String] = {
-    iter.filter { line =>
-      line.trim.nonEmpty && !line.startsWith(options.comment.toString)
+    if (options.isCommentSet) {
+      val commentPrefix = options.comment.toString
+      iter.filter { line =>
+        line.trim.nonEmpty && !line.startsWith(commentPrefix)
+      }
+    } else {
+      iter.filter(_.trim.nonEmpty)
     }
   }

@@ -34,7 +39,7 @@ object CSVExprUtils {
     if (options.isCommentSet) {
       val commentPrefix = options.comment.toString
       iter.dropWhile { line =>
-        line.trim.isEmpty || line.trim.startsWith(commentPrefix)
+        line.trim.isEmpty || line.startsWith(commentPrefix)
       }
     } else {
       iter.dropWhile(_.trim.isEmpty)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index 9d09cab4fd482..f2191fcf35f1a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -220,7 +220,9 @@ class CSVOptions(
     format.setQuote(quote)
     format.setQuoteEscape(escape)
     charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping)
-    format.setComment(comment)
+    if (isCommentSet) {
+      format.setComment(comment)
+    }
     lineSeparatorInWrite.foreach(format.setLineSeparator)

     writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite)
@@ -242,7 +244,11 @@ class CSVOptions(
     format.setQuoteEscape(escape)
     lineSeparator.foreach(format.setLineSeparator)
     charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping)
-    format.setComment(comment)
+    if (isCommentSet) {
+      format.setComment(comment)
+    } else {
+      settings.setCommentProcessingEnabled(false)
+    }
     settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead)
     settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead)
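The suite changes below pin down the user-visible contract. As a rough end-to-end sketch of that behavior (the session setup and sample rows are made up for illustration, not taken from the patch):

```scala
import org.apache.spark.sql.SparkSession

object CsvCommentOptionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("csv-comment-sketch")
      .getOrCreate()
    import spark.implicits._

    // An explicitly configured comment character still skips matching lines.
    val commented = Seq("#skipped", "1,2").toDS()
    assert(spark.read.option("comment", "#").csv(commented).count() == 1)

    // With no comment option set, a row starting with '\u0000' is ordinary
    // data rather than a comment (the SPARK-32614 behavior change).
    val uncommented = Seq("\u0000still,a,row", "1,2,3").toDS()
    assert(spark.read.csv(uncommented).count() == 2)

    spark.stop()
  }
}
```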
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 31e99bef4e243..378695d154dbc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1902,25 +1902,26 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa

   test("SPARK-25387: bad input should not cause NPE") {
     val schema = StructType(StructField("a", IntegerType) :: Nil)
-    val input = spark.createDataset(Seq("\u0000\u0000\u0001234"))
+    val input = spark.createDataset(Seq("\u0001\u0000\u0001234"))

     checkAnswer(spark.read.schema(schema).csv(input), Row(null))
     checkAnswer(spark.read.option("multiLine", true).schema(schema).csv(input), Row(null))
-    assert(spark.read.csv(input).collect().toSet == Set(Row()))
+    assert(spark.read.schema(schema).csv(input).collect().toSet == Set(Row(null)))
   }

   test("SPARK-31261: bad csv input with `columnNameCorruptRecord` should not cause NPE") {
     val schema = StructType(
       StructField("a", IntegerType) ::
       StructField("_corrupt_record", StringType) :: Nil)
-    val input = spark.createDataset(Seq("\u0000\u0000\u0001234"))
+    val input = spark.createDataset(Seq("\u0001\u0000\u0001234"))
     checkAnswer(
       spark.read
         .option("columnNameOfCorruptRecord", "_corrupt_record")
         .schema(schema)
         .csv(input),
-      Row(null, null))
-    assert(spark.read.csv(input).collect().toSet == Set(Row()))
+      Row(null, "\u0001\u0000\u0001234"))
+    assert(spark.read.schema(schema).csv(input).collect().toSet ==
+      Set(Row(null, "\u0001\u0000\u0001234")))
   }

   test("field names of inferred schema shouldn't compare to the first row") {
@@ -2366,6 +2367,17 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
     }
   }

+  test("SPARK-32614: don't treat rows starting with null char as comment") {
+    withTempPath { path =>
+      Seq("\u0000foo", "bar", "baz").toDS.write.text(path.getCanonicalPath)
+      val df = spark.read.format("csv")
+        .option("header", "false")
+        .option("inferSchema", "true")
+        .load(path.getCanonicalPath)
+      assert(df.count() == 3)
+    }
+  }
+
   test("case sensitivity of filters references") {
     Seq(true, false).foreach { filterPushdown =>
       withSQLConf(SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> filterPushdown.toString) {
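For reference, a minimal standalone sketch of the corrupt-record pattern the updated SPARK-31261 test asserts on; the SparkSession setup is illustrative, while the schema, input, and expected row mirror the test above:

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object CorruptRecordSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("corrupt-record-sketch")
      .getOrCreate()
    import spark.implicits._

    val schema = StructType(
      StructField("a", IntegerType) ::
      StructField("_corrupt_record", StringType) :: Nil)

    // The malformed line surfaces in the corrupt-record column instead of
    // being silently dropped as if it were a comment.
    val input = Seq("\u0001\u0000\u0001234").toDS()
    val df = spark.read
      .option("columnNameOfCorruptRecord", "_corrupt_record")
      .schema(schema)
      .csv(input)
    assert(df.collect().toSeq == Seq(Row(null, "\u0001\u0000\u0001234")))

    spark.stop()
  }
}
```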