From ad3a11d5c6ead4133195e81f59852db669de5b56 Mon Sep 17 00:00:00 2001 From: Koert Kuipers Date: Sat, 1 Sep 2018 14:59:35 -0400 Subject: [PATCH 1/2] fix new behavior when quote is changed and fix old behavior when quote is unset --- .../datasources/csv/CSVOptions.scala | 12 ++++- .../execution/datasources/csv/CSVSuite.scala | 45 ++++++++++++++++++- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index fab8d62da0c1..e57e92af4e06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -173,7 +173,11 @@ class CSVOptions( writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) writerSettings.setNullValue(nullValue) - writerSettings.setEmptyValue("\"\"") + if (quote == '\u0000') { + writerSettings.setEmptyValue(nullValue) + } else { + writerSettings.setEmptyValue(s"${quote}${quote}") + } writerSettings.setSkipEmptyLines(true) writerSettings.setQuoteAllFields(quoteAll) writerSettings.setQuoteEscapingEnabled(escapeQuotes) @@ -194,7 +198,11 @@ class CSVOptions( settings.setInputBufferSize(inputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) - settings.setEmptyValue("") + if (quote == '\u0000') { + settings.setEmptyValue(null) + } else { + settings.setEmptyValue("") + } settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) settings diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 5a1d6679ebbd..f2ebc7d29a4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1357,11 +1357,54 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te checkAnswer(computed, expected) } - // Keeps the old behavior where empty string us coerced to nullValue is not passed. + + // Checks for new behavior where an empty string is not coerced to null when `nullValue` is + // set to anything but an empty string literal and quote character is changed. + withTempPath { path => + df.write + .option("nullValue", "-") + .option("quote", "'") + .csv(path.getAbsolutePath) + val computed = spark.read + .option("nullValue", "-") + .option("quote", "'") + .schema(df.schema) + .csv(path.getAbsolutePath) + val expected = Seq( + (1, "John Doe"), + (2, ""), + (3, litNull), + (4, litNull) + ).toDF("id", "name") + + checkAnswer(computed, expected) + } + + // Keeps the old behavior where empty string is coerced to nullValue if not passed. + withTempPath { path => + df.write + .csv(path.getAbsolutePath) + val computed = spark.read + .schema(df.schema) + .csv(path.getAbsolutePath) + val expected = Seq( + (1, "John Doe"), + (2, litNull), + (3, "-"), + (4, litNull) + ).toDF("id", "name") + + checkAnswer(computed, expected) + } + + // Keeps the old behavior where empty string is coerced to nullValue if not passed + // with quotes disabled. withTempPath { path => df.write + .option("quote", "") .csv(path.getAbsolutePath) val computed = spark.read + .option("quote", "") .schema(df.schema) .csv(path.getAbsolutePath) val expected = Seq( From 57724049dbe40d4b8b38a249084feff4363b99b8 Mon Sep 17 00:00:00 2001 From: Koert Kuipers Date: Sat, 1 Sep 2018 16:13:50 -0400 Subject: [PATCH 2/2] minimize changes, also there is no point in quoting empty strings if null value is also set to empty string --- .../spark/sql/execution/datasources/csv/CSVOptions.scala | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index e57e92af4e06..60d26ff06755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -173,7 +173,7 @@ class CSVOptions( writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) writerSettings.setNullValue(nullValue) - if (quote == '\u0000') { + if (nullValue == "" || quote == '\u0000') { writerSettings.setEmptyValue(nullValue) } else { writerSettings.setEmptyValue(s"${quote}${quote}") @@ -198,11 +198,7 @@ class CSVOptions( settings.setInputBufferSize(inputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) - if (quote == '\u0000') { - settings.setEmptyValue(null) - } else { - settings.setEmptyValue("") - } + settings.setEmptyValue("") settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) settings