From 5b61de71715b07c58ca337eccc6b98a52f533a17 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Wed, 17 Mar 2021 13:44:55 +0900
Subject: [PATCH 1/3] Respect the default input buffer size in Univocity

---
 .../spark/sql/catalyst/csv/CSVOptions.scala   |  3 ---
 .../execution/datasources/csv/CSVSuite.scala  | 21 +++++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index ec405994eadef..c6a80611ea7c4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -166,8 +166,6 @@ class CSVOptions(
 
   val quoteAll = getBool("quoteAll", false)
 
-  val inputBufferSize = 128
-
   /**
    * The max error content length in CSV parser/writer exception message.
    */
@@ -259,7 +257,6 @@ class CSVOptions(
     settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead)
     settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead)
     settings.setReadInputOnSeparateThread(false)
-    settings.setInputBufferSize(inputBufferSize)
     settings.setMaxColumns(maxColumns)
     settings.setNullValue(nullValue)
     settings.setEmptyValue(emptyValueInRead)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 034503c50577f..1d1dacd69d1fc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -2452,6 +2452,27 @@ abstract class CSVSuite
       assert(result.sameElements(exceptResults))
     }
   }
+
+  test("SPARK-34768: counting a long record with ignoreTrailingWhiteSpace set to true") {
+    val line = "XX |XXX-XXXX |XXXXXX " +
+      "|XXXXXXXX|XXXXX |XXXXXX " +
+      "|X|XXXXXXX|XXXXXXXX|XXXX|XXXXXXXXXXXXXXX |XXXXXXXXXXX" +
+      "|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX|XXXXXX " +
+      "|XXXXXXXXXXXXXX|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX" +
+      "|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX|XXXXXX " +
+      "|XXXXXXXXX|XXXXXX |XXXXXXX| " +
+      "|| || " +
+      "|| ||XXXX-XX-XX XX:XX:XX.XXXXXXX" +
+      "||XXXXX.XXXXXXXXXXXXXXX|XXXXX.XXXXXXXXXXXXXX" +
+      "|XXXXX.XXXXXXXXXXXXXXX|X|XXXXXX |X"
+    withTempPath { path =>
+      Seq(line).toDF.write.text(path.getAbsolutePath)
+      assert(spark.read.format("csv")
+        .option("delimiter", "|")
+        .option("inferSchema", "true")
+        .option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1)
+    }
+  }
 }
 
 class CSVv1Suite extends CSVSuite {

From dd1e0e2ec0fd06e1a7a57a07dcd58bba04b3eba9 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Wed, 17 Mar 2021 15:21:36 +0900
Subject: [PATCH 2/3] Address comment

---
 .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1d1dacd69d1fc..b6f93c04b6f41 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -2469,7 +2469,6 @@ abstract class CSVSuite
       Seq(line).toDF.write.text(path.getAbsolutePath)
       assert(spark.read.format("csv")
         .option("delimiter", "|")
-        .option("inferSchema", "true")
.option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1) } } From 6186f10ecbc4116835b9fcffbb59bd5af1be4ac2 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Wed, 17 Mar 2021 16:35:33 +0900 Subject: [PATCH 3/3] Address comments --- .../sql/execution/datasources/csv/CSVSuite.scala | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index b6f93c04b6f41..fd25a79619d24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2454,17 +2454,8 @@ abstract class CSVSuite } test("SPARK-34768: counting a long record with ignoreTrailingWhiteSpace set to true") { - val line = "XX |XXX-XXXX |XXXXXX " + - "|XXXXXXXX|XXXXX |XXXXXX " + - "|X|XXXXXXX|XXXXXXXX|XXXX|XXXXXXXXXXXXXXX |XXXXXXXXXXX" + - "|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX|XXXXXX " + - "|XXXXXXXXXXXXXX|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX" + - "|XXXXXX |XXXXXXXXXXXXXXXXXXXXXX|XXXXXX " + - "|XXXXXXXXX|XXXXXX |XXXXXXX| " + - "|| || " + - "|| ||XXXX-XX-XX XX:XX:XX.XXXXXXX" + - "||XXXXX.XXXXXXXXXXXXXXX|XXXXX.XXXXXXXXXXXXXX" + - "|XXXXX.XXXXXXXXXXXXXXX|X|XXXXXX |X" + val bufSize = 128 + val line = "X" * (bufSize - 1) + "| |" withTempPath { path => Seq(line).toDF.write.text(path.getAbsolutePath) assert(spark.read.format("csv")