From 2420be1ba077971740b896b399dfeb874d929c5e Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 13 Apr 2021 15:41:25 +0900 Subject: [PATCH 1/4] Add an internal option to control input buffer in univocity --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index c6a80611ea7c4..58b17570c2b17 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -211,6 +211,8 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator + val inputBuffer: Option[Int] = parameters.get("inputBuffer").map(_.toInt) + /** * The handling method to be used when unescaped quotes are found in the input. */ @@ -257,6 +259,7 @@ class CSVOptions( settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) settings.setReadInputOnSeparateThread(false) + inputBuffer.foreach(settings.setInputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead) From 8ef9121c0fe5ad3b3067aaf425b5f6048ed7b73b Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 13 Apr 2021 15:55:24 +0900 Subject: [PATCH 2/4] Update sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 58b17570c2b17..c10291381f6b1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -211,7 +211,7 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator - val inputBuffer: Option[Int] = parameters.get("inputBuffer").map(_.toInt) + val inputBufferSize: Option[Int] = parameters.get("inputBuffer").map(_.toInt) /** * The handling method to be used when unescaped quotes are found in the input. From 0ac48e6c77c909213c670bdbea3f95d55dbf05fe Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 13 Apr 2021 15:55:29 +0900 Subject: [PATCH 3/4] Update sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index c10291381f6b1..be628adc0311a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -259,7 +259,7 @@ class CSVOptions( settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) settings.setReadInputOnSeparateThread(false) - inputBuffer.foreach(settings.setInputBufferSize) + inputBufferSize.foreach(settings.setInputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead) From f1f92fb33636ac4edf9c4d3a72beeaa3cdfc0637 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 13 Apr 2021 15:56:04 +0900 Subject: [PATCH 4/4] Update sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index be628adc0311a..2e5539a90c65d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -211,7 +211,7 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator - val inputBufferSize: Option[Int] = parameters.get("inputBuffer").map(_.toInt) + val inputBufferSize: Option[Int] = parameters.get("inputBufferSize").map(_.toInt) /** * The handling method to be used when unescaped quotes are found in the input.