From ee04aadf9dcca349f5045faef8973fccb964b511 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sun, 18 Sep 2016 18:06:27 +0900
Subject: [PATCH 1/2] Remove unused rowSeparator variable and set
 auto-expanding buffer as default for maxCharsPerColumn option in CSV

---
 dev/deps/spark-deps-hadoop-2.2 | 2 +-
 dev/deps/spark-deps-hadoop-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-2.4 | 2 +-
 dev/deps/spark-deps-hadoop-2.6 | 2 +-
 dev/deps/spark-deps-hadoop-2.7 | 2 +-
 sql/core/pom.xml | 2 +-
 .../spark/sql/execution/datasources/csv/CSVOptions.scala | 4 +---
 .../spark/sql/execution/datasources/csv/CSVParser.scala | 2 --
 8 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index a7259e25bfec6..f4f92c6d20c23 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -159,7 +159,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.1.jar
+univocity-parsers-2.2.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 6986ab572b947..3db013f1a7585 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.1.jar
+univocity-parsers-2.2.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 75cccb352b9cf..71710109a16ac 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.1.jar
+univocity-parsers-2.2.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index ef7b8a7d8da26..cb30fda253c0a 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -175,7 +175,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.1.jar
+univocity-parsers-2.2.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 63566125373dd..9008aa80bc877 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -176,7 +176,7 @@ stax-api-1.0.1.jar
 stream-2.7.0.jar
 stringtemplate-3.2.1.jar
 super-csv-2.2.0.jar
-univocity-parsers-2.1.1.jar
+univocity-parsers-2.2.1.jar
 validation-api-1.1.0.Final.jar
 xbean-asm5-shaded-4.4.jar
 xercesImpl-2.9.1.jar
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index b2752638bebd5..84de1d4a6e2d1 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -38,7 +38,7 @@
     <dependency>
      <groupId>com.univocity</groupId>
      <artifactId>univocity-parsers</artifactId>
-     <version>2.1.1</version>
+     <version>2.2.1</version>
      <type>jar</type>
    </dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 364d7c831eb44..e7dcc22272192 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -112,7 +112,7 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str
 
   val maxColumns = getInt("maxColumns", 20480)
 
-  val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000)
+  val maxCharsPerColumn = getInt("maxCharsPerColumn", -1)
 
   val escapeQuotes = getBool("escapeQuotes", true)
 
@@ -123,8 +123,6 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'
-
-  val rowSeparator = "\n"
 }
 
 object CSVOptions {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index 64bdd6f4643dc..332f5c8e9fb74 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -34,7 +34,6 @@ private[csv] class CsvReader(params: CSVOptions) {
   val settings = new CsvParserSettings()
   val format = settings.getFormat
   format.setDelimiter(params.delimiter)
-  format.setLineSeparator(params.rowSeparator)
   format.setQuote(params.quote)
   format.setQuoteEscape(params.escape)
   format.setComment(params.comment)
@@ -70,7 +69,6 @@ private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
   private val format = writerSettings.getFormat
 
   format.setDelimiter(params.delimiter)
-  format.setLineSeparator(params.rowSeparator)
   format.setQuote(params.quote)
   format.setQuoteEscape(params.escape)
   format.setComment(params.comment)
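Background for the new default: in univocity-parsers 2.2.x, passing -1 to
setMaxCharsPerColumn switches the parser from a fixed per-value cap to an
internal buffer that grows on demand, which is what the new
getInt("maxCharsPerColumn", -1) default selects. A minimal standalone sketch
of that parser-level behavior, assuming only univocity-parsers 2.2.1 on the
classpath (the object name and sample input below are illustrative, not part
of this patch):

import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}

object MaxCharsPerColumnDemo {
  def main(args: Array[String]): Unit = {
    val settings = new CsvParserSettings()
    // -1 asks univocity to auto-expand the value buffer instead of
    // enforcing a fixed per-column character limit.
    settings.setMaxCharsPerColumn(-1)

    val parser = new CsvParser(settings)
    // A value well past the old 1,000,000-char default: with a fixed cap
    // this would fail with a TextParsingException; with -1 the buffer grows.
    val longValue = "x" * 2000000
    val row = parser.parseLine(s"id,$longValue")
    println(row(1).length)  // 2000000
  }
}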
getInt("maxColumns", 20480) - val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000) + val maxCharsPerColumn = getInt("maxCharsPerColumn", -1) val escapeQuotes = getBool("escapeQuotes", true) @@ -123,8 +123,6 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' - - val rowSeparator = "\n" } object CSVOptions { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index 64bdd6f4643dc..332f5c8e9fb74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -34,7 +34,6 @@ private[csv] class CsvReader(params: CSVOptions) { val settings = new CsvParserSettings() val format = settings.getFormat format.setDelimiter(params.delimiter) - format.setLineSeparator(params.rowSeparator) format.setQuote(params.quote) format.setQuoteEscape(params.escape) format.setComment(params.comment) @@ -70,7 +69,6 @@ private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten private val format = writerSettings.getFormat format.setDelimiter(params.delimiter) - format.setLineSeparator(params.rowSeparator) format.setQuote(params.quote) format.setQuoteEscape(params.escape) format.setComment(params.comment) From 70cb567eb0980517b0c31d8c0e9a6e0cd6166998 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 18 Sep 2016 20:06:18 +0900 Subject: [PATCH 2/2] Adds the comment for default value --- python/pyspark/sql/readwriter.py | 2 +- python/pyspark/sql/streaming.py | 2 +- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../org/apache/spark/sql/streaming/DataStreamReader.scala | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 3d79e0ccccee4..771daf4e007ab 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -348,7 +348,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non set, it uses the default value, ``20480``. :param maxCharsPerColumn: defines the maximum number of characters allowed for any given value being read. If None is set, it uses the default value, - ``1000000``. + ``-1`` meaning unlimited length. :param maxMalformedLogPerPartition: sets the maximum number of malformed rows Spark will log for each partition. Malformed records beyond this number will be ignored. If None is set, it diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 67375f6b5f942..8231321e117e8 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -516,7 +516,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non set, it uses the default value, ``20480``. :param maxCharsPerColumn: defines the maximum number of characters allowed for any given value being read. If None is set, it uses the default value, - ``1000000``. + ``-1`` meaning unlimited length. :param mode: allows a mode for dealing with corrupt records during parsing. If None is set, it uses the default value, ``PERMISSIVE``. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index d29d90ce40453..1cb3612041421 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -391,8 +391,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * `java.sql.Timestamp.valueOf()` and `java.sql.Date.valueOf()` or ISO 8601 format.</li>
    * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
    * a record can have.</li>
-   * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed
-   * for any given value being read.</li>
+   * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
+   * for any given value being read. By default, it is `-1`, meaning unlimited length.</li>
    * <li>`maxMalformedLogPerPartition` (default `10`): sets the maximum number of malformed rows
    * Spark will log for each partition. Malformed records beyond this number will be ignored.</li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index c25f71af7362a..f5ae80a0fae3d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -246,8 +246,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
    * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
    * a record can have.</li>
-   * <li>`maxCharsPerColumn` (default `1000000`): defines the maximum number of characters allowed
-   * for any given value being read.</li>
+   * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
+   * for any given value being read. By default, it is `-1`, meaning unlimited length.</li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
    * during parsing.
    * <ul>
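At the user-facing API this series only changes the documented default; callers
can still opt back into a hard limit. A short usage sketch, assuming an active
SparkSession bound to `spark` (the input path below is hypothetical):

val df = spark.read
  .option("header", "true")
  .option("maxCharsPerColumn", "10000")  // restore a fixed per-value cap if desired
  .csv("/path/to/wide_values.csv")       // hypothetical path
df.show()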