Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ class CSVOptions(

val inputBufferSize = 128

/**
* Maximum length of the error content included in CSV parser/writer exception messages.
* This value is passed to `setErrorContentLength` on both the writer settings and the
* parser settings built by this class.
*/
val maxErrorContentLength = 1000

// True when `comment` is anything other than the NUL character (code point 0).
val isCommentSet = this.comment != '\u0000'

val samplingRatio =
Expand Down Expand Up @@ -220,6 +225,7 @@ class CSVOptions(
writerSettings.setSkipEmptyLines(true)
writerSettings.setQuoteAllFields(quoteAll)
writerSettings.setQuoteEscapingEnabled(escapeQuotes)
writerSettings.setErrorContentLength(maxErrorContentLength)
writerSettings
}

Expand All @@ -246,6 +252,7 @@ class CSVOptions(
lineSeparatorInRead.foreach { _ =>
settings.setNormalizeLineEndingsWithinQuotes(!multiLine)
}
settings.setErrorContentLength(maxErrorContentLength)

settings
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.csv

import java.io.{ByteArrayOutputStream, EOFException, File, FileOutputStream}
import java.nio.charset.{Charset, StandardCharsets, UnsupportedCharsetException}
import java.nio.file.Files
import java.nio.file.{Files, StandardOpenOption}
import java.sql.{Date, Timestamp}
import java.text.SimpleDateFormat
import java.util.Locale
Expand All @@ -28,6 +28,7 @@ import java.util.zip.GZIPOutputStream
import scala.collection.JavaConverters._
import scala.util.Properties

import com.univocity.parsers.common.TextParsingException
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.hadoop.io.SequenceFile.CompressionType
import org.apache.hadoop.io.compress.GzipCodec
Expand Down Expand Up @@ -2085,4 +2086,27 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
}
}
}

test("SPARK-28431: prevent CSV datasource throw TextParsingException with large size message") {
  withTempPath { tmp =>
    // A single value that is one character longer than the per-column limit set below.
    val columnLimit = 10000
    val oversized = "a" * (columnLimit + 1)
    val payload = oversized.getBytes(StandardCharsets.UTF_8)

    Files.write(tmp.toPath, payload, StandardOpenOption.CREATE, StandardOpenOption.WRITE)

    // Reading the file with the limit applied is expected to fail inside the CSV parser.
    val thrown = intercept[TextParsingException] {
      val reader = spark.read.option("maxCharsPerColumn", columnLimit)
      reader.csv(tmp.getAbsolutePath).count()
    }
    val message = thrown.getMessage

    // The test treats "..." in the message as the sign that the error content was truncated.
    assert(
      message.contains("..."),
      "expect the TextParsingException truncate the error content to be 1000 length.")
  }
}
}