
Commit 91f4750

Add test to check UTF-16 and UTF-32
1 parent 0d0addf commit 91f4750

3 files changed: +19 −18 lines

python/pyspark/sql/readwriter.py
Lines changed: 1 addition & 1 deletion

@@ -895,7 +895,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
             the quote character. If None is set, the default value is
             escape character when escape and quote characters are
             different, ``\0`` otherwise..
-        :param encoding: sets encoding used for encoding the file. If None is set, it
+        :param encoding: sets the encoding (charset) to be used on the csv file. If None is set, it
             uses the default value, ``UTF-8``.
 
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
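
For context, this docstring documents the same write-side option exercised by the new test below. A minimal usage sketch, in Scala for consistency with the rest of the commit (the output path is a placeholder, and `spark.implicits._` is assumed to be in scope):

// Minimal sketch: write a single-column DataFrame using a non-default charset.
import spark.implicits._

val df = Seq("µß áâä ÁÂÄ").toDF("_c0")
df.write
  .option("encoding", "UTF-16")   // charset used to encode the output files
  .csv("/tmp/csv-utf16")          // placeholder output directory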

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
Lines changed: 1 addition & 2 deletions

@@ -153,8 +153,7 @@ private[csv] class CsvOutputWriter(
   private val writer = CodecStreams.createOutputStreamWriter(
     context,
     new Path(path),
-    charset
-  )
+    charset)
 
   private val gen = new UnivocityGenerator(dataSchema, writer, params)
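
This is the point where the charset takes effect: in Spark's source at the time, CodecStreams.createOutputStreamWriter wraps the task's output stream in a java.io.OutputStreamWriter built with the given Charset. A hedged, Spark-free sketch of that JDK mechanism (path and content are placeholders):

// Standalone sketch of the underlying JDK behavior, not Spark's internal API:
// an OutputStreamWriter encodes characters to bytes with the supplied Charset.
import java.io.{FileOutputStream, OutputStreamWriter}
import java.nio.charset.Charset

val writer = new OutputStreamWriter(
  new FileOutputStream("/tmp/example.csv"),   // placeholder path
  Charset.forName("UTF-16"))                  // the JDK's UTF-16 emits a BOM first
writer.write("µß áâä ÁÂÄ\n")
writer.close()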

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
Lines changed: 17 additions & 15 deletions

@@ -18,7 +18,8 @@
 package org.apache.spark.sql.execution.datasources.csv
 
 import java.io.File
-import java.nio.charset.UnsupportedCharsetException
+import java.nio.charset.{Charset, UnsupportedCharsetException}
+import java.nio.file.Files
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale

@@ -513,24 +514,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   }
 
   test("Save csv with custom charset") {
-    Seq("iso-8859-1", "utf-8", "windows-1250").foreach { encoding =>
+
+    // scalastyle:off nonascii
+    val content = "µß áâä ÁÂÄ"
+    // scalastyle:on nonascii
+
+    Seq("iso-8859-1", "utf-8", "utf-16", "utf-32", "windows-1250").foreach { encoding =>
       withTempDir { dir =>
-        val csvDir = new File(dir, "csv").getCanonicalPath
-        // scalastyle:off
-        val originalDF = Seq("µß áâä ÁÂÄ").toDF("_c0")
-        // scalastyle:on
-        originalDF.write
-          .option("header", "false")
-          .option("encoding", encoding)
-          .csv(csvDir)
+        val csvDir = new File(dir, "csv")
 
-        val df = spark
-          .read
-          .option("header", "false")
+        val originalDF = Seq(content).toDF("_c0").repartition(1)
+        originalDF.write
           .option("encoding", encoding)
-          .csv(csvDir)
+          .csv(csvDir.getCanonicalPath)
 
-        checkAnswer(df, originalDF)
+        csvDir.listFiles().filter(_.getName.endsWith("csv")).foreach({ csvFile =>
+          val readback = Files.readAllBytes(csvFile.toPath)
+          val expected = (content + "\n").getBytes(Charset.forName(encoding))
+          assert(readback === expected)
+        })
       }
     }
   }
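
Two details of the rewritten test are worth noting: repartition(1) keeps the output to a single data file per run, and the assertion compares raw bytes rather than round-tripping through spark.read, so a charset bug cannot cancel itself out on the read side. A standalone sketch of the expected-bytes computation (the BOM assertion is an added observation, not part of the test):

// Sketch of the byte-level expectation. With the JDK's "utf-16" charset the
// encoder emits a big-endian byte order mark (FE FF) before the payload.
import java.nio.charset.Charset

val content = "µß áâä ÁÂÄ"
val expected = (content + "\n").getBytes(Charset.forName("utf-16"))
assert(expected(0) == 0xFE.toByte && expected(1) == 0xFF.toByte)  // BE BOM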
