
Commit 91f4750

Add test to check UTF-16 and UTF-32
1 parent 0d0addf commit 91f4750

3 files changed: +19 −18 lines

python/pyspark/sql/readwriter.py
Lines changed: 1 addition & 1 deletion

@@ -895,7 +895,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
             the quote character. If None is set, the default value is
             escape character when escape and quote characters are
             different, ``\0`` otherwise..
-        :param encoding: sets encoding used for encoding the file. If None is set, it
+        :param encoding: sets the encoding (charset) to be used on the csv file. If None is set, it
             uses the default value, ``UTF-8``.
 
         >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
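
For context, this docstring documents the same write-side option exercised by the new test below. A minimal usage sketch, in Scala for consistency with the rest of the commit (the output path is a placeholder, and `spark.implicits._` is assumed to be in scope):

// Minimal sketch: write a single-column DataFrame using a non-default charset.
import spark.implicits._

val df = Seq("µß áâä ÁÂÄ").toDF("_c0")
df.write
  .option("encoding", "UTF-16")   // charset used to encode the output files
  .csv("/tmp/csv-utf16")          // placeholder output directory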

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
Lines changed: 1 addition & 2 deletions

@@ -153,8 +153,7 @@ private[csv] class CsvOutputWriter(
   private val writer = CodecStreams.createOutputStreamWriter(
     context,
     new Path(path),
-    charset
-  )
+    charset)
 
   private val gen = new UnivocityGenerator(dataSchema, writer, params)
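
This is the point where the charset takes effect: in Spark's source at the time, CodecStreams.createOutputStreamWriter wraps the task's output stream in a java.io.OutputStreamWriter built with the given Charset. A hedged, Spark-free sketch of that JDK mechanism (path and content are placeholders):

// Standalone sketch of the underlying JDK behavior, not Spark's internal API:
// an OutputStreamWriter encodes characters to bytes with the supplied Charset.
import java.io.{FileOutputStream, OutputStreamWriter}
import java.nio.charset.Charset

val writer = new OutputStreamWriter(
  new FileOutputStream("/tmp/example.csv"),   // placeholder path
  Charset.forName("UTF-16"))                  // the JDK's UTF-16 emits a BOM first
writer.write("µß áâä ÁÂÄ\n")
writer.close()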

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
Lines changed: 17 additions & 15 deletions

@@ -18,7 +18,8 @@
 package org.apache.spark.sql.execution.datasources.csv
 
 import java.io.File
-import java.nio.charset.UnsupportedCharsetException
+import java.nio.charset.{Charset, UnsupportedCharsetException}
+import java.nio.file.Files
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale

@@ -513,24 +514,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   }
 
   test("Save csv with custom charset") {
-    Seq("iso-8859-1", "utf-8", "windows-1250").foreach { encoding =>
+
+    // scalastyle:off nonascii
+    val content = "µß áâä ÁÂÄ"
+    // scalastyle:on nonascii
+
+    Seq("iso-8859-1", "utf-8", "utf-16", "utf-32", "windows-1250").foreach { encoding =>
       withTempDir { dir =>
-        val csvDir = new File(dir, "csv").getCanonicalPath
-        // scalastyle:off
-        val originalDF = Seq("µß áâä ÁÂÄ").toDF("_c0")
-        // scalastyle:on
-        originalDF.write
-          .option("header", "false")
-          .option("encoding", encoding)
-          .csv(csvDir)
+        val csvDir = new File(dir, "csv")
 
-        val df = spark
-          .read
-          .option("header", "false")
+        val originalDF = Seq(content).toDF("_c0").repartition(1)
+        originalDF.write
           .option("encoding", encoding)
-          .csv(csvDir)
+          .csv(csvDir.getCanonicalPath)
 
-        checkAnswer(df, originalDF)
+        csvDir.listFiles().filter(_.getName.endsWith("csv")).foreach({ csvFile =>
+          val readback = Files.readAllBytes(csvFile.toPath)
+          val expected = (content + "\n").getBytes(Charset.forName(encoding))
+          assert(readback === expected)
+        })
       }
     }
   }
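
Two details of the rewritten test are worth noting: repartition(1) keeps the output to a single data file per run, and the assertion compares raw bytes rather than round-tripping through spark.read, so a charset bug cannot cancel itself out on the read side. A standalone sketch of the expected-bytes computation (the BOM assertion is an added observation, not part of the test):

// Sketch of the byte-level expectation. With the JDK's "utf-16" charset the
// encoder emits a big-endian byte order mark (FE FF) before the payload.
import java.nio.charset.Charset

val content = "µß áâä ÁÂÄ"
val expected = (content + "\n").getBytes(Charset.forName("utf-16"))
assert(expected(0) == 0xFE.toByte && expected(1) == 0xFF.toByte)  // BE BOM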
