Skip to content

Commit 218ce4c

Browse files
committed
address comments
1 parent 3c8863c commit 218ce4c

File tree

5 files changed

+23
-19
lines changed

5 files changed

+23
-19
lines changed

mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,32 @@
1818
package org.apache.spark.ml.source.image
1919

2020
/**
21-
* `image` package implements Spark SQL data source API for loading IMAGE data as `DataFrame`.
21+
* `image` package implements Spark SQL data source API for loading image data as `DataFrame`.
2222
* The loaded `DataFrame` has one `StructType` column: `image`.
2323
* The schema of the `image` column is:
24-
* - origin: String (represents the origin of the image.
25-
* If loaded from files, then it is the file path)
24+
* - origin: String (represents the file path of the image)
2625
* - height: Int (height of the image)
2726
* - width: Int (width of the image)
2827
* - nChannels: Int (number of the image channels)
2928
* - mode: Int (OpenCV-compatible type)
3029
* - data: BinaryType (Image bytes in OpenCV-compatible order: row-wise BGR in most cases)
3130
*
32-
* To use IMAGE data source, you need to set "image" as the format in `DataFrameReader` and
31+
* To use image data source, you need to set "image" as the format in `DataFrameReader` and
3332
* optionally specify the data source options, for example:
3433
* {{{
3534
* // Scala
3635
* val df = spark.read.format("image")
37-
* .option("dropImageFailures", true)
36+
* .option("dropInvalid", true)
3837
* .load("data/mllib/images/partitioned")
3938
*
4039
* // Java
4140
* Dataset<Row> df = spark.read().format("image")
42-
* .option("dropImageFailures", true)
41+
* .option("dropInvalid", true)
4342
* .load("data/mllib/images/partitioned");
4443
* }}}
4544
*
46-
* IMAGE data source supports the following options:
47-
* - "dropImageFailures": Whether to drop the files that are not valid images from the result.
45+
* Image data source supports the following options:
46+
* - "dropInvalid": Whether to drop the files that are not valid images from the result.
4847
*
4948
 * @note This image data source does not support saving images to files.
5049
*

mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ private[image] class ImageFileFormat extends FileFormat with DataSourceRegister
6969

7070
(file: PartitionedFile) => {
7171
val emptyUnsafeRow = new UnsafeRow(0)
72-
if (!imageSourceOptions.dropImageFailures && requiredSchema.isEmpty) {
72+
if (!imageSourceOptions.dropInvalid && requiredSchema.isEmpty) {
7373
Iterator(emptyUnsafeRow)
7474
} else {
7575
val origin = file.filePath
@@ -82,7 +82,7 @@ private[image] class ImageFileFormat extends FileFormat with DataSourceRegister
8282
Closeables.close(stream, true)
8383
}
8484
val resultOpt = ImageSchema.decode(origin, bytes)
85-
val filteredResult = if (imageSourceOptions.dropImageFailures) {
85+
val filteredResult = if (imageSourceOptions.dropInvalid) {
8686
resultOpt.toIterator
8787
} else {
8888
Iterator(resultOpt.getOrElse(ImageSchema.invalidImageRow(origin)))

mllib/src/main/scala/org/apache/spark/ml/source/image/ImageOptions.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,9 @@ private[image] class ImageOptions(
2424

2525
def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
2626

27-
val dropImageFailures = parameters.getOrElse("dropImageFailures", "false").toBoolean
27+
/**
28+
* Whether to drop invalid images. If true, invalid images will be removed, otherwise
29+
 * invalid images will be returned with empty data and all other fields filled with `-1`.
30+
*/
31+
val dropInvalid = parameters.getOrElse("dropInvalid", "false").toBoolean
2832
}

mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
3434
val df1 = spark.read.format("image").load(imagePath)
3535
assert(df1.count === 9)
3636

37-
val df2 = spark.read.format("image").option("dropImageFailures", true).load(imagePath)
37+
val df2 = spark.read.format("image").option("dropInvalid", true).load(imagePath)
3838
assert(df2.count === 8)
3939
}
4040

@@ -50,11 +50,11 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
5050

5151
test("image datasource test: read non image") {
5252
val filePath = imagePath + "/cls=kittens/date=2018-01/not-image.txt"
53-
val df = spark.read.format("image").option("dropImageFailures", true)
53+
val df = spark.read.format("image").option("dropInvalid", true)
5454
.load(filePath)
5555
assert(df.count() === 0)
5656

57-
val df2 = spark.read.format("image").option("dropImageFailures", false)
57+
val df2 = spark.read.format("image").option("dropInvalid", false)
5858
.load(filePath)
5959
assert(df2.count() === 1)
6060
val result = df2.head()
@@ -64,7 +64,7 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
6464

6565
test("image datasource partition test") {
6666
val result = spark.read.format("image")
67-
.option("dropImageFailures", true).load(imagePath)
67+
.option("dropInvalid", true).load(imagePath)
6868
.select(substring_index(col("image.origin"), "/", -1).as("origin"), col("cls"), col("date"))
6969
.collect()
7070

@@ -82,15 +82,16 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
8282

8383
// Images with the different number of channels
8484
test("readImages pixel values test") {
85-
val images = spark.read.format("image").option("dropImageFailures", true)
85+
val images = spark.read.format("image").option("dropInvalid", true)
8686
.load(imagePath + "/cls=multichannel/").collect()
8787

8888
val firstBytes20Set = images.map { rrow =>
8989
val row = rrow.getAs[Row]("image")
9090
val filename = Paths.get(getOrigin(row)).getFileName().toString()
9191
val mode = getMode(row)
9292
val bytes20 = getData(row).slice(0, 20).toList
93-
filename -> Tuple2(mode, bytes20)
93+
filename -> Tuple2(mode, bytes20) // Cannot remove `Tuple2`, otherwise `->` operator
94+
// will match 2 arguments
9495
}.toSet
9596

9697
assert(firstBytes20Set === expectedFirstBytes20Set)

python/pyspark/ml/tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2186,7 +2186,7 @@ def tearDown(self):
21862186
class ImageReaderTest(SparkSessionTestCase):
21872187

21882188
def test_read_images(self):
2189-
data_path = 'data/mllib/images/kittens'
2189+
data_path = 'data/mllib/images/origin/kittens'
21902190
df = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
21912191
self.assertEqual(df.count(), 4)
21922192
first_row = df.take(1)[0][0]
@@ -2253,7 +2253,7 @@ def tearDownClass(cls):
22532253
def test_read_images_multiple_times(self):
22542254
# This test case is to check if `ImageSchema.readImages` tries to
22552255
# initiate Hive client multiple times. See SPARK-22651.
2256-
data_path = 'data/mllib/images/kittens'
2256+
data_path = 'data/mllib/images/origin/kittens'
22572257
ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
22582258
ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True)
22592259

0 commit comments

Comments
 (0)