diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml index b5a6641e2e7e2..8e366f7f029aa 100644 --- a/docs/_data/menu-ml.yaml +++ b/docs/_data/menu-ml.yaml @@ -1,5 +1,7 @@ - text: Basic statistics url: ml-statistics.html +- text: Data sources + url: ml-datasource - text: Pipelines url: ml-pipeline.html - text: Extracting, transforming and selecting features diff --git a/docs/ml-datasource.md b/docs/ml-datasource.md new file mode 100644 index 0000000000000..15083326240ac --- /dev/null +++ b/docs/ml-datasource.md @@ -0,0 +1,108 @@ +--- +layout: global +title: Data sources +displayTitle: Data sources +--- + +In this section, we introduce how to use data source in ML to load data. +Beside some general data sources such as Parquet, CSV, JSON and JDBC, we also provide some specific data sources for ML. + +**Table of Contents** + +* This will become a table of contents (this text will be scraped). +{:toc} + +## Image data source + +This image data source is used to load image files from a directory, it can load compressed image (jpeg, png, etc.) into raw image representation via `ImageIO` in Java library. +The loaded DataFrame has one `StructType` column: "image", containing image data stored as image schema. +The schema of the `image` column is: + - origin: `StringType` (represents the file path of the image) + - height: `IntegerType` (height of the image) + - width: `IntegerType` (width of the image) + - nChannels: `IntegerType` (number of image channels) + - mode: `IntegerType` (OpenCV-compatible type) + - data: `BinaryType` (Image bytes in OpenCV-compatible order: row-wise BGR in most cases) + + +
+
+[`ImageDataSource`](api/scala/index.html#org.apache.spark.ml.source.image.ImageDataSource) +implements a Spark SQL data source API for loading image data as a DataFrame. + +{% highlight scala %} +scala> val df = spark.read.format("image").option("dropInvalid", true).load("data/mllib/images/origin/kittens") +df: org.apache.spark.sql.DataFrame = [image: struct] + +scala> df.select("image.origin", "image.width", "image.height").show(truncate=false) ++-----------------------------------------------------------------------+-----+------+ +|origin |width|height| ++-----------------------------------------------------------------------+-----+------+ +|file:///spark/data/mllib/images/origin/kittens/54893.jpg |300 |311 | +|file:///spark/data/mllib/images/origin/kittens/DP802813.jpg |199 |313 | +|file:///spark/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg |300 |200 | +|file:///spark/data/mllib/images/origin/kittens/DP153539.jpg |300 |296 | ++-----------------------------------------------------------------------+-----+------+ +{% endhighlight %} +
+ +
+[`ImageDataSource`](api/java/org/apache/spark/ml/source/image/ImageDataSource.html) +implements Spark SQL data source API for loading image data as DataFrame. + +{% highlight java %} +Dataset imagesDF = spark.read().format("image").option("dropInvalid", true).load("data/mllib/images/origin/kittens"); +imageDF.select("image.origin", "image.width", "image.height").show(false); +/* +Will output: ++-----------------------------------------------------------------------+-----+------+ +|origin |width|height| ++-----------------------------------------------------------------------+-----+------+ +|file:///spark/data/mllib/images/origin/kittens/54893.jpg |300 |311 | +|file:///spark/data/mllib/images/origin/kittens/DP802813.jpg |199 |313 | +|file:///spark/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg |300 |200 | +|file:///spark/data/mllib/images/origin/kittens/DP153539.jpg |300 |296 | ++-----------------------------------------------------------------------+-----+------+ +*/ +{% endhighlight %} +
+ +
+In PySpark we provide Spark SQL data source API for loading image data as DataFrame. + +{% highlight python %} +>>> df = spark.read.format("image").option("dropInvalid", true).load("data/mllib/images/origin/kittens") +>>> df.select("image.origin", "image.width", "image.height").show(truncate=False) ++-----------------------------------------------------------------------+-----+------+ +|origin |width|height| ++-----------------------------------------------------------------------+-----+------+ +|file:///spark/data/mllib/images/origin/kittens/54893.jpg |300 |311 | +|file:///spark/data/mllib/images/origin/kittens/DP802813.jpg |199 |313 | +|file:///spark/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg |300 |200 | +|file:///spark/data/mllib/images/origin/kittens/DP153539.jpg |300 |296 | ++-----------------------------------------------------------------------+-----+------+ +{% endhighlight %} +
+ +
+In SparkR we provide Spark SQL data source API for loading image data as DataFrame. + +{% highlight r %} +> df = read.df("data/mllib/images/origin/kittens", "image") +> head(select(df, df$image.origin, df$image.width, df$image.height)) + +1 file:///spark/data/mllib/images/origin/kittens/54893.jpg +2 file:///spark/data/mllib/images/origin/kittens/DP802813.jpg +3 file:///spark/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg +4 file:///spark/data/mllib/images/origin/kittens/DP153539.jpg + width height +1 300 311 +2 199 313 +3 300 200 +4 300 296 + +{% endhighlight %} +
+ + +
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala index a111c95248cf5..d4d74082dc8c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala @@ -19,14 +19,17 @@ package org.apache.spark.ml.source.image /** * `image` package implements Spark SQL data source API for loading image data as `DataFrame`. - * The loaded `DataFrame` has one `StructType` column: `image`. + * It can load compressed image (jpeg, png, etc.) into raw image representation via `ImageIO` + * in Java library. + * The loaded `DataFrame` has one `StructType` column: `image`, containing image data stored + * as image schema. * The schema of the `image` column is: - * - origin: String (represents the file path of the image) - * - height: Int (height of the image) - * - width: Int (width of the image) - * - nChannels: Int (number of the image channels) - * - mode: Int (OpenCV-compatible type) - * - data: BinaryType (Image bytes in OpenCV-compatible order: row-wise BGR in most cases) + * - origin: `StringType` (represents the file path of the image) + * - height: `IntegerType` (height of the image) + * - width: `IntegerType` (width of the image) + * - nChannels: `IntegerType` (number of image channels) + * - mode: `IntegerType` (OpenCV-compatible type) + * - data: `BinaryType` (Image bytes in OpenCV-compatible order: row-wise BGR in most cases) * * To use image data source, you need to set "image" as the format in `DataFrameReader` and * optionally specify the data source options, for example: