diff --git a/README.md b/README.md
index 2ec1d6aa..95b3e7d5 100644
--- a/README.md
+++ b/README.md
@@ -10,23 +10,28 @@ The structure and test tools are mostly copied from [CSV Data Source for Spark](
 
 ## Requirements
 
-This library requires Spark 1.3+
+This library requires Spark 2.0+ for 0.4.x.
+
+For a version that works with Spark 1.x, please check [branch-0.3](https://github.com/databricks/spark-xml/tree/branch-0.3).
 
 ## Linking
 You can link against this library in your program at the following coordinates:
 
 ### Scala 2.10
+
 ```
 groupId: com.databricks
 artifactId: spark-xml_2.10
-version: 0.3.4
+version: 0.4.0
 ```
+
 ### Scala 2.11
+
 ```
 groupId: com.databricks
 artifactId: spark-xml_2.11
-version: 0.3.4
+version: 0.4.0
 ```
 
 ## Using with Spark shell
@@ -34,12 +39,12 @@ This package can be added to Spark using the `--packages` command line option.
 
 ### Spark compiled with Scala 2.10
 ```
-$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.10:0.3.4
+$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.10:0.4.0
 ```
 
 ### Spark compiled with Scala 2.11
 ```
-$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.11:0.3.4
+$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.11:0.4.0
 ```
 
 ## Features
@@ -173,7 +178,6 @@ OPTIONS (path "books.xml", rowTag "book")
 ```
 
 ### Scala API
-__Spark 1.4+:__
 
 ```scala
 import org.apache.spark.sql.SQLContext
@@ -222,50 +226,7 @@ selectedData.write
     .save("newbooks.xml")
 ```
 
-__Spark 1.3:__
-
-```scala
-import org.apache.spark.sql.SQLContext
-
-val sqlContext = new SQLContext(sc)
-val df = sqlContext.load(
-    "com.databricks.spark.xml",
-    Map("path" -> "books.xml", "rowTag" -> "book"))
-
-val selectedData = df.select("author", "_id")
-selectedData.save("com.databricks.spark.xml",
-    SaveMode.ErrorIfExists,
-    Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
-```
-
-You can manually specify the schema when reading data:
-```scala
-import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType};
-
-val sqlContext = new SQLContext(sc)
-val customSchema = StructType(Array(
-    StructField("_id", StringType, nullable = true),
-    StructField("author", StringType, nullable = true),
-    StructField("description", StringType, nullable = true),
-    StructField("genre", StringType ,nullable = true),
-    StructField("price", DoubleType, nullable = true),
-    StructField("publish_date", StringType, nullable = true),
-    StructField("title", StringType, nullable = true)))
-
-val df = sqlContext.load(
-    "com.databricks.spark.xml",
-    schema = customSchema,
-    Map("path" -> "books.xml", "rowTag" -> "book"))
-
-val selectedData = df.select("author", "_id")
-selectedData.save("com.databricks.spark.xml",
-    SaveMode.ErrorIfExists,
-    Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
-```
-
 ### Java API
-__Spark 1.4+:__
 
 ```java
 import org.apache.spark.sql.SQLContext
@@ -312,58 +273,8 @@ df.select("author", "_id").write()
     .save("newbooks.xml");
 ```
 
-
-
-__Spark 1.3:__
-
-```java
-import org.apache.spark.sql.SQLContext
-
-SQLContext sqlContext = new SQLContext(sc);
-
-HashMap options = new HashMap();
-options.put("rowTag", "book");
-options.put("path", "books.xml");
-DataFrame df = sqlContext.load("com.databricks.spark.xml", options);
-
-HashMap options = new HashMap();
-options.put("rowTag", "book");
-options.put("rootTag", "books");
-options.put("path", "newbooks.xml");
-df.select("author", "_id").save("com.databricks.spark.xml",
-        SaveMode.ErrorIfExists, options)
-```
-
-You can manually specify schema:
-```java
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.*;
-
-SQLContext sqlContext = new SQLContext(sc);
-StructType customSchema = new StructType(new StructField[] {
-    new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
-    new StructField("author", DataTypes.StringType, true, Metadata.empty()),
-    new StructField("description", DataTypes.StringType, true, Metadata.empty()),
-    new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
-    new StructField("price", DataTypes.DoubleType, true, Metadata.empty()),
-    new StructField("publish_date", DataTypes.StringType, true, Metadata.empty()),
-    new StructField("title", DataTypes.StringType, true, Metadata.empty())
-});
-
-HashMap options = new HashMap();
-options.put("rowTag", "book");
-options.put("path", "books.xml");
-DataFrame df = sqlContext.load("com.databricks.spark.xml", customSchema, options);
-
-HashMap options = new HashMap();
-options.put("rowTag", "book");
-options.put("rootTag", "books");
-options.put("path", "newbooks.xml");
-df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
-```
-
 ### Python API
-__Spark 1.4+:__
 
 ```python
 from pyspark.sql import SQLContext
@@ -402,45 +313,13 @@ df.select("author", "_id").write \
     .save('newbooks.xml')
 ```
 
-
-__Spark 1.3:__
-
-```python
-from pyspark.sql import SQLContext
-sqlContext = SQLContext(sc)
-
-df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', path = 'books.xml')
-df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
-```
-
-You can manually specify schema:
-```python
-from pyspark.sql import SQLContext
-from pyspark.sql.types import *
-
-sqlContext = SQLContext(sc)
-customSchema = StructType([ \
-    StructField("_id", StringType(), True), \
-    StructField("author", StringType(), True), \
-    StructField("description", StringType(), True), \
-    StructField("genre", StringType(), True), \
-    StructField("price", DoubleType(), True), \
-    StructField("publish_date", StringType(), True), \
-    StructField("title", StringType(), True)])
-
-df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', schema = customSchema, path = 'books.xml')
-df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
-```
-
-
 ### R API
-__Spark 1.4+:__
 
 Automatically infer schema (data types)
 ```R
 library(SparkR)
 
-Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-xml_2.10:0.3.4" "sparkr-shell"')
+Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-xml_2.10:0.4.0" "sparkr-shell"')
 sqlContext <- sparkRSQL.init(sc)
 
 df <- read.df(sqlContext, "books.xml", source = "com.databricks.spark.xml", rowTag = "book")
@@ -453,7 +332,7 @@ You can manually specify schema:
 ```R
 library(SparkR)
 
-Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:0.3.4" "sparkr-shell"')
+Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-xml_2.10:0.4.0" "sparkr-shell"')
 sqlContext <- sparkRSQL.init(sc)
 customSchema <- structType(
   structField("_id", "string"),
diff --git a/build.sbt b/build.sbt
index 5a8c5e51..6d42644c 100755
--- a/build.sbt
+++ b/build.sbt
@@ -1,6 +1,6 @@
 name := "spark-xml"
 
-version := "0.4.0-SNAPSHOT"
+version := "0.4.0"
 
 organization := "com.databricks"
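With the 0.4.x line building against Spark 2.0+, the README's reads and writes can also be driven from the Spark 2.x `SparkSession` entry point rather than `SQLContext`. A minimal Scala sketch, assuming the same `books.xml` input and `book` row tag used in the examples above:

```scala
import org.apache.spark.sql.SparkSession

// Spark 2.x entry point; stands in for the SQLContext used in the README examples
val spark = SparkSession.builder()
  .appName("spark-xml-example")
  .getOrCreate()

// Read each <book> element of books.xml as one row
val df = spark.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .load("books.xml")

// Write selected columns back out, wrapping the rows in a <books> root element
df.select("author", "_id").write
  .format("com.databricks.spark.xml")
  .option("rootTag", "books")
  .option("rowTag", "book")
  .save("newbooks.xml")
```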