diff --git a/README.md b/README.md
index 8a58fdce..ee347a1c 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,8 @@ When reading files the API accepts several options:
 * `excludeAttribute` : Whether you want to exclude attributes in elements or not. Default is false.
 * `treatEmptyValuesAsNulls` : Whether you want to treat whitespaces as a null value. Default is false.
 * `failFast` : Whether you want to fail when it fails to parse malformed rows in XML files, instead of dropping the rows. Default is false.
-* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
 * `charset`: Defaults to 'UTF-8' but can be set to other valid charset names

 When writing files the API accepts several options:
@@ -60,8 +60,8 @@ When writing files the API accepts several options:
 * `rowTag`: The row tag of your xml files to treat as a row. For example, in this xml `<books> <book><book> ...</books>`, the appropriate value would be `book`. Default is `ROW`.
 * `rootTag`: The root tag of your xml files to treat as the root. For example, in this xml `<books> <book><book> ...</books>`, the appropriate value would be `books`. Default is `ROWS`.
 * `nullValue`: The value to write `null` value. Default is string `null`. When this is `null`, it does not write attributes and elements for fields.
-* `attributePrefix`: The prefix for attributes so that we can differentiating attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
 * `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.

 Currently it supports the shortened name usage. You can use just `xml` instead of `com.databricks.spark.xml` from Spark 1.5.0+
@@ -87,7 +87,7 @@ Due to the structure differences between `DataFrame` and XML, there are some con

 ```
 root
- |-- @myOneAttrib: string (nullable = true)
+ |-- _myOneAttrib: string (nullable = true)
  |-- two: string (nullable = true)
  |-- three: string (nullable = true)
 ```
@@ -106,8 +106,8 @@ Due to the structure differences between `DataFrame` and XML, there are some con
 ```
 root
  |-- two: struct (nullable = true)
- |    |-- #VALUE: string (nullable = true)
- |    |-- @myTwoAttrib: string (nullable = true)
+ |    |-- _VALUE: string (nullable = true)
+ |    |-- _myTwoAttrib: string (nullable = true)
  |-- three: string (nullable = true)
 ```
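Note for readers skimming this diff: here is a minimal Scala sketch (not part of the change; `sc`, `books.xml`, and the `<book id="..."><price unit="...">...</price></book>` row shape are assumptions) of what the new defaults produce at read time:

```scala
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)

// With the new defaults, an attribute such as id="bk101" infers as `_id`
// (previously `@id`), and the text of an element that also carries
// attributes lands in `_VALUE` (previously `#VALUE`).
val df = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .load("books.xml")

df.printSchema()
// root
//  |-- _id: string (nullable = true)
//  |-- author: string (nullable = true)
//  |-- price: struct (nullable = true)
//  |    |-- _VALUE: string (nullable = true)
//  |    |-- _unit: string (nullable = true)
```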
@@ -163,7 +163,7 @@ OPTIONS (path "books.xml", rowTag "book")

 You can also specify column names and types in DDL. In this case, we do not infer schema.
 ```sql
-CREATE TABLE books (author string, description string, genre string, @id string, price double, publish_date string, title string)
+CREATE TABLE books (author string, description string, genre string, _id string, price double, publish_date string, title string)
 USING com.databricks.spark.xml
 OPTIONS (path "books.xml", rowTag "book")
 ```
@@ -180,7 +180,7 @@ val df = sqlContext.read
     .option("rowTag", "book")
     .load("books.xml")

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
 selectedData.write
     .format("com.databricks.spark.xml")
     .option("rootTag", "books")
@@ -195,7 +195,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, DoubleType}

 val sqlContext = new SQLContext(sc)
 val customSchema = StructType(Array(
-    StructField("@id", StringType, nullable = true),
+    StructField("_id", StringType, nullable = true),
     StructField("author", StringType, nullable = true),
     StructField("description", StringType, nullable = true),
     StructField("genre", StringType ,nullable = true),
@@ -210,7 +210,7 @@ val df = sqlContext.read
     .schema(customSchema)
     .load("books.xml")

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
 selectedData.write
     .format("com.databricks.spark.xml")
     .option("rootTag", "books")
@@ -228,7 +228,7 @@ val df = sqlContext.load(
     "com.databricks.spark.xml",
     Map("path" -> "books.xml", "rowTag" -> "book"))

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
 selectedData.save("com.databricks.spark.xml", SaveMode.ErrorIfExists,
     Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
@@ -241,7 +241,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

 val sqlContext = new SQLContext(sc)
 val customSchema = StructType(Array(
-    StructField("@id", StringType, nullable = true),
+    StructField("_id", StringType, nullable = true),
     StructField("author", StringType, nullable = true),
     StructField("description", StringType, nullable = true),
     StructField("genre", StringType ,nullable = true),
@@ -254,7 +254,7 @@ val df = sqlContext.load(
     schema = customSchema,
     Map("path" -> "books.xml", "rowTag" -> "book"))

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
 selectedData.save("com.databricks.spark.xml", SaveMode.ErrorIfExists,
     Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
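Because this change renames inferred columns, existing code that selected `@id` against the old defaults will stop resolving. A hedged sketch of pinning the previous behavior through the `attributePrefix` and `valueTag` options documented above (file and column names follow the README examples):

```scala
// Opt back into the pre-change defaults explicitly.
val legacyDf = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .option("attributePrefix", "@")  // old default prefix
  .option("valueTag", "#VALUE")    // old default value tag
  .load("books.xml")

// Existing selections keep working against the old column names.
val selectedData = legacyDf.select("author", "@id")
```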
@@ -272,7 +272,7 @@ DataFrame df = sqlContext.read()
     .option("rowTag", "book")
     .load("books.xml");

-df.select("author", "@id").write()
+df.select("author", "_id").write()
     .format("com.databricks.spark.xml")
     .option("rootTag", "books")
     .option("rowTag", "book")
@@ -286,7 +286,7 @@ import org.apache.spark.sql.types.*;

 SQLContext sqlContext = new SQLContext(sc);
 StructType customSchema = new StructType(new StructField[] {
-    new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
+    new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
     new StructField("author", DataTypes.StringType, true, Metadata.empty()),
     new StructField("description", DataTypes.StringType, true, Metadata.empty()),
     new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -301,7 +301,7 @@ DataFrame df = sqlContext.read()
     .schema(customSchema)
     .load("books.xml");

-df.select("author", "@id").write()
+df.select("author", "_id").write()
     .format("com.databricks.spark.xml")
     .option("rootTag", "books")
     .option("rowTag", "book")
@@ -326,7 +326,7 @@ HashMap options = new HashMap();
 options.put("rowTag", "book");
 options.put("rootTag", "books");
 options.put("path", "newbooks.xml");

-df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
+df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
 ```

 You can manually specify schema:
@@ -336,7 +336,7 @@ import org.apache.spark.sql.types.*;

 SQLContext sqlContext = new SQLContext(sc);
 StructType customSchema = new StructType(new StructField[] {
-    new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
+    new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
     new StructField("author", DataTypes.StringType, true, Metadata.empty()),
     new StructField("description", DataTypes.StringType, true, Metadata.empty()),
     new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -354,7 +354,7 @@ HashMap options = new HashMap();
 options.put("rowTag", "book");
 options.put("rootTag", "books");
 options.put("path", "newbooks.xml");

-df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
+df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
 ```

 ### Python API
@@ -366,7 +366,7 @@ from pyspark.sql import SQLContext
 sqlContext = SQLContext(sc)

 df = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='book').load('books.xml')
-df.select("author", "@id").write \
+df.select("author", "_id").write \
     .format('com.databricks.spark.xml') \
     .options(rowTag='book', rootTag='books') \
     .save('newbooks.xml')
@@ -379,7 +379,7 @@ from pyspark.sql.types import *

 sqlContext = SQLContext(sc)
 customSchema = StructType([ \
-    StructField("@id", StringType(), True), \
+    StructField("_id", StringType(), True), \
     StructField("author", StringType(), True), \
     StructField("description", StringType(), True), \
     StructField("genre", StringType(), True), \
@@ -392,7 +392,7 @@ df = sqlContext.read \
     .options(rowTag='book') \
     .load('books.xml', schema = customSchema)

-df.select("author", "@id").write \
+df.select("author", "_id").write \
     .format('com.databricks.spark.xml') \
     .options(rowTag='book', rootTag='books') \
     .save('newbooks.xml')
@@ -406,7 +406,7 @@ from pyspark.sql import SQLContext
 sqlContext = SQLContext(sc)

 df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', path = 'books.xml')
-df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
+df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
 ```

 You can manually specify schema:
@@ -416,7 +416,7 @@ from pyspark.sql.types import *

 sqlContext = SQLContext(sc)
 customSchema = StructType([ \
-    StructField("@id", StringType(), True), \
+    StructField("_id", StringType(), True), \
     StructField("author", StringType(), True), \
     StructField("description", StringType(), True), \
     StructField("genre", StringType(), True), \
@@ -425,7 +425,7 @@ customSchema = StructType([ \
     StructField("title", StringType(), True)])

 df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', schema = customSchema, path = 'books.xml')
-df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
+df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
 ```

@@ -452,7 +452,7 @@ library(SparkR)
 Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:0.3.3" "sparkr-shell"')
 sqlContext <- sparkRSQL.init(sc)
 customSchema <- structType(
-    structField("@id", "string"),
+    structField("_id", "string"),
     structField("author", "string"),
     structField("description", "string"),
     structField("genre", "string"),
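A likely practical benefit of `_` over `@` (an assumption about the motivation, but consistent with the DDL change above): `_id` is an ordinary SQL identifier, so attribute columns can be referenced from SQL without special quoting. A sketch, with the table and path names illustrative:

```scala
// Register the XML source as a table, as in the SQL API section above.
sqlContext.sql(
  """CREATE TEMPORARY TABLE books
    |USING com.databricks.spark.xml
    |OPTIONS (path "books.xml", rowTag "book")""".stripMargin)

// `_id` parses as a plain identifier; `@id` was awkward to reference here.
sqlContext.sql("SELECT author, _id FROM books").show()
```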
diff --git a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
index 01ccaa7c..e279f140 100644
--- a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
+++ b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
@@ -38,8 +38,8 @@ private[xml] class XmlOptions(
 }

 private[xml] object XmlOptions {
-  val DEFAULT_ATTRIBUTE_PREFIX = "@"
-  val DEFAULT_VALUE_TAG = "#VALUE"
+  val DEFAULT_ATTRIBUTE_PREFIX = "_"
+  val DEFAULT_VALUE_TAG = "_VALUE"
   val DEFAULT_ROW_TAG = "ROW"
   val DEFAULT_ROOT_TAG = "ROWS"
   val DEFAULT_CHARSET = "UTF-8"
diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
index 6d8d049c..c638d90e 100755
--- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
+++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -564,15 +564,35 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
   }

   test("DSL test parsing and inferring attribute in elements having no child element") {
+    // Default values.
+    val resultsOne = new XmlReader()
+      .withRowTag(booksTag)
+      .xmlFile(sqlContext, booksAttributesInNoChild)
+
+    val schemaOne = StructType(List(
+      StructField("_id", StringType, nullable = true),
+      StructField("author", StringType, nullable = true),
+      StructField("price", StructType(
+        List(StructField("_VALUE", StringType, nullable = true),
+          StructField("_unit", StringType, nullable = true))),
+        nullable = true),
+      StructField("publish_date", StringType, nullable = true),
+      StructField("title", StringType, nullable = true))
+    )
+
+    assert(resultsOne.schema === schemaOne)
+    assert(resultsOne.count == numBooks)
+
+    // Explicitly set.
     val attributePrefix = "@#"
     val valueTag = "#@@value"
-    val results = new XmlReader()
+    val resultsTwo = new XmlReader()
       .withRowTag(booksTag)
       .withAttributePrefix(attributePrefix)
       .withValueTag(valueTag)
       .xmlFile(sqlContext, booksAttributesInNoChild)

-    val schema = StructType(List(
+    val schemaTwo = StructType(List(
       StructField(s"${attributePrefix}id", StringType, nullable = true),
       StructField("author", StringType, nullable = true),
       StructField("price", StructType(
@@ -583,8 +603,8 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
       StructField("title", StringType, nullable = true))
     )

-    assert(results.schema === schema)
-    assert(results.count == numBooks)
+    assert(resultsTwo.schema === schemaTwo)
+    assert(resultsTwo.count == numBooks)
   }

   test("DSL test schema (excluding tags) inferred correctly") {
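Condensed from the updated test, a standalone sketch of both halves of the behavior it pins down (the file name here stands in for the suite's `booksAttributesInNoChild` fixture, and `booksTag` is assumed to be `book`):

```scala
import com.databricks.spark.xml.XmlReader

// Built-in defaults: attributes get the `_` prefix, mixed content gets `_VALUE`.
val defaults = new XmlReader()
  .withRowTag("book")
  .xmlFile(sqlContext, "books-attributes-in-no-child.xml")
defaults.printSchema()
// `_id`, author, price: struct<_VALUE, _unit>, publish_date, title

// Explicit overrides, as exercised in the second half of the test.
val custom = new XmlReader()
  .withRowTag("book")
  .withAttributePrefix("@#")
  .withValueTag("#@@value")
  .xmlFile(sqlContext, "books-attributes-in-no-child.xml")
custom.printSchema()
// `@#id`, author, price: struct<#@@value, @#unit>, publish_date, title
```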