diff --git a/README.md b/README.md
index 8a58fdce..ee347a1c 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,8 @@ When reading files the API accepts several options:
* `excludeAttribute`: Whether to exclude attributes in elements. Default is false.
* `treatEmptyValuesAsNulls`: Whether to treat whitespace-only values as nulls. Default is false.
* `failFast`: Whether to fail when malformed rows are encountered in XML files, instead of dropping them. Default is false.
-* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the element value when the element also has attributes but no child elements. Default is `_VALUE`.
* `charset`: Defaults to `UTF-8` but can be set to other valid charset names.
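+
+For example, a read that overrides these defaults back to the old values might look like the sketch below (`books.xml` is a placeholder path; `sc` is an existing `SparkContext`):
+
+```scala
+import org.apache.spark.sql.SQLContext
+
+val sqlContext = new SQLContext(sc)
+val df = sqlContext.read
+  .format("com.databricks.spark.xml")
+  .option("rowTag", "book")
+  .option("attributePrefix", "@")  // restore the previous "@" prefix
+  .option("valueTag", "#VALUE")    // restore the previous "#VALUE" tag
+  .load("books.xml")
+```
+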
When writing files the API accepts several options:
@@ -60,8 +60,8 @@ When writing files the API accepts several options:
* `rowTag`: The row tag of your XML files to treat as a row. For example, in this XML `<books><book>...</book></books>`, the appropriate value would be `book`. Default is `ROW`.
* `rootTag`: The root tag of your XML files to treat as the root. For example, in this XML `<books><book>...</book></books>`, the appropriate value would be `books`. Default is `ROWS`.
* `nullValue`: The value used to write `null` values. Default is the string `null`. When this is `null`, it does not write attributes and elements for fields.
-* `attributePrefix`: The prefix for attributes so that we can differentiating attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the element value when the element also has attributes but no child elements. Default is `_VALUE`.
* `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of the case-insensitive shortened names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.
The shortened name is also supported: from Spark 1.5.0+, you can use just `xml` instead of `com.databricks.spark.xml`.
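+
+For example, a write combining these options might look like the sketch below (`df` is a `DataFrame` loaded as above; the output path is a placeholder):
+
+```scala
+df.write
+  .format("xml")  // shortened name, Spark 1.5.0+
+  .option("rootTag", "books")
+  .option("rowTag", "book")
+  .option("compression", "gzip")
+  .save("newbooks.xml")
+```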
@@ -87,7 +87,7 @@ Due to the structure differences between `DataFrame` and XML, there are some con
```
root
- |-- @myOneAttrib: string (nullable = true)
+ |-- _myOneAttrib: string (nullable = true)
|-- two: string (nullable = true)
|-- three: string (nullable = true)
```
@@ -106,8 +106,8 @@ Due to the structure differences between `DataFrame` and XML, there are some con
```
root
|-- two: struct (nullable = true)
- | |-- #VALUE: string (nullable = true)
- | |-- @myTwoAttrib: string (nullable = true)
+ | |-- _VALUE: string (nullable = true)
+ | |-- _myTwoAttrib: string (nullable = true)
|-- three: string (nullable = true)
```
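+
+Struct fields produced this way can be selected with dot notation, as in the sketch below (assuming the `DataFrame` above is bound to `df`):
+
+```scala
+// "_VALUE" holds the element text; "_myTwoAttrib" holds the attribute value.
+df.select("two._VALUE", "two._myTwoAttrib").show()
+```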
@@ -163,7 +163,7 @@ OPTIONS (path "books.xml", rowTag "book")
You can also specify column names and types in DDL. In this case, the schema is not inferred.
```sql
-CREATE TABLE books (author string, description string, genre string, @id string, price double, publish_date string, title string)
+CREATE TABLE books (author string, description string, genre string, _id string, price double, publish_date string, title string)
USING com.databricks.spark.xml
OPTIONS (path "books.xml", rowTag "book")
```
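+
+The table can then be queried like any other, as in the sketch below:
+
+```scala
+sqlContext.sql("SELECT _id, author, title FROM books").show()
+```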
@@ -180,7 +180,7 @@ val df = sqlContext.read
.option("rowTag", "book")
.load("books.xml")
-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.write
.format("com.databricks.spark.xml")
.option("rootTag", "books")
@@ -195,7 +195,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, DoubleTy
val sqlContext = new SQLContext(sc)
val customSchema = StructType(Array(
- StructField("@id", StringType, nullable = true),
+ StructField("_id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("description", StringType, nullable = true),
StructField("genre", StringType ,nullable = true),
@@ -210,7 +210,7 @@ val df = sqlContext.read
.schema(customSchema)
.load("books.xml")
-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.write
.format("com.databricks.spark.xml")
.option("rootTag", "books")
@@ -228,7 +228,7 @@ val df = sqlContext.load(
"com.databricks.spark.xml",
Map("path" -> "books.xml", "rowTag" -> "book"))
-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.save("com.databricks.spark.xml",
SaveMode.ErrorIfExists,
Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
@@ -241,7 +241,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerT
val sqlContext = new SQLContext(sc)
val customSchema = StructType(Array(
- StructField("@id", StringType, nullable = true),
+ StructField("_id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("description", StringType, nullable = true),
StructField("genre", StringType ,nullable = true),
@@ -254,7 +254,7 @@ val df = sqlContext.load(
schema = customSchema,
Map("path" -> "books.xml", "rowTag" -> "book"))
-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.save("com.databricks.spark.xml",
SaveMode.ErrorIfExists,
Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
@@ -272,7 +272,7 @@ DataFrame df = sqlContext.read()
.option("rowTag", "book")
.load("books.xml");
-df.select("author", "@id").write()
+df.select("author", "_id").write()
.format("com.databricks.spark.xml")
.option("rootTag", "books")
.option("rowTag", "book")
@@ -286,7 +286,7 @@ import org.apache.spark.sql.types.*;
SQLContext sqlContext = new SQLContext(sc);
StructType customSchema = new StructType(new StructField[] {
- new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
+ new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
new StructField("author", DataTypes.StringType, true, Metadata.empty()),
new StructField("description", DataTypes.StringType, true, Metadata.empty()),
new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -301,7 +301,7 @@ DataFrame df = sqlContext.read()
.schema(customSchema)
.load("books.xml");
-df.select("author", "@id").write()
+df.select("author", "_id").write()
.format("com.databricks.spark.xml")
.option("rootTag", "books")
.option("rowTag", "book")
@@ -326,7 +326,7 @@ HashMap<String, String> options = new HashMap<String, String>();
options.put("rowTag", "book");
options.put("rootTag", "books");
options.put("path", "newbooks.xml");
-df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
+df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
```
You can manually specify schema:
@@ -336,7 +336,7 @@ import org.apache.spark.sql.types.*;
SQLContext sqlContext = new SQLContext(sc);
StructType customSchema = new StructType(new StructField[] {
- new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
+ new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
new StructField("author", DataTypes.StringType, true, Metadata.empty()),
new StructField("description", DataTypes.StringType, true, Metadata.empty()),
new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -354,7 +354,7 @@ HashMap<String, String> options = new HashMap<String, String>();
options.put("rowTag", "book");
options.put("rootTag", "books");
options.put("path", "newbooks.xml");
-df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
+df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
```
### Python API
@@ -366,7 +366,7 @@ from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='book').load('books.xml')
-df.select("author", "@id").write \
+df.select("author", "_id").write \
.format('com.databricks.spark.xml') \
.options(rowTag='book', rootTag='books') \
.save('newbooks.xml')
@@ -379,7 +379,7 @@ from pyspark.sql.types import *
sqlContext = SQLContext(sc)
customSchema = StructType([ \
- StructField("@id", StringType(), True), \
+ StructField("_id", StringType(), True), \
StructField("author", StringType(), True), \
StructField("description", StringType(), True), \
StructField("genre", StringType(), True), \
@@ -392,7 +392,7 @@ df = sqlContext.read \
.options(rowTag='book') \
.load('books.xml', schema = customSchema)
-df.select("author", "@id").write \
+df.select("author", "_id").write \
.format('com.databricks.spark.xml') \
.options(rowTag='book', rootTag='books') \
.save('newbooks.xml')
@@ -406,7 +406,7 @@ from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', path = 'books.xml')
-df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
+df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
```
You can manually specify schema:
@@ -416,7 +416,7 @@ from pyspark.sql.types import *
sqlContext = SQLContext(sc)
customSchema = StructType([ \
- StructField("@id", StringType(), True), \
+ StructField("_id", StringType(), True), \
StructField("author", StringType(), True), \
StructField("description", StringType(), True), \
StructField("genre", StringType(), True), \
@@ -425,7 +425,7 @@ customSchema = StructType([ \
StructField("title", StringType(), True)])
df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', schema = customSchema, path = 'books.xml')
-df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
+df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
```
@@ -452,7 +452,7 @@ library(SparkR)
Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-xml_2.10:0.3.3" "sparkr-shell"')
sqlContext <- sparkRSQL.init(sc)
customSchema <- structType(
- structField("@id", "string"),
+ structField("_id", "string"),
structField("author", "string"),
structField("description", "string"),
structField("genre", "string"),
diff --git a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
index 01ccaa7c..e279f140 100644
--- a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
+++ b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
@@ -38,8 +38,8 @@ private[xml] class XmlOptions(
}
private[xml] object XmlOptions {
- val DEFAULT_ATTRIBUTE_PREFIX = "@"
- val DEFAULT_VALUE_TAG = "#VALUE"
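+ // "_" and "_VALUE" keep generated field names valid in formats such as
+ // Avro, which reject "@" and "#" in identifiers.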
+ val DEFAULT_ATTRIBUTE_PREFIX = "_"
+ val DEFAULT_VALUE_TAG = "_VALUE"
val DEFAULT_ROW_TAG = "ROW"
val DEFAULT_ROOT_TAG = "ROWS"
val DEFAULT_CHARSET = "UTF-8"
diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
index 6d8d049c..c638d90e 100755
--- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
+++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -564,15 +564,35 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
}
test("DSL test parsing and inferring attribute in elements having no child element") {
+ // Default value.
+ val resultsOne = new XmlReader()
+ .withRowTag(booksTag)
+ .xmlFile(sqlContext, booksAttributesInNoChild)
+
+ val schemaOne = StructType(List(
+ StructField("_id", StringType, nullable = true),
+ StructField("author", StringType, nullable = true),
+ StructField("price", StructType(
+ List(StructField("_VALUE", StringType, nullable = true),
+ StructField(s"_unit", StringType, nullable = true))),
+ nullable = true),
+ StructField("publish_date", StringType, nullable = true),
+ StructField("title", StringType, nullable = true))
+ )
+
+ assert(resultsOne.schema === schemaOne)
+ assert(resultsOne.count == numBooks)
+
+ // Explicitly set
val attributePrefix = "@#"
val valueTag = "#@@value"
- val results = new XmlReader()
+ val resultsTwo = new XmlReader()
.withRowTag(booksTag)
.withAttributePrefix(attributePrefix)
.withValueTag(valueTag)
.xmlFile(sqlContext, booksAttributesInNoChild)
- val schema = StructType(List(
+ val schemaTwo = StructType(List(
StructField(s"${attributePrefix}id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("price", StructType(
@@ -583,8 +603,8 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
StructField("title", StringType, nullable = true))
)
- assert(results.schema === schema)
- assert(results.count == numBooks)
+ assert(resultsTwo.schema === schemaTwo)
+ assert(resultsTwo.count == numBooks)
}
test("DSL test schema (excluding tags) inferred correctly") {