54 changes: 27 additions & 27 deletions README.md
@@ -51,17 +51,17 @@ When reading files the API accepts several options:
* `excludeAttribute` : Whether you want to exclude attributes in elements or not. Default is false.
* `treatEmptyValuesAsNulls` : Whether you want to treat whitespace-only values as `null`. Default is false.
* `failFast` : Whether to fail when malformed rows in XML files cannot be parsed, instead of dropping those rows. Default is false.
-* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
* `charset`: Defaults to 'UTF-8' but can be set to other valid charset names
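
As a quick illustration of the reading options above, here is a minimal sketch in Scala; the path and tag names are placeholders, not part of this change:

```scala
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)

// Read each <book> element as a row. The attributePrefix, valueTag and
// charset options below simply spell out the (new) defaults.
val df = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .option("attributePrefix", "_")  // attributes become fields like _id
  .option("valueTag", "_VALUE")    // text of a leaf element that has attributes
  .option("charset", "UTF-8")
  .load("books.xml")
```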

When writing files the API accepts several options:
* `path`: Location to write files.
* `rowTag`: The row tag of your xml files to treat as a row. For example, in this xml `<books><book>...</book></books>`, the appropriate value would be `book`. Default is `ROW`.
* `rootTag`: The root tag of your xml files to treat as the root. For example, in this xml `<books><book>...</book></books>`, the appropriate value would be `books`. Default is `ROWS`.
* `nullValue`: The value used to write `null` values. Default is the string `null`. When this is set to `null`, it does not write attributes and elements for those fields.
-* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `@`.
-* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `#VALUE`.
+* `attributePrefix`: The prefix for attributes so that we can differentiate attributes and elements. This will be the prefix for field names. Default is `_`.
+* `valueTag`: The tag used for the value when there are attributes in the element having no child. Default is `_VALUE`.
* `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of the case-insensitive shortened names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified.

The shortened name is also supported: from Spark 1.5.0+ you can use just `xml` instead of `com.databricks.spark.xml`.
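
For example, a write combining the shortened name with the options above might look like this (a sketch; assumes Spark 1.5.0+ and an existing DataFrame `df`):

```scala
// "xml" resolves to com.databricks.spark.xml on Spark 1.5.0+.
// Produces <books><book>...</book></books>, gzip-compressed.
df.write
  .format("xml")
  .option("rootTag", "books")
  .option("rowTag", "book")
  .option("compression", "gzip")
  .save("newbooks.xml")
```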
@@ -87,7 +87,7 @@ Due to the structure differences between `DataFrame` and XML, there are some con

```
root
-|-- @myOneAttrib: string (nullable = true)
+|-- _myOneAttrib: string (nullable = true)
|-- two: string (nullable = true)
|-- three: string (nullable = true)
```
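
For reference, input along these lines would infer the schema above; this sample is a reconstruction for illustration, not part of the diff:

```xml
<one myOneAttrib="AAAA">
    <two>two</two>
    <three>three</three>
</one>
```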
@@ -106,8 +106,8 @@ Due to the structure differences between `DataFrame` and XML, there are some con
```
root
|-- two: struct (nullable = true)
-| |-- #VALUE: string (nullable = true)
-| |-- @myTwoAttrib: string (nullable = true)
+| |-- _VALUE: string (nullable = true)
+| |-- _myTwoAttrib: string (nullable = true)
|-- three: string (nullable = true)
```
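
Likewise, an element that carries both an attribute and a text value, as in this reconstructed sample, yields the nested struct with `_VALUE` and `_myTwoAttrib` shown above:

```xml
<one>
    <two myTwoAttrib="BBBB">two</two>
    <three>three</three>
</one>
```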

@@ -163,7 +163,7 @@ OPTIONS (path "books.xml", rowTag "book")

You can also specify column names and types in DDL. In this case, we do not infer schema.
```sql
-CREATE TABLE books (author string, description string, genre string, @id string, price double, publish_date string, title string)
+CREATE TABLE books (author string, description string, genre string, _id string, price double, publish_date string, title string)
USING com.databricks.spark.xml
OPTIONS (path "books.xml", rowTag "book")
```
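
Once defined, the attribute-derived column is queried by its underscore-prefixed name like any other field; a hypothetical query:

```sql
-- _id is the field inferred from the id attribute on each <book> element.
SELECT _id, author, title
FROM books
WHERE price > 10.0
```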
@@ -180,7 +180,7 @@ val df = sqlContext.read
.option("rowTag", "book")
.load("books.xml")

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.write
.format("com.databricks.spark.xml")
.option("rootTag", "books")
@@ -195,7 +195,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, DoubleTy

val sqlContext = new SQLContext(sc)
val customSchema = StructType(Array(
StructField("@id", StringType, nullable = true),
StructField("_id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("description", StringType, nullable = true),
StructField("genre", StringType ,nullable = true),
@@ -210,7 +210,7 @@ val df = sqlContext.read
.schema(customSchema)
.load("books.xml")

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.write
.format("com.databricks.spark.xml")
.option("rootTag", "books")
@@ -228,7 +228,7 @@ val df = sqlContext.load(
"com.databricks.spark.xml",
Map("path" -> "books.xml", "rowTag" -> "book"))

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.save("com.databricks.spark.xml",
SaveMode.ErrorIfExists,
Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
@@ -241,7 +241,7 @@ import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerT

val sqlContext = new SQLContext(sc)
val customSchema = StructType(Array(
StructField("@id", StringType, nullable = true),
StructField("_id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("description", StringType, nullable = true),
StructField("genre", StringType ,nullable = true),
@@ -254,7 +254,7 @@ val df = sqlContext.load(
schema = customSchema,
Map("path" -> "books.xml", "rowTag" -> "book"))

-val selectedData = df.select("author", "@id")
+val selectedData = df.select("author", "_id")
selectedData.save("com.databricks.spark.xml",
SaveMode.ErrorIfExists,
Map("path" -> "newbooks.xml", "rootTag" -> "books", "rowTag" -> "book"))
@@ -272,7 +272,7 @@ DataFrame df = sqlContext.read()
.option("rowTag", "book")
.load("books.xml");

df.select("author", "@id").write()
df.select("author", "_id").write()
.format("com.databricks.spark.xml")
.option("rootTag", "books")
.option("rowTag", "book")
@@ -286,7 +286,7 @@ import org.apache.spark.sql.types.*;

SQLContext sqlContext = new SQLContext(sc);
StructType customSchema = new StructType(new StructField[] {
new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
new StructField("author", DataTypes.StringType, true, Metadata.empty()),
new StructField("description", DataTypes.StringType, true, Metadata.empty()),
new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -301,7 +301,7 @@ DataFrame df = sqlContext.read()
.schema(customSchema)
.load("books.xml");

df.select("author", "@id").write()
df.select("author", "_id").write()
.format("com.databricks.spark.xml")
.option("rootTag", "books")
.option("rowTag", "book")
@@ -326,7 +326,7 @@ HashMap<String, String> options = new HashMap<String, String>();
options.put("rowTag", "book");
options.put("rootTag", "books");
options.put("path", "newbooks.xml");
df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
```

You can manually specify schema:
@@ -336,7 +336,7 @@ import org.apache.spark.sql.types.*;

SQLContext sqlContext = new SQLContext(sc);
StructType customSchema = new StructType(new StructField[] {
new StructField("@id", DataTypes.StringType, true, Metadata.empty()),
new StructField("_id", DataTypes.StringType, true, Metadata.empty()),
new StructField("author", DataTypes.StringType, true, Metadata.empty()),
new StructField("description", DataTypes.StringType, true, Metadata.empty()),
new StructField("genre", DataTypes.StringType, true, Metadata.empty()),
@@ -354,7 +354,7 @@ HashMap<String, String> options = new HashMap<String, String>();
options.put("rowTag", "book");
options.put("rootTag", "books");
options.put("path", "newbooks.xml");
df.select("author", "@id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
df.select("author", "_id").save("com.databricks.spark.xml", SaveMode.ErrorIfExists, options)
```

### Python API
@@ -366,7 +366,7 @@ from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='book').load('books.xml')
df.select("author", "@id").write \
df.select("author", "_id").write \
.format('com.databricks.spark.xml') \
.options(rowTag='book', rootTag='books') \
.save('newbooks.xml')
@@ -379,7 +379,7 @@ from pyspark.sql.types import *

sqlContext = SQLContext(sc)
customSchema = StructType([ \
StructField("@id", StringType(), True), \
StructField("_id", StringType(), True), \
StructField("author", StringType(), True), \
StructField("description", StringType(), True), \
StructField("genre", StringType(), True), \
@@ -392,7 +392,7 @@ df = sqlContext.read \
.options(rowTag='book') \
.load('books.xml', schema = customSchema)

df.select("author", "@id").write \
df.select("author", "_id").write \
.format('com.databricks.spark.xml') \
.options(rowTag='book', rootTag='books') \
.save('newbooks.xml')
@@ -406,7 +406,7 @@ from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', path = 'books.xml')
df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
```

You can manually specify schema:
@@ -416,7 +416,7 @@ from pyspark.sql.types import *

sqlContext = SQLContext(sc)
customSchema = StructType([ \
StructField("@id", StringType(), True), \
StructField("_id", StringType(), True), \
StructField("author", StringType(), True), \
StructField("description", StringType(), True), \
StructField("genre", StringType(), True), \
@@ -425,7 +425,7 @@ customSchema = StructType([ \
StructField("title", StringType(), True)])

df = sqlContext.load(source="com.databricks.spark.xml", rowTag = 'book', schema = customSchema, path = 'books.xml')
df.select("author", "@id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
df.select("author", "_id").save('newbooks.xml', rootTag = 'books', rowTag = 'book', path = 'newbooks.xml')
```


@@ -452,7 +452,7 @@ library(SparkR)
Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-xml_2.10:0.3.3" "sparkr-shell"')
sqlContext <- sparkRSQL.init(sc)
customSchema <- structType(
structField("@id", "string"),
structField("_id", "string"),
structField("author", "string"),
structField("description", "string"),
structField("genre", "string"),
4 changes: 2 additions & 2 deletions src/main/scala/com/databricks/spark/xml/XmlOptions.scala
@@ -38,8 +38,8 @@ private[xml] class XmlOptions(
}

private[xml] object XmlOptions {
val DEFAULT_ATTRIBUTE_PREFIX = "@"
val DEFAULT_VALUE_TAG = "#VALUE"
val DEFAULT_ATTRIBUTE_PREFIX = "_"
val DEFAULT_VALUE_TAG = "_VALUE"
val DEFAULT_ROW_TAG = "ROW"
val DEFAULT_ROOT_TAG = "ROWS"
val DEFAULT_CHARSET = "UTF-8"
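Note that these are only defaults: code that depended on the old markers can restore them per read or write via the corresponding options. A minimal sketch:

```scala
// Opt back into the pre-change markers for a single read.
val legacyDf = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .option("attributePrefix", "@")  // old DEFAULT_ATTRIBUTE_PREFIX
  .option("valueTag", "#VALUE")    // old DEFAULT_VALUE_TAG
  .load("books.xml")
```
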
28 changes: 24 additions & 4 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -564,15 +564,35 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
}

test("DSL test parsing and inferring attribute in elements having no child element") {
// Default value.
val resultsOne = new XmlReader()
.withRowTag(booksTag)
.xmlFile(sqlContext, booksAttributesInNoChild)

val schemaOne = StructType(List(
StructField("_id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("price", StructType(
List(StructField("_VALUE", StringType, nullable = true),
StructField(s"_unit", StringType, nullable = true))),
nullable = true),
StructField("publish_date", StringType, nullable = true),
StructField("title", StringType, nullable = true))
)

assert(resultsOne.schema === schemaOne)
assert(resultsOne.count == numBooks)

// Explicitly set
val attributePrefix = "@#"
val valueTag = "#@@value"
val results = new XmlReader()
val resultsTwo = new XmlReader()
.withRowTag(booksTag)
.withAttributePrefix(attributePrefix)
.withValueTag(valueTag)
.xmlFile(sqlContext, booksAttributesInNoChild)

val schema = StructType(List(
val schemaTwo = StructType(List(
StructField(s"${attributePrefix}id", StringType, nullable = true),
StructField("author", StringType, nullable = true),
StructField("price", StructType(
@@ -583,8 +583,8 @@ class XmlSuite extends FunSuite with BeforeAndAfterAll {
StructField("title", StringType, nullable = true))
)

assert(results.schema === schema)
assert(results.count == numBooks)
assert(resultsTwo.schema === schemaTwo)
assert(resultsTwo.count == numBooks)
   }

test("DSL test schema (excluding tags) inferred correctly") {