apache · sandip-db · Jul 6, 2023 · Jul 7, 2023 · Jul 7, 2023 · Jul 10, 2023
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
@@ -145,3 +145,4 @@ empty.proto
 .*\.proto.bin
 LimitedInputStream.java
 TimSort.java
+xml-resources/*
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -245,9 +245,11 @@ super-csv/2.2.0//super-csv-2.2.0.jar
 threeten-extra/1.7.1//threeten-extra-1.7.1.jar
 tink/1.9.0//tink-1.9.0.jar
 transaction-api/1.1//transaction-api-1.1.jar
+txw2/3.0.2//txw2-3.0.2.jar
 univocity-parsers/2.9.1//univocity-parsers-2.9.1.jar
 wildfly-openssl/1.1.3.Final//wildfly-openssl-1.1.3.Final.jar
 xbean-asm9-shaded/4.23//xbean-asm9-shaded-4.23.jar
+xmlschema-core/2.3.0//xmlschema-core-2.3.0.jar
 xz/1.9//xz-1.9.jar
 zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar
 zookeeper-jute/3.6.3//zookeeper-jute-3.6.3.jar

diff --git a/pom.xml b/pom.xml
@@ -182,6 +182,8 @@
     <codehaus.jackson.version>1.9.13</codehaus.jackson.version>
     <fasterxml.jackson.version>2.15.2</fasterxml.jackson.version>
     <fasterxml.jackson.databind.version>2.15.2</fasterxml.jackson.databind.version>
+    <ws.xmlschema.version>2.3.0</ws.xmlschema.version>
+    <org.glassfish.jaxb.txw2.version>3.0.2</org.glassfish.jaxb.txw2.version>
     <snappy.version>1.1.10.3</snappy.version>
     <netlib.ludovic.dev.version>3.0.3</netlib.ludovic.dev.version>
     <commons-codec.version>1.16.0</commons-codec.version>
@@ -986,6 +988,16 @@
         <artifactId>jackson-module-jaxb-annotations</artifactId>
         <version>${fasterxml.jackson.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.ws.xmlschema</groupId>
+        <artifactId>xmlschema-core</artifactId>
+        <version>${ws.xmlschema.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.glassfish.jaxb</groupId>
+        <artifactId>txw2</artifactId>
+        <version>${org.glassfish.jaxb.txw2.version}</version>
+      </dependency>
       <dependency>
         <groupId>org.glassfish.jersey.core</groupId>
         <artifactId>jersey-server</artifactId>

diff --git a/sql/core/pom.xml b/sql/core/pom.xml
@@ -143,6 +143,14 @@
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.ws.xmlschema</groupId>
+      <artifactId>xmlschema-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.glassfish.jaxb</groupId>
+      <artifactId>txw2</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.xbean</groupId>
       <artifactId>xbean-asm9-shaded</artifactId>

diff --git a/...core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/...core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -22,6 +22,7 @@ org.apache.spark.sql.execution.datasources.noop.NoopDataSource
 org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
 org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2
 org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2
+org.apache.spark.sql.execution.datasources.xml.DefaultSource
 org.apache.spark.sql.execution.streaming.ConsoleSinkProvider
 org.apache.spark.sql.execution.streaming.sources.RateStreamProvider
 org.apache.spark.sql.execution.streaming.sources.TextSocketSourceProvider

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/DefaultSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/DefaultSource.scala
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.xml
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}
+import org.apache.spark.sql.execution.datasources.xml.util.XmlFile
+import org.apache.spark.sql.sources._
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Relation provider for the XML data source.
+ *
+ * It supports reading with or without a user-supplied schema and writing a `DataFrame`
+ * out as XML. Every operation requires the 'path' option.
+ */
+class DefaultSource
+  extends RelationProvider
+  with SchemaRelationProvider
+  with CreatableRelationProvider
+  with DataSourceRegister {
+
+  /**
+   * Short name under which this XML data source is registered.
+   */
+  override def shortName(): String = "xml"
+
+  /**
+   * Returns the value of the mandatory 'path' option.
+   *
+   * @throws IllegalArgumentException if 'path' is absent from `parameters`
+   */
+  private def checkPath(parameters: Map[String, String]): String = {
+    parameters.getOrElse("path",
+      throw new IllegalArgumentException("'path' must be specified for XML data."))
+  }
+
+  /**
+   * Creates a new relation for data stored in XML given parameters.
+   * Parameters have to include 'path'.
+   */
+  override def createRelation(
+      sqlContext: SQLContext,
+      parameters: Map[String, String]): BaseRelation = {
+    // No schema was supplied, so `null` is handed to the schema-aware overload.
+    // NOTE(review): presumably XmlRelation infers the schema in that case - confirm.
+    createRelation(sqlContext, parameters, null)
+  }
+
+  /**
+   * Creates a new relation for data stored in XML given parameters and a user-supplied schema.
+   * Parameters have to include 'path'.
+   *
+   * When `spark.sql.session.timeZone` is set in the Spark configuration it is used as the
+   * default for the 'timezone' option; a 'timezone' passed explicitly in `parameters` wins.
+   *
+   * @throws IllegalArgumentException if 'path' is absent from `parameters`
+   */
+  override def createRelation(
+      sqlContext: SQLContext,
+      parameters: Map[String, String],
+      schema: StructType): XmlRelation = {
+    val path = checkPath(parameters)
+    // We need the `charset` and `rowTag` before creating the relation. Only these two values
+    // are kept; the XmlOptions instance itself is not referenced by the closure below.
+    val (charset, rowTag) = {
+      val options = XmlOptions(parameters)
+      (options.charset, options.rowTag)
+    }
+
+    // The configured session time zone is only a default: an explicit per-read 'timezone'
+    // option must not be silently replaced by it.
+    val paramsWithTZ =
+      if (parameters.contains("timezone")) {
+        parameters
+      } else {
+        sqlContext.sparkContext.getConf.getOption("spark.sql.session.timeZone") match {
+          case Some(tz) => parameters.updated("timezone", tz)
+          case None => parameters
+        }
+      }
+
+    XmlRelation(
+      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
+      Some(path),
+      paramsWithTZ,
+      schema)(sqlContext)
+  }
+
+  /**
+   * Writes `data` as XML to 'path' according to `mode` and returns a relation over that path
+   * that uses the schema of `data`.
+   *
+   * If the path already exists:
+   *  - `Append` is rejected,
+   *  - `Overwrite` deletes the path recursively before writing,
+   *  - `ErrorIfExists` is rejected,
+   *  - `Ignore` skips the write.
+   * If the path does not exist the data is always written.
+   *
+   * @throws IllegalArgumentException if 'path' is missing, or if the path exists and `mode`
+   *                                  is `Append` or `ErrorIfExists`
+   */
+  override def createRelation(
+      sqlContext: SQLContext,
+      mode: SaveMode,
+      parameters: Map[String, String],
+      data: DataFrame): BaseRelation = {
+    val path = checkPath(parameters)
+    val filesystemPath = new Path(path)
+    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
+    val doSave = if (fs.exists(filesystemPath)) {
+      mode match {
+        case SaveMode.Append =>
+          throw new IllegalArgumentException(
+            s"Append mode is not supported by ${this.getClass.getCanonicalName}")
+        case SaveMode.Overwrite =>
+          // Recursive delete so the subsequent write starts from an empty location.
+          fs.delete(filesystemPath, true)
+          true
+        case SaveMode.ErrorIfExists =>
+          throw new IllegalArgumentException(s"path $path already exists.")
+        case SaveMode.Ignore => false
+      }
+    } else {
+      true
+    }
+    if (doSave) {
+      // Only save data when the save mode is not ignore.
+      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
+    }
+    createRelation(sqlContext, parameters, data.schema)
+  }
+}
diff --git a/...ore/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataToCatalyst.scala b/...ore/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataToCatalyst.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.xml
+
+import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}
+import org.apache.spark.sql.execution.datasources.xml.parsers.StaxXmlParser
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * Expression that parses XML text with `StaxXmlParser.parseColumn`.
+ *
+ * Two shapes of `schema` are supported:
+ *  - a `StructType`: the input is a single string and one XML document is parsed;
+ *  - an `ArrayType` of `StructType`: the input is an array of strings and every element is
+ *    parsed against the element struct.
+ * Any other `schema` fails with a `MatchError` when `rowSchema` or `inputTypes` is evaluated.
+ *
+ * @param child expression producing the XML string (or array of XML strings)
+ * @param schema result type of this expression, see above
+ * @param options options handed to the parser for every document
+ */
+case class XmlDataToCatalyst(
+    child: Expression,
+    schema: DataType,
+    options: XmlOptions)
+  extends UnaryExpression with CodegenFallback with ExpectsInputTypes {
+
+  override lazy val dataType: DataType = schema
+
+  /** Struct schema each XML document is parsed against (the element type for arrays). */
+  @transient
+  lazy val rowSchema: StructType = schema match {
+    case st: StructType => st
+    case ArrayType(st: StructType, _) => st
+  }
+
+  /**
+   * Parses the non-null input value.
+   *
+   * Catalyst values (`UTF8String`, `ArrayData`) are parsed and the result is converted back
+   * with `CatalystTypeConverters.convertToCatalyst`; plain `String` / `Array` inputs return
+   * the parser result unconverted. Unrecognised input types yield null.
+   */
+  override def nullSafeEval(xml: Any): Any = xml match {
+    case string: UTF8String =>
+      CatalystTypeConverters.convertToCatalyst(
+        StaxXmlParser.parseColumn(string.toString, rowSchema, options))
+    case string: String =>
+      StaxXmlParser.parseColumn(string, rowSchema, options)
+    case arr: GenericArrayData =>
+      CatalystTypeConverters.convertToCatalyst(parseElements(arr.array))
+    case arr: ArrayData =>
+      // Any other ArrayData implementation used to fall through to the default case and
+      // silently produce null. Read its elements as strings instead.
+      CatalystTypeConverters.convertToCatalyst(parseElements(arr.toObjectArray(StringType)))
+    case arr: Array[_] =>
+      parseElements(arr)
+    case _ => null
+  }
+
+  /** Parses every element of `elements`; a null element stays null instead of throwing. */
+  private def parseElements(elements: Array[_]) = elements.map {
+    case null => null
+    case element => StaxXmlParser.parseColumn(element.toString, rowSchema, options)
+  }
+
+  override def inputTypes: Seq[DataType] = schema match {
+    case _: StructType => Seq(StringType)
+    case ArrayType(_: StructType, _) => Seq(ArrayType(StringType))
+  }
+
+  override protected def withNewChildInternal(newChild: Expression): XmlDataToCatalyst =
+    copy(child = newChild)
+}