diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml
index be4083714d9c..ecc973e32a8d 100644
--- a/.github/workflows/spark-ci.yml
+++ b/.github/workflows/spark-ci.yml
@@ -72,17 +72,21 @@ jobs:
strategy:
matrix:
jvm: [11, 17, 21]
- spark: ['3.4', '3.5', '4.0']
+ spark: ['3.4', '3.5', '4.0', '4.1']
scala: ['2.12', '2.13']
exclude:
# Spark 3.5 is the first version not failing on Java 21 (https://issues.apache.org/jira/browse/SPARK-42369)
# Full Java 21 support is coming in Spark 4 (https://issues.apache.org/jira/browse/SPARK-43831)
- jvm: 11
spark: '4.0'
+ - jvm: 11
+ spark: '4.1'
- jvm: 21
spark: '3.4'
- spark: '4.0'
scala: '2.12'
+ - spark: '4.1'
+ scala: '2.12'
env:
SPARK_LOCAL_IP: localhost
steps:
diff --git a/.gitignore b/.gitignore
index f931c10e9407..bcac4d1610fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,8 @@ spark/v3.5/spark/benchmark/*
spark/v3.5/spark-extensions/benchmark/*
spark/v4.0/spark/benchmark/*
spark/v4.0/spark-extensions/benchmark/*
+spark/v4.1/spark/benchmark/*
+spark/v4.1/spark-extensions/benchmark/*
*/benchmark/*
__pycache__/
diff --git a/dev/stage-binaries.sh b/dev/stage-binaries.sh
index ec3080575b86..50f984eb404e 100755
--- a/dev/stage-binaries.sh
+++ b/dev/stage-binaries.sh
@@ -20,7 +20,7 @@
SCALA_VERSION=2.12
FLINK_VERSIONS=1.20,2.0,2.1
-SPARK_VERSIONS=3.4,3.5,4.0
+SPARK_VERSIONS=3.4,3.5,4.0,4.1
KAFKA_VERSIONS=3
./gradlew -Prelease -DscalaVersion=$SCALA_VERSION -DflinkVersions=$FLINK_VERSIONS -DsparkVersions=$SPARK_VERSIONS -DkafkaVersions=$KAFKA_VERSIONS publishApachePublicationToMavenRepository --no-parallel --no-configuration-cache
diff --git a/gradle.properties b/gradle.properties
index 0f70b49eb722..c0f283303ae0 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -18,8 +18,8 @@ jmhJsonOutputPath=build/reports/jmh/results.json
jmhIncludeRegex=.*
systemProp.defaultFlinkVersions=2.1
systemProp.knownFlinkVersions=1.20,2.0,2.1
-systemProp.defaultSparkVersions=4.0
-systemProp.knownSparkVersions=3.4,3.5,4.0
+systemProp.defaultSparkVersions=4.1
+systemProp.knownSparkVersions=3.4,3.5,4.0,4.1
systemProp.defaultKafkaVersions=3
systemProp.knownKafkaVersions=3
systemProp.defaultScalaVersion=2.12
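The defaultSparkVersions/knownSparkVersions properties above decide which Spark builds are wired in when nothing is passed on the command line; a -DsparkVersions flag overrides the default, which is how dev/stage-binaries.sh and the benchmark instructions in this PR select 4.1. A minimal sketch using standard Gradle tasks, shown only for illustration:

  # build against the new default (4.1 after this change)
  ./gradlew build

  # or pick the Spark versions explicitly, e.g. only the versions that still run on Java 11
  ./gradlew -DsparkVersions=3.4,3.5 build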
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 5e2d5435eb14..4fbba96317ce 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -87,6 +87,7 @@ snowflake-jdbc = "3.28.0"
spark34 = "3.4.4"
spark35 = "3.5.7"
spark40 = "4.0.1"
+spark41 = "4.1.0"
sqlite-jdbc = "3.51.1.0"
testcontainers = "2.0.3"
tez08 = { strictly = "0.8.4"} # see rich version usage explanation above
diff --git a/jmh.gradle b/jmh.gradle
index 57efb3821d8f..d2c4709bf349 100644
--- a/jmh.gradle
+++ b/jmh.gradle
@@ -53,6 +53,11 @@ if (sparkVersions.contains("4.0")) {
jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-4.0_2.13"))
}
+if (sparkVersions.contains("4.1")) {
+ jmhProjects.add(project(":iceberg-spark:iceberg-spark-4.1_2.13"))
+ jmhProjects.add(project(":iceberg-spark:iceberg-spark-extensions-4.1_2.13"))
+}
+
configure(jmhProjects) {
apply plugin: 'me.champeau.jmh'
apply plugin: 'io.morethan.jmhreport'
diff --git a/settings.gradle b/settings.gradle
index de342dda1476..70f9343a252b 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -175,6 +175,18 @@ if (sparkVersions.contains("4.0")) {
project(":iceberg-spark:spark-runtime-4.0_2.13").name = "iceberg-spark-runtime-4.0_2.13"
}
+if (sparkVersions.contains("4.1")) {
+ include ":iceberg-spark:spark-4.1_2.13"
+ include ":iceberg-spark:spark-extensions-4.1_2.13"
+ include ":iceberg-spark:spark-runtime-4.1_2.13"
+ project(":iceberg-spark:spark-4.1_2.13").projectDir = file('spark/v4.1/spark')
+ project(":iceberg-spark:spark-4.1_2.13").name = "iceberg-spark-4.1_2.13"
+ project(":iceberg-spark:spark-extensions-4.1_2.13").projectDir = file('spark/v4.1/spark-extensions')
+ project(":iceberg-spark:spark-extensions-4.1_2.13").name = "iceberg-spark-extensions-4.1_2.13"
+ project(":iceberg-spark:spark-runtime-4.1_2.13").projectDir = file('spark/v4.1/spark-runtime')
+ project(":iceberg-spark:spark-runtime-4.1_2.13").name = "iceberg-spark-runtime-4.1_2.13"
+}
+
if (kafkaVersions.contains("3")) {
include 'kafka-connect'
project(':kafka-connect').name = 'iceberg-kafka-connect'
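With the settings.gradle entries above in place, a quick sanity check that the 4.1 modules are picked up is to list the project tree for the requested Spark versions (a sketch using Gradle's built-in projects task):

  ./gradlew -DsparkVersions=4.1 projects | grep 4.1_2.13
  # expect: iceberg-spark-4.1_2.13, iceberg-spark-extensions-4.1_2.13, iceberg-spark-runtime-4.1_2.13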
diff --git a/spark/build.gradle b/spark/build.gradle
index 75d3f899e5d6..4d4a84fd390a 100644
--- a/spark/build.gradle
+++ b/spark/build.gradle
@@ -31,3 +31,7 @@ if (sparkVersions.contains("3.5")) {
if (sparkVersions.contains("4.0")) {
apply from: file("$projectDir/v4.0/build.gradle")
}
+
+if (sparkVersions.contains("4.1")) {
+ apply from: file("$projectDir/v4.1/build.gradle")
+}
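Once spark/build.gradle applies the v4.1 build script (added next), the new modules can be exercised individually; note that the 4.1 build skips itself unless it runs on JDK 17 or 21. A sketch of typical invocations, using the project and task names defined in this PR:

  # unit tests for the Spark 4.1 integration module
  ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-4.1_2.13:test

  # integration tests against the shaded runtime jar (integrationTest is defined in v4.1/build.gradle)
  ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-runtime-4.1_2.13:integrationTest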
diff --git a/spark/v4.1/build.gradle b/spark/v4.1/build.gradle
new file mode 100644
index 000000000000..14a07ac543c3
--- /dev/null
+++ b/spark/v4.1/build.gradle
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+String sparkMajorVersion = '4.1'
+String scalaVersion = '2.13'
+
+JavaVersion javaVersion = JavaVersion.current()
+Boolean javaVersionSupported = javaVersion == JavaVersion.VERSION_17 || javaVersion == JavaVersion.VERSION_21
+if (!javaVersionSupported) {
+ logger.warn("Skip Spark 4.1 build which requires JDK 17 or 21 but was executed with JDK " + javaVersion)
+}
+
+def sparkProjects = [
+ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}"),
+ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}"),
+ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersion}"),
+]
+
+configure(sparkProjects) {
+ tasks.configureEach {
+ onlyIf { javaVersionSupported }
+ }
+ configurations {
+ all {
+ resolutionStrategy {
+ force "com.fasterxml.jackson.module:jackson-module-scala_${scalaVersion}:${libs.versions.jackson215.get()}"
+ force "com.fasterxml.jackson.core:jackson-databind:${libs.versions.jackson215.get()}"
+ force "com.fasterxml.jackson.core:jackson-core:${libs.versions.jackson215.get()}"
+ }
+ }
+ }
+}
+
+project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") {
+ apply plugin: 'scala'
+ apply plugin: 'com.github.alisiikh.scalastyle'
+
+ // Set target to JDK17 for Spark 4.1 to fix following error
+ // "spark/v4.1/spark/src/main/scala/org/apache/spark/sql/stats/ThetaSketchAgg.scala:52:12: Class java.lang.Record not found"
+ tasks.withType(ScalaCompile.class) {
+ sourceCompatibility = "17"
+ targetCompatibility = "17"
+ scalaCompileOptions.additionalParameters.add("-release:17")
+ }
+
+ sourceSets {
+ main {
+ scala.srcDirs = ['src/main/scala', 'src/main/java']
+ java.srcDirs = []
+ }
+ }
+
+ dependencies {
+ implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
+ api project(':iceberg-api')
+ annotationProcessor libs.immutables.value
+ compileOnly libs.immutables.value
+ implementation project(':iceberg-common')
+ implementation project(':iceberg-core')
+ implementation project(':iceberg-data')
+ implementation project(':iceberg-orc')
+ implementation project(':iceberg-parquet')
+ implementation project(':iceberg-arrow')
+ implementation("org.scala-lang.modules:scala-collection-compat_${scalaVersion}:${libs.versions.scala.collection.compat.get()}")
+ implementation("org.apache.datasketches:datasketches-java:${libs.versions.datasketches.get()}")
+
+ compileOnly libs.errorprone.annotations
+ compileOnly libs.avro.avro
+ compileOnly("org.apache.spark:spark-hive_${scalaVersion}:${libs.versions.spark41.get()}") {
+ exclude group: 'org.apache.avro', module: 'avro'
+ exclude group: 'org.apache.arrow'
+ exclude group: 'org.apache.parquet'
+ // to make sure netty libs only come from project(':iceberg-arrow')
+ exclude group: 'io.netty', module: 'netty-buffer'
+ exclude group: 'io.netty', module: 'netty-common'
+ exclude group: 'org.roaringbitmap'
+ }
+
+ // TODO: datafusion-comet Spark 4.1 support
+ compileOnly "org.apache.datafusion:comet-spark-spark4.0_2.13:${libs.versions.comet.get()}"
+
+ implementation libs.parquet.column
+ implementation libs.parquet.hadoop
+
+ implementation("${libs.orc.core.get().module}:${libs.versions.orc.get()}:nohive") {
+ exclude group: 'org.apache.hadoop'
+ exclude group: 'commons-lang'
+ // These artifacts are shaded and included in the orc-core fat jar
+ exclude group: 'com.google.protobuf', module: 'protobuf-java'
+ exclude group: 'org.apache.hive', module: 'hive-storage-api'
+ }
+
+ implementation(libs.arrow.vector) {
+ exclude group: 'io.netty', module: 'netty-buffer'
+ exclude group: 'io.netty', module: 'netty-common'
+ exclude group: 'com.google.code.findbugs', module: 'jsr305'
+ }
+
+ implementation libs.caffeine
+
+ testImplementation(libs.hadoop3.minicluster) {
+ exclude group: 'org.apache.avro', module: 'avro'
+ // to make sure netty libs only come from project(':iceberg-arrow')
+ exclude group: 'io.netty', module: 'netty-buffer'
+ exclude group: 'io.netty', module: 'netty-common'
+ }
+ testImplementation project(path: ':iceberg-hive-metastore')
+ testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
+ testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) {
+ transitive = false
+ }
+ testImplementation libs.sqlite.jdbc
+ testImplementation libs.awaitility
+ testImplementation(testFixtures(project(':iceberg-parquet')))
+ // runtime dependencies for running REST Catalog based integration test
+ testRuntimeOnly libs.jetty.servlet
+ }
+
+ test {
+ useJUnitPlatform()
+ }
+
+ tasks.withType(Test) {
+ // Vectorized reads need more memory
+ maxHeapSize '3160m'
+ }
+}
+
+project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") {
+ apply plugin: 'java-library'
+ apply plugin: 'scala'
+ apply plugin: 'com.github.alisiikh.scalastyle'
+ apply plugin: 'antlr'
+
+ configurations {
+ /*
+ The Gradle Antlr plugin erroneously adds both antlr-build and runtime dependencies to the runtime path. This
+ bug https://github.com/gradle/gradle/issues/820 exists because older versions of Antlr do not have separate
+ runtime and implementation dependencies and they do not want to break backwards compatibility. So to only end up with
+ the runtime dependency on the runtime classpath we remove the dependencies added by the plugin here. Then add
+ the runtime dependency back to only the runtime configuration manually.
+ */
+ implementation {
+ extendsFrom = extendsFrom.findAll { it != configurations.antlr }
+ }
+ }
+
+ dependencies {
+ implementation("org.scala-lang.modules:scala-collection-compat_${scalaVersion}:${libs.versions.scala.collection.compat.get()}")
+ implementation libs.roaringbitmap
+
+ compileOnly "org.scala-lang:scala-library"
+ compileOnly project(path: ':iceberg-bundled-guava', configuration: 'shadow')
+ compileOnly project(':iceberg-api')
+ compileOnly project(':iceberg-core')
+ compileOnly project(':iceberg-common')
+ compileOnly project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}")
+ compileOnly("org.apache.spark:spark-hive_${scalaVersion}:${libs.versions.spark41.get()}") {
+ exclude group: 'org.apache.avro', module: 'avro'
+ exclude group: 'org.apache.arrow'
+ exclude group: 'org.apache.parquet'
+ // to make sure netty libs only come from project(':iceberg-arrow')
+ exclude group: 'io.netty', module: 'netty-buffer'
+ exclude group: 'io.netty', module: 'netty-common'
+ exclude group: 'org.roaringbitmap'
+ }
+ compileOnly libs.errorprone.annotations
+
+ testImplementation project(path: ':iceberg-data')
+ testImplementation project(path: ':iceberg-parquet')
+ testImplementation project(path: ':iceberg-hive-metastore')
+ testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts')
+ testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+ testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts')
+ testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) {
+ transitive = false
+ }
+ // runtime dependencies for running REST Catalog based integration test
+ testRuntimeOnly libs.jetty.servlet
+ testRuntimeOnly libs.sqlite.jdbc
+
+ testImplementation libs.avro.avro
+ testImplementation libs.parquet.hadoop
+ testImplementation libs.awaitility
+ // TODO: datafusion-comet Spark 4.1 support
+ testImplementation "org.apache.datafusion:comet-spark-spark4.0_2.13:${libs.versions.comet.get()}"
+ testImplementation(testFixtures(project(':iceberg-parquet')))
+
+ // Required because we remove antlr plugin dependencies from the compile configuration, see note above
+ runtimeOnly libs.antlr.runtime413
+ antlr libs.antlr.antlr413
+ }
+
+ test {
+ useJUnitPlatform()
+ }
+
+ generateGrammarSource {
+ maxHeapSize = "64m"
+ arguments += ['-visitor', '-package', 'org.apache.spark.sql.catalyst.parser.extensions']
+ }
+}
+
+project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersion}") {
+ apply plugin: 'com.gradleup.shadow'
+
+ tasks.jar.dependsOn tasks.shadowJar
+
+ sourceSets {
+ integration {
+ java.srcDir "$projectDir/src/integration/java"
+ resources.srcDir "$projectDir/src/integration/resources"
+ }
+ }
+
+ configurations {
+ implementation {
+ exclude group: 'org.apache.spark'
+ // included in Spark
+ exclude group: 'org.slf4j'
+ exclude group: 'org.apache.commons'
+ exclude group: 'commons-pool'
+ exclude group: 'commons-codec'
+ exclude group: 'org.xerial.snappy'
+ exclude group: 'javax.xml.bind'
+ exclude group: 'javax.annotation'
+ exclude group: 'com.github.luben'
+ exclude group: 'com.ibm.icu'
+ exclude group: 'org.glassfish'
+ exclude group: 'org.abego.treelayout'
+ exclude group: 'org.antlr'
+ exclude group: 'org.scala-lang'
+ exclude group: 'org.scala-lang.modules'
+ }
+ }
+
+ dependencies {
+ api project(':iceberg-api')
+ implementation project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}")
+ implementation project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}")
+ implementation project(':iceberg-aws')
+ implementation project(':iceberg-azure')
+ implementation(project(':iceberg-aliyun')) {
+ exclude group: 'edu.umd.cs.findbugs', module: 'findbugs'
+ exclude group: 'org.apache.httpcomponents', module: 'httpclient'
+ exclude group: 'commons-logging', module: 'commons-logging'
+ }
+ implementation project(':iceberg-gcp')
+ implementation project(':iceberg-bigquery')
+ implementation project(':iceberg-hive-metastore')
+ implementation(project(':iceberg-nessie')) {
+ exclude group: 'com.google.code.findbugs', module: 'jsr305'
+ }
+ implementation (project(':iceberg-snowflake')) {
+ exclude group: 'net.snowflake' , module: 'snowflake-jdbc'
+ }
+
+ integrationImplementation "org.scala-lang.modules:scala-collection-compat_${scalaVersion}:${libs.versions.scala.collection.compat.get()}"
+ integrationImplementation "org.apache.spark:spark-hive_${scalaVersion}:${libs.versions.spark41.get()}"
+ integrationImplementation libs.junit.jupiter
+ integrationImplementation libs.junit.platform.launcher
+ integrationImplementation libs.slf4j.simple
+ integrationImplementation libs.assertj.core
+ integrationImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
+ integrationImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts')
+ integrationImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts')
+ integrationImplementation project(path: ":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts')
+
+ // runtime dependencies for running Hive Catalog based integration test
+ integrationRuntimeOnly project(':iceberg-hive-metastore')
+ // runtime dependencies for running REST Catalog based integration test
+ integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts')
+ integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) {
+ transitive = false
+ }
+ integrationRuntimeOnly libs.jetty.servlet
+ integrationRuntimeOnly libs.sqlite.jdbc
+
+ // Not allowed on our classpath, only the runtime jar is allowed
+ integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}")
+ integrationCompileOnly project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}")
+ integrationCompileOnly project(':iceberg-api')
+ }
+
+ shadowJar {
+ configurations = [project.configurations.runtimeClasspath]
+
+ zip64 true
+
+ // include the LICENSE and NOTICE files for the shaded Jar
+ from(projectDir) {
+ include 'LICENSE'
+ include 'NOTICE'
+ }
+
+ // Relocate dependencies to avoid conflicts
+ relocate 'com.google.errorprone', 'org.apache.iceberg.shaded.com.google.errorprone'
+ relocate 'com.google.flatbuffers', 'org.apache.iceberg.shaded.com.google.flatbuffers'
+ relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml'
+ relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes'
+ relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework'
+ relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro'
+ relocate 'avro.shaded', 'org.apache.iceberg.shaded.org.apache.avro.shaded'
+ relocate 'com.thoughtworks.paranamer', 'org.apache.iceberg.shaded.com.thoughtworks.paranamer'
+ relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet'
+ relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded'
+ relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc'
+ relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift'
+ relocate 'org.apache.hc.client5', 'org.apache.iceberg.shaded.org.apache.hc.client5'
+ relocate 'org.apache.hc.core5', 'org.apache.iceberg.shaded.org.apache.hc.core5'
+ // relocate Arrow and related deps to shade Iceberg specific version
+ relocate 'io.netty', 'org.apache.iceberg.shaded.io.netty'
+ relocate 'org.apache.arrow', 'org.apache.iceberg.shaded.org.apache.arrow'
+ relocate 'com.carrotsearch', 'org.apache.iceberg.shaded.com.carrotsearch'
+ relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra'
+ relocate 'org.roaringbitmap', 'org.apache.iceberg.shaded.org.roaringbitmap'
+ relocate 'org.apache.datasketches', 'org.apache.iceberg.shaded.org.apache.datasketches'
+
+ archiveClassifier.set(null)
+ }
+
+ task integrationTest(type: Test) {
+ useJUnitPlatform()
+ description = "Test Spark3 Runtime Jar against Spark ${sparkMajorVersion}"
+ group = "verification"
+ jvmArgs += project.property('extraJvmArgs')
+ testClassesDirs = sourceSets.integration.output.classesDirs
+ classpath = sourceSets.integration.runtimeClasspath + files(shadowJar.archiveFile.get().asFile.path)
+ inputs.file(shadowJar.archiveFile.get().asFile.path)
+ }
+ integrationTest.dependsOn shadowJar
+ check.dependsOn integrationTest
+
+ jar {
+ enabled = false
+ }
+}
+
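For context, a hedged sketch of how the shaded runtime jar produced by the module above would be attached to a Spark 4.1 session once the artifact is published; the coordinate and version placeholder are illustrative, while the extensions and catalog settings mirror the benchmark configuration later in this PR:

  # <iceberg-version> is a placeholder, not a released version
  spark-sql \
    --packages org.apache.iceberg:iceberg-spark-runtime-4.1_2.13:<iceberg-version> \
    --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
    --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
    --conf spark.sql.catalog.spark_catalog.type=hadoop \
    --conf spark.sql.catalog.spark_catalog.warehouse=/tmp/iceberg-warehouse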
diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java
new file mode 100644
index 000000000000..a8b226ea1e37
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import com.google.errorprone.annotations.FormatMethod;
+import com.google.errorprone.annotations.FormatString;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkSessionCatalog;
+import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions;
+import org.apache.iceberg.util.ThreadPools;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * A benchmark that evaluates the delete file index build and lookup performance.
+ *
+ * To run this benchmark for spark-4.1:
+ * ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-extensions-4.1_2.13:jmh
+ * -PjmhIncludeRegex=DeleteFileIndexBenchmark
+ * -PjmhOutputPath=benchmark/iceberg-delete-file-index-benchmark.txt
+ *
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 10)
+@Timeout(time = 20, timeUnit = TimeUnit.MINUTES)
+@BenchmarkMode(Mode.SingleShotTime)
+public class DeleteFileIndexBenchmark {
+
+ private static final String TABLE_NAME = "test_table";
+ private static final String PARTITION_COLUMN = "ss_ticket_number";
+
+ private static final int NUM_PARTITIONS = 50;
+ private static final int NUM_DATA_FILES_PER_PARTITION = 50_000;
+ private static final int NUM_DELETE_FILES_PER_PARTITION = 100;
+
+ private final Configuration hadoopConf = new Configuration();
+ private SparkSession spark;
+ private Table table;
+
+ private List<DataFile> dataFiles;
+
+ @Param({"partition", "file", "dv"})
+ private String type;
+
+ @Setup
+ public void setupBenchmark() throws NoSuchTableException, ParseException {
+ setupSpark();
+ initTable();
+ initDataAndDeletes();
+ loadDataFiles();
+ }
+
+ private void initDataAndDeletes() {
+ if (type.equals("partition")) {
+ initDataAndPartitionScopedDeletes();
+ } else if (type.equals("file")) {
+ initDataAndFileScopedDeletes();
+ } else {
+ initDataAndDVs();
+ }
+ }
+
+ @TearDown
+ public void tearDownBenchmark() {
+ dropTable();
+ tearDownSpark();
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void buildIndexAndLookup(Blackhole blackhole) {
+ DeleteFileIndex deletes = buildDeletes();
+ for (DataFile dataFile : dataFiles) {
+ DeleteFile[] deleteFiles = deletes.forDataFile(dataFile.dataSequenceNumber(), dataFile);
+ blackhole.consume(deleteFiles);
+ }
+ }
+
+ private void loadDataFiles() {
+ table.refresh();
+
+ Snapshot snapshot = table.currentSnapshot();
+
+ ManifestGroup manifestGroup =
+ new ManifestGroup(table.io(), snapshot.dataManifests(table.io()), ImmutableList.of());
+
+ try (CloseableIterable<ManifestEntry<DataFile>> entries = manifestGroup.entries()) {
+ List<DataFile> files = Lists.newArrayList();
+ for (ManifestEntry<DataFile> entry : entries) {
+ files.add(entry.file().copyWithoutStats());
+ }
+ this.dataFiles = files;
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ private DeleteFileIndex buildDeletes() {
+ table.refresh();
+
+ List<ManifestFile> deleteManifests = table.currentSnapshot().deleteManifests(table.io());
+
+ return DeleteFileIndex.builderFor(table.io(), deleteManifests)
+ .specsById(table.specs())
+ .planWith(ThreadPools.getWorkerPool())
+ .build();
+ }
+
+ private void initDataAndPartitionScopedDeletes() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = FileGenerationUtil.generateDataFile(table, partition);
+ rowDelta.addRows(dataFile);
+ }
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DELETE_FILES_PER_PARTITION; fileOrdinal++) {
+ DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, partition);
+ rowDelta.addDeletes(deleteFile);
+ }
+
+ rowDelta.commit();
+ }
+ }
+
+ private void initDataAndFileScopedDeletes() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = FileGenerationUtil.generateDataFile(table, partition);
+ DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, dataFile);
+ rowDelta.addRows(dataFile);
+ rowDelta.addDeletes(deleteFile);
+ }
+
+ rowDelta.commit();
+ }
+ }
+
+ private void initDataAndDVs() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = FileGenerationUtil.generateDataFile(table, partition);
+ DeleteFile dv = FileGenerationUtil.generateDV(table, dataFile);
+ rowDelta.addRows(dataFile);
+ rowDelta.addDeletes(dv);
+ }
+
+ rowDelta.commit();
+ }
+ }
+
+ private void setupSpark() {
+ this.spark =
+ SparkSession.builder()
+ .config("spark.ui.enabled", false)
+ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName())
+ .config("spark.sql.catalog.spark_catalog.type", "hadoop")
+ .config("spark.sql.catalog.spark_catalog.warehouse", newWarehouseDir())
+ .master("local[*]")
+ .getOrCreate();
+ }
+
+ private void tearDownSpark() {
+ spark.stop();
+ }
+
+ private void initTable() throws NoSuchTableException, ParseException {
+ sql(
+ "CREATE TABLE %s ( "
+ + " `ss_sold_date_sk` INT, "
+ + " `ss_sold_time_sk` INT, "
+ + " `ss_item_sk` INT, "
+ + " `ss_customer_sk` STRING, "
+ + " `ss_cdemo_sk` STRING, "
+ + " `ss_hdemo_sk` STRING, "
+ + " `ss_addr_sk` STRING, "
+ + " `ss_store_sk` STRING, "
+ + " `ss_promo_sk` STRING, "
+ + " `ss_ticket_number` INT, "
+ + " `ss_quantity` STRING, "
+ + " `ss_wholesale_cost` STRING, "
+ + " `ss_list_price` STRING, "
+ + " `ss_sales_price` STRING, "
+ + " `ss_ext_discount_amt` STRING, "
+ + " `ss_ext_sales_price` STRING, "
+ + " `ss_ext_wholesale_cost` STRING, "
+ + " `ss_ext_list_price` STRING, "
+ + " `ss_ext_tax` STRING, "
+ + " `ss_coupon_amt` STRING, "
+ + " `ss_net_paid` STRING, "
+ + " `ss_net_paid_inc_tax` STRING, "
+ + " `ss_net_profit` STRING "
+ + ")"
+ + "USING iceberg "
+ + "PARTITIONED BY (%s) "
+ + "TBLPROPERTIES ("
+ + " '%s' '%b',"
+ + " '%s' '%s',"
+ + " '%s' '%d')",
+ TABLE_NAME,
+ PARTITION_COLUMN,
+ TableProperties.MANIFEST_MERGE_ENABLED,
+ false,
+ TableProperties.DELETE_MODE,
+ RowLevelOperationMode.MERGE_ON_READ.modeName(),
+ TableProperties.FORMAT_VERSION,
+ type.equals("dv") ? 3 : 2);
+
+ this.table = Spark3Util.loadIcebergTable(spark, TABLE_NAME);
+ }
+
+ private void dropTable() {
+ sql("DROP TABLE IF EXISTS %s PURGE", TABLE_NAME);
+ }
+
+ private String newWarehouseDir() {
+ return hadoopConf.get("hadoop.tmp.dir") + UUID.randomUUID();
+ }
+
+ @FormatMethod
+ private void sql(@FormatString String query, Object... args) {
+ spark.sql(String.format(query, args));
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java
new file mode 100644
index 000000000000..eeea81634596
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark;
+
+import static org.apache.spark.sql.functions.current_date;
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+
+import com.google.errorprone.annotations.FormatMethod;
+import com.google.errorprone.annotations.FormatString;
+import java.util.UUID;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.DistributionMode;
+import org.apache.iceberg.RowLevelOperationMode;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.apache.spark.sql.internal.SQLConf;
+import org.apache.spark.sql.types.StructType;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * A benchmark that evaluates the performance of the cardinality check in MERGE operations.
+ *
+ * To run this benchmark for spark-4.1:
+ * ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-extensions-4.1_2.13:jmh
+ * -PjmhIncludeRegex=MergeCardinalityCheckBenchmark
+ * -PjmhOutputPath=benchmark/iceberg-merge-cardinality-check-benchmark.txt
+ *
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@BenchmarkMode(Mode.SingleShotTime)
+public class MergeCardinalityCheckBenchmark {
+
+ private static final String TABLE_NAME = "test_table";
+ private static final int NUM_FILES = 5;
+ private static final int NUM_ROWS_PER_FILE = 1_000_000;
+ private static final int NUM_UNMATCHED_RECORDS_PER_MERGE = 100_000;
+
+ private final Configuration hadoopConf = new Configuration();
+ private SparkSession spark;
+ private long originalSnapshotId;
+
+ @Setup
+ public void setupBenchmark() throws NoSuchTableException, ParseException {
+ setupSpark();
+ initTable();
+ appendData();
+
+ Table table = Spark3Util.loadIcebergTable(spark, TABLE_NAME);
+ this.originalSnapshotId = table.currentSnapshot().snapshotId();
+ }
+
+ @TearDown
+ public void tearDownBenchmark() {
+ tearDownSpark();
+ dropTable();
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteMergeCardinalityCheck10PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.1);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteMergeCardinalityCheck30PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.3);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteMergeCardinalityCheck90PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.9);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void mergeOnReadMergeCardinalityCheck10PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.1);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void mergeOnReadMergeCardinalityCheck30PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.3);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void mergeOnReadMergeCardinalityCheck90PercentUpdates() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.9);
+ }
+
+ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) {
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')",
+ TABLE_NAME, TableProperties.MERGE_MODE, mode.modeName());
+
+ Dataset<Long> insertDataDF = spark.range(-NUM_UNMATCHED_RECORDS_PER_MERGE, 0, 1);
+ Dataset<Long> updateDataDF = spark.range((long) (updatePercentage * NUM_ROWS_PER_FILE));
+ Dataset<Long> sourceDF = updateDataDF.union(insertDataDF);
+ sourceDF.createOrReplaceTempView("source");
+
+ sql(
+ "MERGE INTO %s t USING source s "
+ + "ON t.id = s.id "
+ + "WHEN MATCHED THEN "
+ + " UPDATE SET stringCol = 'invalid' "
+ + "WHEN NOT MATCHED THEN "
+ + " INSERT (id, intCol, floatCol, doubleCol, decimalCol, dateCol, timestampCol, stringCol) "
+ + " VALUES (s.id, null, null, null, null, null, null, 'new')",
+ TABLE_NAME);
+
+ sql(
+ "CALL system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)",
+ TABLE_NAME, originalSnapshotId);
+ }
+
+ private void setupSpark() {
+ this.spark =
+ SparkSession.builder()
+ .config("spark.ui.enabled", false)
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName())
+ .config("spark.sql.catalog.spark_catalog.type", "hadoop")
+ .config("spark.sql.catalog.spark_catalog.warehouse", newWarehouseDir())
+ .config(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false")
+ .config(SQLConf.RUNTIME_ROW_LEVEL_OPERATION_GROUP_FILTER_ENABLED().key(), "false")
+ .config(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false")
+ .config(SQLConf.SHUFFLE_PARTITIONS().key(), "2")
+ .master("local")
+ .getOrCreate();
+ }
+
+ private void tearDownSpark() {
+ spark.stop();
+ }
+
+ private void initTable() {
+ sql(
+ "CREATE TABLE %s ( "
+ + " id LONG, intCol INT, floatCol FLOAT, doubleCol DOUBLE, "
+ + " decimalCol DECIMAL(20, 5), dateCol DATE, timestampCol TIMESTAMP, "
+ + " stringCol STRING)"
+ + "USING iceberg "
+ + "TBLPROPERTIES ("
+ + " '%s' '%s',"
+ + " '%s' '%d',"
+ + " '%s' '%d')",
+ TABLE_NAME,
+ TableProperties.MERGE_DISTRIBUTION_MODE,
+ DistributionMode.NONE.modeName(),
+ TableProperties.SPLIT_OPEN_FILE_COST,
+ Integer.MAX_VALUE,
+ TableProperties.FORMAT_VERSION,
+ 2);
+
+ sql("ALTER TABLE %s WRITE ORDERED BY id", TABLE_NAME);
+ }
+
+ private void dropTable() {
+ sql("DROP TABLE IF EXISTS %s PURGE", TABLE_NAME);
+ }
+
+ private void appendData() throws NoSuchTableException {
+ for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+ Dataset<Row> inputDF =
+ spark
+ .range(NUM_ROWS_PER_FILE)
+ .withColumn("intCol", expr("CAST(id AS INT)"))
+ .withColumn("floatCol", expr("CAST(id AS FLOAT)"))
+ .withColumn("doubleCol", expr("CAST(id AS DOUBLE)"))
+ .withColumn("decimalCol", expr("CAST(id AS DECIMAL(20, 5))"))
+ .withColumn("dateCol", date_add(current_date(), fileNum))
+ .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+ .withColumn("stringCol", expr("CAST(dateCol AS STRING)"));
+ appendAsFile(inputDF);
+ }
+ }
+
+ private void appendAsFile(Dataset<Row> df) throws NoSuchTableException {
+ // ensure the schema is precise (including nullability)
+ StructType sparkSchema = spark.table(TABLE_NAME).schema();
+ spark.createDataFrame(df.rdd(), sparkSchema).coalesce(1).writeTo(TABLE_NAME).append();
+ }
+
+ private String newWarehouseDir() {
+ return hadoopConf.get("hadoop.tmp.dir") + UUID.randomUUID();
+ }
+
+ @FormatMethod
+ private void sql(@FormatString String query, Object... args) {
+ spark.sql(String.format(query, args));
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java
new file mode 100644
index 000000000000..0eff3a847e41
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark;
+
+import static org.apache.iceberg.PlanningMode.DISTRIBUTED;
+import static org.apache.iceberg.PlanningMode.LOCAL;
+
+import com.google.errorprone.annotations.FormatMethod;
+import com.google.errorprone.annotations.FormatString;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.BatchScan;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DeleteFile;
+import org.apache.iceberg.FileGenerationUtil;
+import org.apache.iceberg.PlanningMode;
+import org.apache.iceberg.RowDelta;
+import org.apache.iceberg.RowLevelOperationMode;
+import org.apache.iceberg.ScanTask;
+import org.apache.iceberg.SparkDistributedDataScan;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.TestHelpers;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions;
+import org.apache.iceberg.types.Conversions;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * A benchmark that evaluates the job planning performance.
+ *
+ * To run this benchmark for spark-4.1:
+ * ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-extensions-4.1_2.13:jmh
+ * -PjmhIncludeRegex=PlanningBenchmark
+ * -PjmhOutputPath=benchmark/iceberg-planning-benchmark.txt
+ *
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@Timeout(time = 20, timeUnit = TimeUnit.MINUTES)
+@BenchmarkMode(Mode.SingleShotTime)
+public class PlanningBenchmark {
+
+ private static final String TABLE_NAME = "test_table";
+ private static final String PARTITION_COLUMN = "ss_ticket_number";
+ private static final int PARTITION_VALUE = 10;
+ private static final String SORT_KEY_COLUMN = "ss_sold_date_sk";
+ private static final int SORT_KEY_VALUE = 5;
+
+ private static final Expression SORT_KEY_PREDICATE =
+ Expressions.equal(SORT_KEY_COLUMN, SORT_KEY_VALUE);
+ private static final Expression PARTITION_PREDICATE =
+ Expressions.equal(PARTITION_COLUMN, PARTITION_VALUE);
+ private static final Expression PARTITION_AND_SORT_KEY_PREDICATE =
+ Expressions.and(PARTITION_PREDICATE, SORT_KEY_PREDICATE);
+
+ private static final int NUM_PARTITIONS = 30;
+ private static final int NUM_DATA_FILES_PER_PARTITION = 50_000;
+ private static final int NUM_DELETE_FILES_PER_PARTITION = 50;
+
+ private final Configuration hadoopConf = new Configuration();
+ private SparkSession spark;
+ private Table table;
+
+ @Param({"partition", "file", "dv"})
+ private String type;
+
+ @Setup
+ public void setupBenchmark() throws NoSuchTableException, ParseException {
+ setupSpark();
+ initTable();
+ initDataAndDeletes();
+ }
+
+ @TearDown
+ public void tearDownBenchmark() {
+ dropTable();
+ tearDownSpark();
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localPlanningWithPartitionAndMinMaxFilter(Blackhole blackhole) {
+ BatchScan scan = table.newBatchScan();
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, PARTITION_AND_SORT_KEY_PREDICATE);
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void distributedPlanningWithPartitionAndMinMaxFilter(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(DISTRIBUTED, DISTRIBUTED);
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, PARTITION_AND_SORT_KEY_PREDICATE);
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localPlanningWithMinMaxFilter(Blackhole blackhole) {
+ BatchScan scan = table.newBatchScan();
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, SORT_KEY_PREDICATE);
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void distributedPlanningWithMinMaxFilter(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(DISTRIBUTED, DISTRIBUTED);
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, SORT_KEY_PREDICATE);
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localPlanningWithoutFilter(Blackhole blackhole) {
+ BatchScan scan = table.newBatchScan();
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void distributedPlanningWithoutFilter(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(DISTRIBUTED, DISTRIBUTED);
+ List<ScanTask> fileTasks = planFilesWithoutColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localPlanningWithoutFilterWithStats(Blackhole blackhole) {
+ BatchScan scan = table.newBatchScan();
+ List<ScanTask> fileTasks = planFilesWithColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void distributedPlanningWithoutFilterWithStats(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(DISTRIBUTED, DISTRIBUTED);
+ List<ScanTask> fileTasks = planFilesWithColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void distributedDataLocalDeletesPlanningWithoutFilterWithStats(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(DISTRIBUTED, LOCAL);
+ List<ScanTask> fileTasks = planFilesWithColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localDataDistributedDeletesPlanningWithoutFilterWithStats(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(LOCAL, DISTRIBUTED);
+ List<ScanTask> fileTasks = planFilesWithColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blackhole) {
+ BatchScan scan = newDistributedScan(LOCAL, LOCAL);
+ List<ScanTask> fileTasks = planFilesWithColumnStats(scan, Expressions.alwaysTrue());
+ blackhole.consume(fileTasks);
+ }
+
+ private void setupSpark() {
+ this.spark =
+ SparkSession.builder()
+ .config("spark.ui.enabled", false)
+ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+ .config("spark.driver.maxResultSize", "8G")
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName())
+ .config("spark.sql.catalog.spark_catalog.type", "hadoop")
+ .config("spark.sql.catalog.spark_catalog.warehouse", newWarehouseDir())
+ .master("local[*]")
+ .getOrCreate();
+ }
+
+ private void tearDownSpark() {
+ spark.stop();
+ }
+
+ private void initTable() throws NoSuchTableException, ParseException {
+ sql(
+ "CREATE TABLE %s ( "
+ + " `ss_sold_date_sk` INT, "
+ + " `ss_sold_time_sk` INT, "
+ + " `ss_item_sk` INT, "
+ + " `ss_customer_sk` STRING, "
+ + " `ss_cdemo_sk` STRING, "
+ + " `ss_hdemo_sk` STRING, "
+ + " `ss_addr_sk` STRING, "
+ + " `ss_store_sk` STRING, "
+ + " `ss_promo_sk` STRING, "
+ + " `ss_ticket_number` INT, "
+ + " `ss_quantity` STRING, "
+ + " `ss_wholesale_cost` STRING, "
+ + " `ss_list_price` STRING, "
+ + " `ss_sales_price` STRING, "
+ + " `ss_ext_discount_amt` STRING, "
+ + " `ss_ext_sales_price` STRING, "
+ + " `ss_ext_wholesale_cost` STRING, "
+ + " `ss_ext_list_price` STRING, "
+ + " `ss_ext_tax` STRING, "
+ + " `ss_coupon_amt` STRING, "
+ + " `ss_net_paid` STRING, "
+ + " `ss_net_paid_inc_tax` STRING, "
+ + " `ss_net_profit` STRING "
+ + ")"
+ + "USING iceberg "
+ + "PARTITIONED BY (%s) "
+ + "TBLPROPERTIES ("
+ + " '%s' '%b',"
+ + " '%s' '%s',"
+ + " '%s' '%d')",
+ TABLE_NAME,
+ PARTITION_COLUMN,
+ TableProperties.MANIFEST_MERGE_ENABLED,
+ false,
+ TableProperties.DELETE_MODE,
+ RowLevelOperationMode.MERGE_ON_READ.modeName(),
+ TableProperties.FORMAT_VERSION,
+ type.equals("dv") ? 3 : 2);
+
+ this.table = Spark3Util.loadIcebergTable(spark, TABLE_NAME);
+ }
+
+ private void dropTable() {
+ sql("DROP TABLE IF EXISTS %s PURGE", TABLE_NAME);
+ }
+
+ private void initDataAndDeletes() {
+ if (type.equals("partition")) {
+ initDataAndPartitionScopedDeletes();
+ } else if (type.equals("file")) {
+ initDataAndFileScopedDeletes();
+ } else {
+ initDataAndDVs();
+ }
+ }
+
+ private void initDataAndPartitionScopedDeletes() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = generateDataFile(partition, Integer.MIN_VALUE, Integer.MIN_VALUE);
+ rowDelta.addRows(dataFile);
+ }
+
+ // add one data file that would match the sort key predicate
+ DataFile sortKeyDataFile = generateDataFile(partition, SORT_KEY_VALUE, SORT_KEY_VALUE);
+ rowDelta.addRows(sortKeyDataFile);
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DELETE_FILES_PER_PARTITION; fileOrdinal++) {
+ DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, partition);
+ rowDelta.addDeletes(deleteFile);
+ }
+
+ rowDelta.commit();
+ }
+ }
+
+ private void initDataAndFileScopedDeletes() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = generateDataFile(partition, Integer.MIN_VALUE, Integer.MIN_VALUE);
+ DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, dataFile);
+ rowDelta.addRows(dataFile);
+ rowDelta.addDeletes(deleteFile);
+ }
+
+ // add one data file that would match the sort key predicate
+ DataFile sortKeyDataFile = generateDataFile(partition, SORT_KEY_VALUE, SORT_KEY_VALUE);
+ rowDelta.addRows(sortKeyDataFile);
+
+ rowDelta.commit();
+ }
+ }
+
+ private void initDataAndDVs() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = generateDataFile(partition, Integer.MIN_VALUE, Integer.MIN_VALUE);
+ DeleteFile dv = FileGenerationUtil.generateDV(table, dataFile);
+ rowDelta.addRows(dataFile);
+ rowDelta.addDeletes(dv);
+ }
+
+ // add one data file that would match the sort key predicate
+ DataFile sortKeyDataFile = generateDataFile(partition, SORT_KEY_VALUE, SORT_KEY_VALUE);
+ rowDelta.addRows(sortKeyDataFile);
+
+ rowDelta.commit();
+ }
+ }
+
+ private DataFile generateDataFile(StructLike partition, int sortKeyMin, int sortKeyMax) {
+ int sortKeyFieldId = table.schema().findField(SORT_KEY_COLUMN).fieldId();
+ ByteBuffer lower = Conversions.toByteBuffer(Types.IntegerType.get(), sortKeyMin);
+ Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(sortKeyFieldId, lower);
+ ByteBuffer upper = Conversions.toByteBuffer(Types.IntegerType.get(), sortKeyMax);
+ Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(sortKeyFieldId, upper);
+ return FileGenerationUtil.generateDataFile(table, partition, lowerBounds, upperBounds);
+ }
+
+ private String newWarehouseDir() {
+ return hadoopConf.get("hadoop.tmp.dir") + UUID.randomUUID();
+ }
+
+ private List<ScanTask> planFilesWithoutColumnStats(BatchScan scan, Expression predicate) {
+ return planFiles(scan, predicate, false);
+ }
+
+ private List<ScanTask> planFilesWithColumnStats(BatchScan scan, Expression predicate) {
+ return planFiles(scan, predicate, true);
+ }
+
+ private List<ScanTask> planFiles(BatchScan scan, Expression predicate, boolean withColumnStats) {
+ table.refresh();
+
+ BatchScan configuredScan = scan.filter(predicate);
+
+ if (withColumnStats) {
+ configuredScan = scan.includeColumnStats();
+ }
+
+ try (CloseableIterable<ScanTask> fileTasks = configuredScan.planFiles()) {
+ return Lists.newArrayList(fileTasks);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ private BatchScan newDistributedScan(PlanningMode dataMode, PlanningMode deleteMode) {
+ table
+ .updateProperties()
+ .set(TableProperties.DATA_PLANNING_MODE, dataMode.modeName())
+ .set(TableProperties.DELETE_PLANNING_MODE, deleteMode.modeName())
+ .commit();
+ SparkReadConf readConf = new SparkReadConf(spark, table, ImmutableMap.of());
+ return new SparkDistributedDataScan(spark, table, readConf);
+ }
+
+ @FormatMethod
+ private void sql(@FormatString String query, Object... args) {
+ spark.sql(String.format(query, args));
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java
new file mode 100644
index 000000000000..45c95bf99741
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark;
+
+import com.google.errorprone.annotations.FormatMethod;
+import com.google.errorprone.annotations.FormatString;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DeleteFile;
+import org.apache.iceberg.FileGenerationUtil;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.Partitioning;
+import org.apache.iceberg.RowDelta;
+import org.apache.iceberg.RowLevelOperationMode;
+import org.apache.iceberg.ScanTaskGroup;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.TestHelpers;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions;
+import org.apache.iceberg.util.TableScanUtil;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Timeout;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * A benchmark that evaluates the task group planning performance.
+ *
+ * To run this benchmark for spark-4.1:
+ * ./gradlew -DsparkVersions=4.1 :iceberg-spark:iceberg-spark-extensions-4.1_2.13:jmh
+ * -PjmhIncludeRegex=TaskGroupPlanningBenchmark
+ * -PjmhOutputPath=benchmark/iceberg-task-group-planning-benchmark.txt
+ *
+ */
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@Timeout(time = 30, timeUnit = TimeUnit.MINUTES)
+@BenchmarkMode(Mode.SingleShotTime)
+public class TaskGroupPlanningBenchmark {
+
+ private static final String TABLE_NAME = "test_table";
+ private static final String PARTITION_COLUMN = "ss_ticket_number";
+
+ private static final int NUM_PARTITIONS = 150;
+ private static final int NUM_DATA_FILES_PER_PARTITION = 50_000;
+ private static final int NUM_DELETE_FILES_PER_PARTITION = 25;
+
+ private final Configuration hadoopConf = new Configuration();
+ private SparkSession spark;
+ private Table table;
+
+  private List<FileScanTask> fileTasks;
+
+ @Setup
+ public void setupBenchmark() throws NoSuchTableException, ParseException {
+ setupSpark();
+ initTable();
+ initDataAndDeletes();
+ loadFileTasks();
+ }
+
+ @TearDown
+ public void tearDownBenchmark() {
+ dropTable();
+ tearDownSpark();
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void planTaskGroups(Blackhole blackhole) {
+ SparkReadConf readConf = new SparkReadConf(spark, table, ImmutableMap.of());
+    List<ScanTaskGroup<FileScanTask>> taskGroups =
+ TableScanUtil.planTaskGroups(
+ fileTasks,
+ readConf.splitSize(),
+ readConf.splitLookback(),
+ readConf.splitOpenFileCost());
+
+ long rowsCount = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ rowsCount += taskGroup.estimatedRowsCount();
+ }
+ blackhole.consume(rowsCount);
+
+ long filesCount = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ filesCount += taskGroup.filesCount();
+ }
+ blackhole.consume(filesCount);
+
+ long sizeBytes = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ sizeBytes += taskGroup.sizeBytes();
+ }
+ blackhole.consume(sizeBytes);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void planTaskGroupsWithGrouping(Blackhole blackhole) {
+ SparkReadConf readConf = new SparkReadConf(spark, table, ImmutableMap.of());
+
+    List<ScanTaskGroup<FileScanTask>> taskGroups =
+ TableScanUtil.planTaskGroups(
+ fileTasks,
+ readConf.splitSize(),
+ readConf.splitLookback(),
+ readConf.splitOpenFileCost(),
+ Partitioning.groupingKeyType(table.schema(), table.specs().values()));
+
+ long rowsCount = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ rowsCount += taskGroup.estimatedRowsCount();
+ }
+ blackhole.consume(rowsCount);
+
+ long filesCount = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ filesCount += taskGroup.filesCount();
+ }
+ blackhole.consume(filesCount);
+
+ long sizeBytes = 0L;
+    for (ScanTaskGroup<FileScanTask> taskGroup : taskGroups) {
+ sizeBytes += taskGroup.sizeBytes();
+ }
+ blackhole.consume(sizeBytes);
+ }
+
+ private void loadFileTasks() {
+ table.refresh();
+
+    try (CloseableIterable<FileScanTask> fileTasksIterable = table.newScan().planFiles()) {
+ this.fileTasks = Lists.newArrayList(fileTasksIterable);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ private void initDataAndDeletes() {
+ for (int partitionOrdinal = 0; partitionOrdinal < NUM_PARTITIONS; partitionOrdinal++) {
+ StructLike partition = TestHelpers.Row.of(partitionOrdinal);
+
+ RowDelta rowDelta = table.newRowDelta();
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DATA_FILES_PER_PARTITION; fileOrdinal++) {
+ DataFile dataFile = FileGenerationUtil.generateDataFile(table, partition);
+ rowDelta.addRows(dataFile);
+ }
+
+ for (int fileOrdinal = 0; fileOrdinal < NUM_DELETE_FILES_PER_PARTITION; fileOrdinal++) {
+ DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, partition);
+ rowDelta.addDeletes(deleteFile);
+ }
+
+ rowDelta.commit();
+ }
+ }
+
+ private void setupSpark() {
+ this.spark =
+ SparkSession.builder()
+ .config("spark.ui.enabled", false)
+ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName())
+ .config("spark.sql.catalog.spark_catalog.type", "hadoop")
+ .config("spark.sql.catalog.spark_catalog.warehouse", newWarehouseDir())
+ .master("local[*]")
+ .getOrCreate();
+ }
+
+ private void tearDownSpark() {
+ spark.stop();
+ }
+
+ private void initTable() throws NoSuchTableException, ParseException {
+ sql(
+ "CREATE TABLE %s ( "
+ + " `ss_sold_date_sk` INT, "
+ + " `ss_sold_time_sk` INT, "
+ + " `ss_item_sk` INT, "
+ + " `ss_customer_sk` STRING, "
+ + " `ss_cdemo_sk` STRING, "
+ + " `ss_hdemo_sk` STRING, "
+ + " `ss_addr_sk` STRING, "
+ + " `ss_store_sk` STRING, "
+ + " `ss_promo_sk` STRING, "
+ + " `ss_ticket_number` INT, "
+ + " `ss_quantity` STRING, "
+ + " `ss_wholesale_cost` STRING, "
+ + " `ss_list_price` STRING, "
+ + " `ss_sales_price` STRING, "
+ + " `ss_ext_discount_amt` STRING, "
+ + " `ss_ext_sales_price` STRING, "
+ + " `ss_ext_wholesale_cost` STRING, "
+ + " `ss_ext_list_price` STRING, "
+ + " `ss_ext_tax` STRING, "
+ + " `ss_coupon_amt` STRING, "
+ + " `ss_net_paid` STRING, "
+ + " `ss_net_paid_inc_tax` STRING, "
+ + " `ss_net_profit` STRING "
+ + ")"
+ + "USING iceberg "
+ + "PARTITIONED BY (%s) "
+ + "TBLPROPERTIES ("
+ + " '%s' '%b',"
+ + " '%s' '%s',"
+ + " '%s' '%d')",
+ TABLE_NAME,
+ PARTITION_COLUMN,
+ TableProperties.MANIFEST_MERGE_ENABLED,
+ false,
+ TableProperties.DELETE_MODE,
+ RowLevelOperationMode.MERGE_ON_READ.modeName(),
+ TableProperties.FORMAT_VERSION,
+ 2);
+
+ this.table = Spark3Util.loadIcebergTable(spark, TABLE_NAME);
+ }
+
+ private void dropTable() {
+ sql("DROP TABLE IF EXISTS %s PURGE", TABLE_NAME);
+ }
+
+ private String newWarehouseDir() {
+ return hadoopConf.get("hadoop.tmp.dir") + UUID.randomUUID();
+ }
+
+ @FormatMethod
+ private void sql(@FormatString String query, Object... args) {
+ spark.sql(String.format(query, args));
+ }
+}
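
Note for readers of this patch: the benchmark above feeds SparkReadConf's split size, lookback, and open-file cost into TableScanUtil.planTaskGroups. A minimal sketch of tuning the same values per read, assuming Iceberg's read option names ("split-size", "lookback", "file-open-cost") and a hypothetical table name:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SplitTuningExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    // Larger splits and a higher open-file cost generally yield fewer, bigger task groups.
    Dataset<Row> df =
        spark.read()
            .format("iceberg")
            .option("split-size", String.valueOf(256L * 1024 * 1024))   // target split size in bytes
            .option("lookback", "10")                                   // bin-packing lookback
            .option("file-open-cost", String.valueOf(4L * 1024 * 1024)) // cost charged per opened file
            .load("spark_catalog.default.test_table");
    df.show();
  }
}
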
diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java
new file mode 100644
index 000000000000..d917eae5eb0f
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark;
+
+import static org.apache.spark.sql.functions.current_date;
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+
+import com.google.errorprone.annotations.FormatMethod;
+import com.google.errorprone.annotations.FormatString;
+import java.util.UUID;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.DistributionMode;
+import org.apache.iceberg.RowLevelOperationMode;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.apache.spark.sql.internal.SQLConf;
+import org.apache.spark.sql.types.StructType;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+@Fork(1)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@BenchmarkMode(Mode.SingleShotTime)
+public class UpdateProjectionBenchmark {
+
+ private static final String TABLE_NAME = "test_table";
+ private static final int NUM_FILES = 5;
+ private static final int NUM_ROWS_PER_FILE = 1_000_000;
+
+ private final Configuration hadoopConf = new Configuration();
+ private SparkSession spark;
+ private long originalSnapshotId;
+
+ @Setup
+ public void setupBenchmark() throws NoSuchTableException, ParseException {
+ setupSpark();
+ initTable();
+ appendData();
+
+ Table table = Spark3Util.loadIcebergTable(spark, TABLE_NAME);
+ this.originalSnapshotId = table.currentSnapshot().snapshotId();
+ }
+
+ @TearDown
+ public void tearDownBenchmark() {
+ tearDownSpark();
+ dropTable();
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteUpdate10Percent() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.1);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteUpdate30Percent() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.3);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void copyOnWriteUpdate75Percent() {
+ runBenchmark(RowLevelOperationMode.COPY_ON_WRITE, 0.75);
+ }
+
+ @Benchmark
+ @Threads(1)
+  public void mergeOnReadUpdate10Percent() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.1);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void mergeOnReadUpdate30Percent() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.3);
+ }
+
+ @Benchmark
+ @Threads(1)
+ public void mergeOnReadUpdate75Percent() {
+ runBenchmark(RowLevelOperationMode.MERGE_ON_READ, 0.75);
+ }
+
+ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) {
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')",
+ TABLE_NAME, TableProperties.UPDATE_MODE, mode.modeName());
+
+ int mod = (int) (NUM_ROWS_PER_FILE / (NUM_ROWS_PER_FILE * updatePercentage));
+
+ sql(
+ "UPDATE %s "
+ + "SET intCol = intCol + 10, dateCol = date_add(dateCol, 1) "
+ + "WHERE mod(id, %d) = 0",
+ TABLE_NAME, mod);
+
+ sql(
+ "CALL system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)",
+ TABLE_NAME, originalSnapshotId);
+ }
+
+ private void setupSpark() {
+ this.spark =
+ SparkSession.builder()
+ .config("spark.ui.enabled", false)
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName())
+ .config("spark.sql.catalog.spark_catalog.type", "hadoop")
+ .config("spark.sql.catalog.spark_catalog.warehouse", newWarehouseDir())
+ .config(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false")
+ .config(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false")
+ .config(SQLConf.SHUFFLE_PARTITIONS().key(), "2")
+ .master("local")
+ .getOrCreate();
+ }
+
+ private void tearDownSpark() {
+ spark.stop();
+ }
+
+ private void initTable() {
+ sql(
+ "CREATE TABLE %s ( "
+ + " id LONG, intCol INT, floatCol FLOAT, doubleCol DOUBLE, "
+ + " decimalCol DECIMAL(20, 5), dateCol DATE, timestampCol TIMESTAMP, "
+ + " stringCol STRING)"
+ + "USING iceberg "
+ + "TBLPROPERTIES ("
+ + " '%s' '%s',"
+ + " '%s' '%d',"
+ + " '%s' '%d')",
+ TABLE_NAME,
+ TableProperties.UPDATE_DISTRIBUTION_MODE,
+ DistributionMode.NONE.modeName(),
+ TableProperties.SPLIT_OPEN_FILE_COST,
+ Integer.MAX_VALUE,
+ TableProperties.FORMAT_VERSION,
+ 2);
+
+ sql("ALTER TABLE %s WRITE ORDERED BY id", TABLE_NAME);
+ }
+
+ private void dropTable() {
+ sql("DROP TABLE IF EXISTS %s PURGE", TABLE_NAME);
+ }
+
+ private void appendData() throws NoSuchTableException {
+ for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
+      Dataset<Row> inputDF =
+ spark
+ .range(NUM_ROWS_PER_FILE)
+ .withColumn("intCol", expr("CAST(id AS INT)"))
+ .withColumn("floatCol", expr("CAST(id AS FLOAT)"))
+ .withColumn("doubleCol", expr("CAST(id AS DOUBLE)"))
+ .withColumn("decimalCol", expr("CAST(id AS DECIMAL(20, 5))"))
+ .withColumn("dateCol", date_add(current_date(), fileNum))
+ .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+ .withColumn("stringCol", expr("CAST(dateCol AS STRING)"));
+ appendAsFile(inputDF);
+ }
+ }
+
+  private void appendAsFile(Dataset<Row> df) throws NoSuchTableException {
+ // ensure the schema is precise (including nullability)
+ StructType sparkSchema = spark.table(TABLE_NAME).schema();
+ spark.createDataFrame(df.rdd(), sparkSchema).coalesce(1).writeTo(TABLE_NAME).append();
+ }
+
+ private String newWarehouseDir() {
+ return hadoopConf.get("hadoop.tmp.dir") + UUID.randomUUID();
+ }
+
+ @FormatMethod
+ private void sql(@FormatString String query, Object... args) {
+ spark.sql(String.format(query, args));
+ }
+}
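
For readers comparing the copy-on-write and merge-on-read runs above, a minimal sketch of switching the mode on an existing table; it assumes a session and catalog configured as in the benchmark's setupSpark, a hypothetical table name, and the property key behind TableProperties.UPDATE_MODE:

import org.apache.spark.sql.SparkSession;

public class UpdateModeExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active(); // assumes an already-configured session
    // merge-on-read writes delete files instead of rewriting the affected data files
    spark.sql("ALTER TABLE test_table SET TBLPROPERTIES ('write.update.mode' 'merge-on-read')");
    spark.sql("UPDATE test_table SET intCol = intCol + 10 WHERE mod(id, 10) = 0");
  }
}
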
diff --git a/spark/v4.1/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4 b/spark/v4.1/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4
new file mode 100644
index 000000000000..4c2a16d7b19a
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * This file is an adaptation of Presto's and Spark's grammar files.
+ */
+
+grammar IcebergSqlExtensions;
+
+@lexer::members {
+ /**
+ * Verify whether current token is a valid decimal token (which contains dot).
+ * Returns true if the character that follows the token is not a digit or letter or underscore.
+ *
+ * For example:
+ * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
+ * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
+ * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
+ * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed
+ * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
+ * which is not a digit or letter or underscore.
+ */
+ public boolean isValidDecimal() {
+ int nextChar = _input.LA(1);
+ if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
+ nextChar == '_') {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ /**
+ * This method will be called when we see '/*' and try to match it as a bracketed comment.
+ * If the next character is '+', it should be parsed as hint later, and we cannot match
+ * it as a bracketed comment.
+ *
+ * Returns true if the next character is '+'.
+ */
+ public boolean isHint() {
+ int nextChar = _input.LA(1);
+ if (nextChar == '+') {
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
+singleStatement
+ : statement EOF
+ ;
+
+statement
+ : ALTER TABLE multipartIdentifier ADD PARTITION FIELD transform (AS name=identifier)? #addPartitionField
+ | ALTER TABLE multipartIdentifier DROP PARTITION FIELD transform #dropPartitionField
+ | ALTER TABLE multipartIdentifier REPLACE PARTITION FIELD transform WITH transform (AS name=identifier)? #replacePartitionField
+ | ALTER TABLE multipartIdentifier WRITE writeSpec #setWriteDistributionAndOrdering
+ | ALTER TABLE multipartIdentifier SET IDENTIFIER_KW FIELDS fieldList #setIdentifierFields
+ | ALTER TABLE multipartIdentifier DROP IDENTIFIER_KW FIELDS fieldList #dropIdentifierFields
+ | ALTER TABLE multipartIdentifier createReplaceBranchClause #createOrReplaceBranch
+ | ALTER TABLE multipartIdentifier createReplaceTagClause #createOrReplaceTag
+ | ALTER TABLE multipartIdentifier DROP BRANCH (IF EXISTS)? identifier #dropBranch
+ | ALTER TABLE multipartIdentifier DROP TAG (IF EXISTS)? identifier #dropTag
+ ;
+
+createReplaceTagClause
+ : (CREATE OR)? REPLACE TAG identifier tagOptions
+ | CREATE TAG (IF NOT EXISTS)? identifier tagOptions
+ ;
+
+createReplaceBranchClause
+ : (CREATE OR)? REPLACE BRANCH identifier branchOptions
+ | CREATE BRANCH (IF NOT EXISTS)? identifier branchOptions
+ ;
+
+tagOptions
+ : (AS OF VERSION snapshotId)? (refRetain)?
+ ;
+
+branchOptions
+ : (AS OF VERSION snapshotId)? (refRetain)? (snapshotRetention)?
+ ;
+
+snapshotRetention
+ : WITH SNAPSHOT RETENTION minSnapshotsToKeep
+ | WITH SNAPSHOT RETENTION maxSnapshotAge
+ | WITH SNAPSHOT RETENTION minSnapshotsToKeep maxSnapshotAge
+ ;
+
+refRetain
+ : RETAIN number timeUnit
+ ;
+
+maxSnapshotAge
+ : number timeUnit
+ ;
+
+minSnapshotsToKeep
+ : number SNAPSHOTS
+ ;
+
+writeSpec
+ : (writeDistributionSpec | writeOrderingSpec)*
+ ;
+
+writeDistributionSpec
+ : DISTRIBUTED BY PARTITION
+ ;
+
+writeOrderingSpec
+ : LOCALLY? ORDERED BY order
+ | UNORDERED
+ ;
+
+singleOrder
+ : order EOF
+ ;
+
+order
+ : fields+=orderField (',' fields+=orderField)*
+ | '(' fields+=orderField (',' fields+=orderField)* ')'
+ ;
+
+orderField
+ : transform direction=(ASC | DESC)? (NULLS nullOrder=(FIRST | LAST))?
+ ;
+
+transform
+ : multipartIdentifier #identityTransform
+ | transformName=identifier
+ '(' arguments+=transformArgument (',' arguments+=transformArgument)* ')' #applyTransform
+ ;
+
+transformArgument
+ : multipartIdentifier
+ | constant
+ ;
+
+expression
+ : constant
+ | stringMap
+ | stringArray
+ ;
+
+constant
+ : number #numericLiteral
+ | booleanValue #booleanLiteral
+ | STRING+ #stringLiteral
+ | identifier STRING #typeConstructor
+ ;
+
+stringMap
+ : MAP '(' constant (',' constant)* ')'
+ ;
+
+stringArray
+ : ARRAY '(' constant (',' constant)* ')'
+ ;
+
+booleanValue
+ : TRUE | FALSE
+ ;
+
+number
+ : MINUS? EXPONENT_VALUE #exponentLiteral
+ | MINUS? DECIMAL_VALUE #decimalLiteral
+ | MINUS? INTEGER_VALUE #integerLiteral
+ | MINUS? BIGINT_LITERAL #bigIntLiteral
+ | MINUS? SMALLINT_LITERAL #smallIntLiteral
+ | MINUS? TINYINT_LITERAL #tinyIntLiteral
+ | MINUS? DOUBLE_LITERAL #doubleLiteral
+ | MINUS? FLOAT_LITERAL #floatLiteral
+ | MINUS? BIGDECIMAL_LITERAL #bigDecimalLiteral
+ ;
+
+multipartIdentifier
+ : parts+=identifier ('.' parts+=identifier)*
+ ;
+
+identifier
+ : IDENTIFIER #unquotedIdentifier
+ | quotedIdentifier #quotedIdentifierAlternative
+ | nonReserved #unquotedIdentifier
+ ;
+
+quotedIdentifier
+ : BACKQUOTED_IDENTIFIER
+ ;
+
+fieldList
+ : fields+=multipartIdentifier (',' fields+=multipartIdentifier)*
+ ;
+
+nonReserved
+ : ADD | ALTER | AS | ASC | BRANCH | BY | CREATE | DAYS | DESC | DROP | EXISTS | FIELD | FIRST | HOURS | IF | LAST | NOT | NULLS | OF | OR | ORDERED | PARTITION | TABLE | WRITE
+ | DISTRIBUTED | LOCALLY | MINUTES | MONTHS | UNORDERED | REPLACE | RETAIN | VERSION | WITH | IDENTIFIER_KW | FIELDS | SET | SNAPSHOT | SNAPSHOTS
+ | TAG | TRUE | FALSE
+ | MAP
+ ;
+
+snapshotId
+ : number
+ ;
+
+numSnapshots
+ : number
+ ;
+
+timeUnit
+ : DAYS
+ | HOURS
+ | MINUTES
+ ;
+
+ADD: 'ADD';
+ALTER: 'ALTER';
+AS: 'AS';
+ASC: 'ASC';
+BRANCH: 'BRANCH';
+BY: 'BY';
+DAYS: 'DAYS';
+DESC: 'DESC';
+DISTRIBUTED: 'DISTRIBUTED';
+DROP: 'DROP';
+EXISTS: 'EXISTS';
+FIELD: 'FIELD';
+FIELDS: 'FIELDS';
+FIRST: 'FIRST';
+HOURS: 'HOURS';
+IF : 'IF';
+LAST: 'LAST';
+LOCALLY: 'LOCALLY';
+MINUTES: 'MINUTES';
+MONTHS: 'MONTHS';
+CREATE: 'CREATE';
+NOT: 'NOT';
+NULLS: 'NULLS';
+OF: 'OF';
+OR: 'OR';
+ORDERED: 'ORDERED';
+PARTITION: 'PARTITION';
+REPLACE: 'REPLACE';
+RETAIN: 'RETAIN';
+RETENTION: 'RETENTION';
+IDENTIFIER_KW: 'IDENTIFIER';
+SET: 'SET';
+SNAPSHOT: 'SNAPSHOT';
+SNAPSHOTS: 'SNAPSHOTS';
+TABLE: 'TABLE';
+TAG: 'TAG';
+UNORDERED: 'UNORDERED';
+VERSION: 'VERSION';
+WITH: 'WITH';
+WRITE: 'WRITE';
+
+TRUE: 'TRUE';
+FALSE: 'FALSE';
+
+MAP: 'MAP';
+ARRAY: 'ARRAY';
+
+PLUS: '+';
+MINUS: '-';
+
+STRING
+ : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
+ | '"' ( ~('"'|'\\') | ('\\' .) )* '"'
+ ;
+
+BIGINT_LITERAL
+ : DIGIT+ 'L'
+ ;
+
+SMALLINT_LITERAL
+ : DIGIT+ 'S'
+ ;
+
+TINYINT_LITERAL
+ : DIGIT+ 'Y'
+ ;
+
+INTEGER_VALUE
+ : DIGIT+
+ ;
+
+EXPONENT_VALUE
+ : DIGIT+ EXPONENT
+ | DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
+ ;
+
+DECIMAL_VALUE
+ : DECIMAL_DIGITS {isValidDecimal()}?
+ ;
+
+FLOAT_LITERAL
+ : DIGIT+ EXPONENT? 'F'
+ | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}?
+ ;
+
+DOUBLE_LITERAL
+ : DIGIT+ EXPONENT? 'D'
+ | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
+ ;
+
+BIGDECIMAL_LITERAL
+ : DIGIT+ EXPONENT? 'BD'
+ | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
+ ;
+
+IDENTIFIER
+ : (LETTER | DIGIT | '_')+
+ ;
+
+BACKQUOTED_IDENTIFIER
+ : '`' ( ~'`' | '``' )* '`'
+ ;
+
+fragment DECIMAL_DIGITS
+ : DIGIT+ '.' DIGIT*
+ | '.' DIGIT+
+ ;
+
+fragment EXPONENT
+ : 'E' [+-]? DIGIT+
+ ;
+
+fragment DIGIT
+ : [0-9]
+ ;
+
+fragment LETTER
+ : [A-Z]
+ ;
+
+SIMPLE_COMMENT
+ : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN)
+ ;
+
+BRACKETED_COMMENT
+ : '/*' {!isHint()}? (BRACKETED_COMMENT|.)*? '*/' -> channel(HIDDEN)
+ ;
+
+WS
+ : [ \r\n\t]+ -> channel(HIDDEN)
+ ;
+
+// Catch-all for anything we can't recognize.
+// We use this to be able to ignore and recover all the text
+// when splitting statements with DelimiterLexer
+UNRECOGNIZED
+ : .
+ ;
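
A minimal sketch of the statements this grammar adds, issued against a session with the Iceberg extensions enabled (see the configuration sketch after the extensions class below); the catalog and table names are hypothetical:

import org.apache.spark.sql.SparkSession;

public class ExtensionDdlExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active(); // assumes extensions are configured
    spark.sql("ALTER TABLE local.db.events ADD PARTITION FIELD bucket(16, id)");              // addPartitionField
    spark.sql("ALTER TABLE local.db.events WRITE ORDERED BY category ASC NULLS LAST");        // writeOrderingSpec
    spark.sql("ALTER TABLE local.db.events SET IDENTIFIER FIELDS id");                        // setIdentifierFields
    spark.sql("ALTER TABLE local.db.events CREATE BRANCH IF NOT EXISTS audit RETAIN 7 DAYS"); // createReplaceBranchClause
    spark.sql("ALTER TABLE local.db.events CREATE TAG v1_0 RETAIN 365 DAYS");                 // createReplaceTagClause
  }
}
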
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala
new file mode 100644
index 000000000000..c4de35010c6e
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions
+
+import org.apache.spark.sql.SparkSessionExtensions
+import org.apache.spark.sql.catalyst.analysis.CheckViews
+import org.apache.spark.sql.catalyst.analysis.ResolveViews
+import org.apache.spark.sql.catalyst.optimizer.ReplaceStaticInvoke
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergSparkSqlExtensionsParser
+import org.apache.spark.sql.execution.datasources.v2.ExtendedDataSourceV2Strategy
+
+class IcebergSparkSessionExtensions extends (SparkSessionExtensions => Unit) {
+
+ override def apply(extensions: SparkSessionExtensions): Unit = {
+ // parser extensions
+ extensions.injectParser { case (_, parser) => new IcebergSparkSqlExtensionsParser(parser) }
+
+ // analyzer extensions
+ extensions.injectResolutionRule { spark => ResolveViews(spark) }
+ extensions.injectCheckRule(_ => CheckViews)
+
+ // optimizer extensions
+ extensions.injectOptimizerRule { _ => ReplaceStaticInvoke }
+
+ // planner extensions
+ extensions.injectPlannerStrategy { spark => ExtendedDataSourceV2Strategy(spark) }
+ }
+}
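
The class above is activated through the spark.sql.extensions configuration. A minimal sketch of a session wired with the extensions and a Hadoop-backed catalog; the catalog name and warehouse path are hypothetical:

import org.apache.spark.sql.SparkSession;

public class EnableExtensionsExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .master("local[*]")
            .config("spark.sql.extensions",
                "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
            .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.local.type", "hadoop")
            .config("spark.sql.catalog.local.warehouse", "/tmp/iceberg-warehouse")
            .getOrCreate();
    spark.sql("CREATE TABLE IF NOT EXISTS local.db.events (id BIGINT, category STRING) USING iceberg");
  }
}
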
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala
new file mode 100644
index 000000000000..5ad4b9c01409
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckViews.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.plans.logical.AlterViewAs
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
+import org.apache.spark.sql.catalyst.plans.logical.View
+import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.util.SchemaUtils
+
+object CheckViews extends (LogicalPlan => Unit) {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override def apply(plan: LogicalPlan): Unit = {
+ plan foreach {
+ case CreateIcebergView(
+ resolvedIdent @ ResolvedIdentifier(_: ViewCatalog, _),
+ _,
+ query,
+ columnAliases,
+ _,
+ _,
+ _,
+ _,
+ _,
+ replace,
+ _,
+ _) =>
+ verifyColumnCount(resolvedIdent, columnAliases, query)
+ SchemaUtils.checkColumnNameDuplication(
+ query.schema.fieldNames.toIndexedSeq,
+ SQLConf.get.resolver)
+ if (replace) {
+ val viewIdent: Seq[String] =
+ resolvedIdent.catalog.name() +: resolvedIdent.identifier.asMultipartIdentifier
+ checkCyclicViewReference(viewIdent, query, Seq(viewIdent))
+ }
+
+ case AlterViewAs(ResolvedV2View(_, _), _, _) =>
+ throw new IcebergAnalysisException(
+ "ALTER VIEW AS is not supported. Use CREATE OR REPLACE VIEW instead")
+
+ case _ => // OK
+ }
+ }
+
+ private def verifyColumnCount(
+ ident: ResolvedIdentifier,
+ columns: Seq[String],
+ query: LogicalPlan): Unit = {
+ if (columns.nonEmpty) {
+ if (columns.length > query.output.length) {
+ throw new AnalysisException(
+ errorClass = "CREATE_VIEW_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS",
+ messageParameters = Map(
+ "viewName" -> String.format("%s.%s", ident.catalog.name(), ident.identifier),
+ "viewColumns" -> columns.mkString(", "),
+ "dataColumns" -> query.output.map(c => c.name).mkString(", ")))
+ } else if (columns.length < query.output.length) {
+ throw new AnalysisException(
+ errorClass = "CREATE_VIEW_COLUMN_ARITY_MISMATCH.TOO_MANY_DATA_COLUMNS",
+ messageParameters = Map(
+ "viewName" -> String.format("%s.%s", ident.catalog.name(), ident.identifier),
+ "viewColumns" -> columns.mkString(", "),
+ "dataColumns" -> query.output.map(c => c.name).mkString(", ")))
+ }
+ }
+ }
+
+ private def checkCyclicViewReference(
+ viewIdent: Seq[String],
+ plan: LogicalPlan,
+ cyclePath: Seq[Seq[String]]): Unit = {
+ plan match {
+ case sub @ SubqueryAlias(_, Project(_, _)) =>
+ val currentViewIdent: Seq[String] = sub.identifier.qualifier :+ sub.identifier.name
+ checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, sub.children)
+ case v1View: View =>
+ val currentViewIdent: Seq[String] = v1View.desc.identifier.nameParts
+ checkIfRecursiveView(viewIdent, currentViewIdent, cyclePath, v1View.children)
+ case _ =>
+ plan.children.foreach(child => checkCyclicViewReference(viewIdent, child, cyclePath))
+ }
+
+ plan.expressions.flatMap(_.flatMap {
+ case e: SubqueryExpression =>
+ checkCyclicViewReference(viewIdent, e.plan, cyclePath)
+ None
+ case _ => None
+ })
+ }
+
+ private def checkIfRecursiveView(
+ viewIdent: Seq[String],
+ currentViewIdent: Seq[String],
+ cyclePath: Seq[Seq[String]],
+ children: Seq[LogicalPlan]): Unit = {
+ val newCyclePath = cyclePath :+ currentViewIdent
+ if (currentViewIdent == viewIdent) {
+ throw new IcebergAnalysisException(
+ String.format(
+ "Recursive cycle in view detected: %s (cycle: %s)",
+ viewIdent.asIdentifier,
+ newCyclePath.map(p => p.mkString(".")).mkString(" -> ")))
+ } else {
+ children.foreach { c =>
+ checkCyclicViewReference(viewIdent, c, newCyclePath)
+ }
+ }
+ }
+}
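
A minimal sketch of the behaviour CheckViews enforces, assuming a view-enabled Iceberg catalog named local and an existing table local.db.events (both hypothetical):

import org.apache.spark.sql.SparkSession;

public class ViewChecksExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active();

    // The alias list must match the query's column count; a mismatch raises a
    // CREATE_VIEW_COLUMN_ARITY_MISMATCH error.
    spark.sql("CREATE VIEW local.db.event_ids (id) AS SELECT id FROM local.db.events");

    // ALTER VIEW ... AS is rejected for Iceberg views; replace the view instead.
    spark.sql("CREATE OR REPLACE VIEW local.db.event_ids (id) AS SELECT id FROM local.db.events");
  }
}
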
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveViews.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveViews.scala
new file mode 100644
index 000000000000..76db30a5b619
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveViews.scala
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.analysis.ViewUtil.IcebergViewHelper
+import org.apache.spark.sql.catalyst.expressions.Alias
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.expressions.UpCast
+import org.apache.spark.sql.catalyst.parser.ParseException
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
+import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.CurrentOrigin
+import org.apache.spark.sql.catalyst.trees.Origin
+import org.apache.spark.sql.connector.catalog.CatalogManager
+import org.apache.spark.sql.connector.catalog.LookupCatalog
+import org.apache.spark.sql.connector.catalog.View
+import org.apache.spark.sql.errors.QueryCompilationErrors
+import org.apache.spark.sql.types.MetadataBuilder
+
+case class ResolveViews(spark: SparkSession) extends Rule[LogicalPlan] with LookupCatalog {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ protected lazy val catalogManager: CatalogManager = spark.sessionState.catalogManager
+
+ override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
+ case u @ UnresolvedRelation(nameParts, _, _)
+ if catalogManager.v1SessionCatalog.isTempView(nameParts) =>
+ u
+
+ case u @ UnresolvedRelation(parts @ CatalogAndIdentifier(catalog, ident), _, _) =>
+ ViewUtil
+ .loadView(catalog, ident)
+ .map(createViewRelation(parts, _))
+ .getOrElse(u)
+
+ case u @ UnresolvedTableOrView(CatalogAndIdentifier(catalog, ident), _, _) =>
+ ViewUtil
+ .loadView(catalog, ident)
+ .map(_ => ResolvedV2View(catalog.asViewCatalog, ident))
+ .getOrElse(u)
+
+ case c @ CreateIcebergView(
+ ResolvedIdentifier(_, _),
+ _,
+ query,
+ columnAliases,
+ columnComments,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _,
+ _) if query.resolved && !c.rewritten =>
+ val aliased = aliasColumns(query, columnAliases, columnComments)
+ c.copy(
+ query = aliased,
+ queryColumnNames = query.schema.fieldNames.toIndexedSeq,
+ rewritten = true)
+ }
+
+ private def aliasColumns(
+ plan: LogicalPlan,
+ columnAliases: Seq[String],
+ columnComments: Seq[Option[String]]): LogicalPlan = {
+ if (columnAliases.isEmpty || columnAliases.length != plan.output.length) {
+ plan
+ } else {
+ val projectList = plan.output.zipWithIndex.map { case (attr, pos) =>
+ if (columnComments.apply(pos).isDefined) {
+ val meta =
+ new MetadataBuilder().putString("comment", columnComments.apply(pos).get).build()
+ Alias(attr, columnAliases.apply(pos))(explicitMetadata = Some(meta))
+ } else {
+ Alias(attr, columnAliases.apply(pos))()
+ }
+ }
+ Project(projectList, plan)
+ }
+ }
+
+ private def createViewRelation(nameParts: Seq[String], view: View): LogicalPlan = {
+ val parsed = parseViewText(nameParts.quoted, view.query)
+
+ // Apply any necessary rewrites to preserve correct resolution
+ val viewCatalogAndNamespace: Seq[String] = view.currentCatalog +: view.currentNamespace.toSeq
+ val rewritten = rewriteIdentifiers(parsed, viewCatalogAndNamespace);
+
+ // Apply the field aliases and column comments
+ // This logic differs from how Spark handles views in SessionCatalog.fromCatalogTable.
+ // This is more strict because it doesn't allow resolution by field name.
+ val aliases = view.schema.fields.zipWithIndex.map { case (expected, pos) =>
+ val attr = GetColumnByOrdinal(pos, expected.dataType)
+ Alias(UpCast(attr, expected.dataType), expected.name)(explicitMetadata =
+ Some(expected.metadata))
+ }.toIndexedSeq
+
+ SubqueryAlias(nameParts, Project(aliases, rewritten))
+ }
+
+ private def parseViewText(name: String, viewText: String): LogicalPlan = {
+ val origin = Origin(objectType = Some("VIEW"), objectName = Some(name))
+
+ try {
+ CurrentOrigin.withOrigin(origin) {
+ spark.sessionState.sqlParser.parseQuery(viewText)
+ }
+ } catch {
+ case _: ParseException =>
+ throw QueryCompilationErrors.invalidViewNameError(name)
+ }
+ }
+
+ private def rewriteIdentifiers(
+ plan: LogicalPlan,
+ catalogAndNamespace: Seq[String]): LogicalPlan = {
+ // Rewrite unresolved functions and relations
+ qualifyTableIdentifiers(
+ qualifyFunctionIdentifiers(CTESubstitution.apply(plan), catalogAndNamespace),
+ catalogAndNamespace)
+ }
+
+ private def qualifyFunctionIdentifiers(
+ plan: LogicalPlan,
+ catalogAndNamespace: Seq[String]): LogicalPlan = plan transformExpressions {
+ case u @ UnresolvedFunction(Seq(name), _, _, _, _, _, _) =>
+ if (!isBuiltinFunction(name)) {
+ u.copy(nameParts = catalogAndNamespace :+ name)
+ } else {
+ u
+ }
+ case u @ UnresolvedFunction(parts, _, _, _, _, _, _) if !isCatalog(parts.head) =>
+ u.copy(nameParts = catalogAndNamespace.head +: parts)
+ }
+
+ /**
+ * Qualify table identifiers with default catalog and namespace if necessary.
+ */
+ private def qualifyTableIdentifiers(
+ child: LogicalPlan,
+ catalogAndNamespace: Seq[String]): LogicalPlan =
+ child transform {
+ case u @ UnresolvedRelation(Seq(table), _, _) =>
+ u.copy(multipartIdentifier = catalogAndNamespace :+ table)
+ case u @ UnresolvedRelation(parts, _, _) if !isCatalog(parts.head) =>
+ u.copy(multipartIdentifier = catalogAndNamespace.head +: parts)
+ case other =>
+ other.transformExpressions { case subquery: SubqueryExpression =>
+ subquery.withNewPlan(qualifyTableIdentifiers(subquery.plan, catalogAndNamespace))
+ }
+ }
+
+ private def isCatalog(name: String): Boolean = {
+ catalogManager.isCatalogRegistered(name)
+ }
+
+ private def isBuiltinFunction(name: String): Boolean = {
+ catalogManager.v1SessionCatalog.isBuiltinFunction(FunctionIdentifier(name))
+ }
+}
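
A minimal sketch of why the identifier qualification above matters: the stored view text below references the bare name events, and on resolution the rule prefixes it with the catalog and namespace that were current when the view was created (names hypothetical):

import org.apache.spark.sql.SparkSession;

public class ViewResolutionExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active();
    spark.sql("USE local.db");
    spark.sql("CREATE VIEW recent_events AS SELECT * FROM events WHERE id > 100");

    // Querying from a different namespace still resolves 'events' as local.db.events
    spark.sql("USE local.other");
    spark.sql("SELECT count(*) FROM local.db.recent_events").show();
  }
}
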
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteViewCommands.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteViewCommands.scala
new file mode 100644
index 000000000000..ac0f75c422d1
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/RewriteViewCommands.scala
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.analysis.ViewUtil.IcebergViewHelper
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.plans.logical.CreateView
+import org.apache.spark.sql.catalyst.plans.logical.DropView
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.ShowViews
+import org.apache.spark.sql.catalyst.plans.logical.View
+import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.DropIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View
+import org.apache.spark.sql.catalyst.plans.logical.views.ShowIcebergViews
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern.UNRESOLVED_FUNCTION
+import org.apache.spark.sql.connector.catalog.CatalogManager
+import org.apache.spark.sql.connector.catalog.LookupCatalog
+import scala.collection.mutable
+
+/**
+ * ResolveSessionCatalog exits early for some v2 View commands,
+ * thus they are pre-substituted here and then handled in ResolveViews
+ */
+case class RewriteViewCommands(spark: SparkSession) extends Rule[LogicalPlan] with LookupCatalog {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ protected lazy val catalogManager: CatalogManager = spark.sessionState.catalogManager
+
+ override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp {
+ case DropView(ResolvedIdent(resolved), ifExists) =>
+ DropIcebergView(resolved, ifExists)
+
+ case CreateView(
+ ResolvedIdent(resolved),
+ userSpecifiedColumns,
+ comment,
+ _,
+ properties,
+ Some(queryText),
+ query,
+ allowExisting,
+ replace,
+ _) =>
+ val q = CTESubstitution.apply(query)
+ verifyTemporaryObjectsDontExist(resolved, q)
+ CreateIcebergView(
+ child = resolved,
+ queryText = queryText,
+ query = q,
+ columnAliases = userSpecifiedColumns.map(_._1),
+ columnComments = userSpecifiedColumns.map(_._2.orElse(Option.empty)),
+ comment = comment,
+ properties = properties,
+ allowExisting = allowExisting,
+ replace = replace)
+
+ case view @ ShowViews(CurrentNamespace, pattern, output) =>
+ if (ViewUtil.isViewCatalog(catalogManager.currentCatalog)) {
+ ShowIcebergViews(
+ ResolvedNamespace(
+ catalogManager.currentCatalog,
+ catalogManager.currentNamespace.toIndexedSeq),
+ pattern,
+ output)
+ } else {
+ view
+ }
+
+ case ShowViews(UnresolvedNamespace(CatalogAndNamespace(catalog, ns), _), pattern, output)
+ if ViewUtil.isViewCatalog(catalog) =>
+ ShowIcebergViews(ResolvedNamespace(catalog, ns), pattern, output)
+
+ // needs to be done here instead of in ResolveViews, so that a V2 view can be resolved before the Analyzer
+ // tries to resolve it, which would result in an error, saying that V2 views aren't supported
+ case u @ UnresolvedView(ResolvedView(resolved), _, _, _) =>
+ ViewUtil
+ .loadView(resolved.catalog, resolved.identifier)
+ .map(_ => ResolvedV2View(resolved.catalog.asViewCatalog, resolved.identifier))
+ .getOrElse(u)
+ }
+
+ private def isTempView(nameParts: Seq[String]): Boolean = {
+ catalogManager.v1SessionCatalog.isTempView(nameParts)
+ }
+
+ private def isTempFunction(nameParts: Seq[String]): Boolean = {
+ if (nameParts.size > 1) {
+ return false
+ }
+ catalogManager.v1SessionCatalog.isTemporaryFunction(nameParts.asFunctionIdentifier)
+ }
+
+ private object ResolvedIdent {
+ def unapply(unresolved: UnresolvedIdentifier): Option[ResolvedIdentifier] = unresolved match {
+ case UnresolvedIdentifier(nameParts, true) if isTempView(nameParts) =>
+ None
+
+ case UnresolvedIdentifier(CatalogAndIdentifier(catalog, ident), _)
+ if ViewUtil.isViewCatalog(catalog) =>
+ Some(ResolvedIdentifier(catalog, ident))
+
+ case _ =>
+ None
+ }
+ }
+
+ /**
+ * Permanent views are not allowed to reference temp objects
+ */
+ private def verifyTemporaryObjectsDontExist(
+ identifier: ResolvedIdentifier,
+ child: LogicalPlan): Unit = {
+ val tempViews = collectTemporaryViews(child)
+ if (tempViews.nonEmpty) {
+ throw invalidRefToTempObject(
+ identifier,
+ tempViews.map(v => v.quoted).mkString("[", ", ", "]"),
+ "view")
+ }
+
+ val tempFunctions = collectTemporaryFunctions(child)
+ if (tempFunctions.nonEmpty) {
+ throw invalidRefToTempObject(identifier, tempFunctions.mkString("[", ", ", "]"), "function")
+ }
+ }
+
+ private def invalidRefToTempObject(
+ ident: ResolvedIdentifier,
+ tempObjectNames: String,
+ tempObjectType: String) = {
+ new IcebergAnalysisException(
+ String.format(
+ "Cannot create view %s.%s that references temporary %s: %s",
+ ident.catalog.name(),
+ ident.identifier,
+ tempObjectType,
+ tempObjectNames))
+ }
+
+ /**
+ * Collect all temporary views and return the identifiers separately
+ */
+ private def collectTemporaryViews(child: LogicalPlan): Seq[Seq[String]] = {
+ def collectTempViews(child: LogicalPlan): Seq[Seq[String]] = {
+ child.flatMap {
+ case unresolved: UnresolvedRelation if isTempView(unresolved.multipartIdentifier) =>
+ Seq(unresolved.multipartIdentifier)
+ case view: View if view.isTempView => Seq(view.desc.identifier.nameParts)
+ case plan =>
+ plan.expressions.flatMap(_.flatMap {
+ case e: SubqueryExpression => collectTempViews(e.plan)
+ case _ => Seq.empty
+ })
+ }.distinct
+ }
+
+ collectTempViews(child)
+ }
+
+ private object ResolvedView {
+ def unapply(identifier: Seq[String]): Option[ResolvedV2View] = identifier match {
+ case nameParts if isTempView(nameParts) =>
+ None
+
+ case CatalogAndIdentifier(catalog, ident) if ViewUtil.isViewCatalog(catalog) =>
+ ViewUtil
+ .loadView(catalog, ident)
+ .flatMap(_ => Some(ResolvedV2View(catalog.asViewCatalog, ident)))
+
+ case _ =>
+ None
+ }
+ }
+
+ /**
+ * Collect the names of all temporary functions.
+ */
+ private def collectTemporaryFunctions(child: LogicalPlan): Seq[String] = {
+ val tempFunctions = new mutable.HashSet[String]()
+ child.resolveExpressionsWithPruning(_.containsAnyPattern(UNRESOLVED_FUNCTION)) {
+ case f @ UnresolvedFunction(nameParts, _, _, _, _, _, _) if isTempFunction(nameParts) =>
+ tempFunctions += nameParts.head
+ f
+ case e: SubqueryExpression =>
+ tempFunctions ++= collectTemporaryFunctions(e.plan)
+ e
+ }
+ tempFunctions.toSeq
+ }
+}
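
A minimal sketch of the temporary-object restriction enforced above: a persistent Iceberg view may not reference a temporary view or function (catalog and table names hypothetical):

import org.apache.spark.sql.SparkSession;

public class TempObjectRestrictionExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active();
    spark.range(10).createOrReplaceTempView("tmp_ids");

    // Rejected: "Cannot create view ... that references temporary view: [tmp_ids]"
    // spark.sql("CREATE VIEW local.db.ids_view AS SELECT * FROM tmp_ids");

    // Accepted: the view body only references permanent objects
    spark.sql("CREATE VIEW local.db.ids_view AS SELECT id FROM local.db.events");
  }
}
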
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewUtil.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewUtil.scala
new file mode 100644
index 000000000000..c27cb140347e
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/analysis/ViewUtil.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.connector.catalog.CatalogPlugin
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.View
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.errors.QueryCompilationErrors
+
+object ViewUtil {
+ def loadView(catalog: CatalogPlugin, ident: Identifier): Option[View] = catalog match {
+ case viewCatalog: ViewCatalog =>
+ try {
+ Option(viewCatalog.loadView(ident))
+ } catch {
+ case _: NoSuchViewException => None
+ }
+ case _ => None
+ }
+
+ def isViewCatalog(catalog: CatalogPlugin): Boolean = {
+ catalog.isInstanceOf[ViewCatalog]
+ }
+
+ implicit class IcebergViewHelper(plugin: CatalogPlugin) {
+ def asViewCatalog: ViewCatalog = plugin match {
+ case viewCatalog: ViewCatalog =>
+ viewCatalog
+ case _ =>
+ throw QueryCompilationErrors.missingCatalogViewsAbilityError(plugin)
+ }
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceStaticInvoke.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceStaticInvoke.scala
new file mode 100644
index 000000000000..bdec4aae884d
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceStaticInvoke.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.iceberg.spark.functions.SparkFunctions
+import org.apache.spark.sql.catalyst.expressions.ApplyFunctionExpression
+import org.apache.spark.sql.catalyst.expressions.BinaryComparison
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.In
+import org.apache.spark.sql.catalyst.expressions.InSet
+import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.catalyst.plans.logical.Filter
+import org.apache.spark.sql.catalyst.plans.logical.Join
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.ReplaceData
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.TreePattern.BINARY_COMPARISON
+import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND
+import org.apache.spark.sql.catalyst.trees.TreePattern.FILTER
+import org.apache.spark.sql.catalyst.trees.TreePattern.IN
+import org.apache.spark.sql.catalyst.trees.TreePattern.INSET
+import org.apache.spark.sql.catalyst.trees.TreePattern.JOIN
+import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
+import org.apache.spark.sql.types.StructField
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Spark resolves Iceberg system functions to {@link StaticInvoke}, which cannot be pushed down
+ * to the data source. This rule replaces such {@link StaticInvoke} calls with
+ * {@link ApplyFunctionExpression} when an Iceberg system function appears in a filter condition.
+ */
+object ReplaceStaticInvoke extends Rule[LogicalPlan] {
+
+ override def apply(plan: LogicalPlan): LogicalPlan =
+ plan.transformWithPruning(_.containsAnyPattern(COMMAND, FILTER, JOIN)) {
+ case replace @ ReplaceData(_, cond, _, _, _, _, _) =>
+ replaceStaticInvoke(replace, cond, newCond => replace.copy(condition = newCond))
+
+ case join @ Join(_, _, _, Some(cond), _) =>
+ replaceStaticInvoke(join, cond, newCond => join.copy(condition = Some(newCond)))
+
+ case filter @ Filter(cond, _) =>
+ replaceStaticInvoke(filter, cond, newCond => filter.copy(condition = newCond))
+ }
+
+ private def replaceStaticInvoke[T <: LogicalPlan](
+ node: T,
+ condition: Expression,
+ copy: Expression => T): T = {
+ val newCondition = replaceStaticInvoke(condition)
+ if (newCondition fastEquals condition) node else copy(newCondition)
+ }
+
+ private def replaceStaticInvoke(condition: Expression): Expression = {
+ condition.transformWithPruning(_.containsAnyPattern(BINARY_COMPARISON, IN, INSET)) {
+ case in @ In(value: StaticInvoke, _) if canReplace(value) =>
+ in.copy(value = replaceStaticInvoke(value))
+
+ case in @ InSet(value: StaticInvoke, _) if canReplace(value) =>
+ in.copy(child = replaceStaticInvoke(value))
+
+ case c @ BinaryComparison(left: StaticInvoke, right) if canReplace(left) && right.foldable =>
+ c.withNewChildren(Seq(replaceStaticInvoke(left), right))
+
+ case c @ BinaryComparison(left, right: StaticInvoke) if canReplace(right) && left.foldable =>
+ c.withNewChildren(Seq(left, replaceStaticInvoke(right)))
+ }
+ }
+
+ private def replaceStaticInvoke(invoke: StaticInvoke): Expression = {
+    // Adapted from `resolveV2Function` in org.apache.spark.sql.catalyst.analysis.ResolveFunctions
+ val unbound = SparkFunctions.loadFunctionByClass(invoke.staticObject)
+ if (unbound == null) {
+ return invoke
+ }
+
+ val inputType = StructType(invoke.arguments.zipWithIndex.map { case (exp, pos) =>
+ StructField(s"_$pos", exp.dataType, exp.nullable)
+ })
+
+ val bound =
+ try {
+ unbound.bind(inputType)
+ } catch {
+ case _: Exception =>
+ return invoke
+ }
+
+ if (bound.inputTypes().length != invoke.arguments.length) {
+ return invoke
+ }
+
+ bound match {
+ case scalarFunc: ScalarFunction[_] =>
+ ApplyFunctionExpression(scalarFunc, invoke.arguments)
+ case _ => invoke
+ }
+ }
+
+ @inline
+ private def canReplace(invoke: StaticInvoke): Boolean = {
+ invoke.functionName == ScalarFunction.MAGIC_METHOD_NAME && !invoke.foldable
+ }
+}
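
A minimal sketch of the kind of predicate this rule targets: Iceberg exposes system functions (bucket, truncate, years, ...) through its function catalog, and comparing one against a foldable value lets the filter reach the scan instead of remaining a StaticInvoke (catalog and table names hypothetical):

import org.apache.spark.sql.SparkSession;

public class SystemFunctionPushdownExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.active();
    spark.sql("SELECT * FROM local.db.events WHERE local.system.bucket(16, id) = 3")
        .explain(true); // with the rule applied, the bucket predicate can be pushed to the Iceberg scan
  }
}
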
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala
new file mode 100644
index 000000000000..ac127f754a91
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSparkSqlExtensionsParser.scala
@@ -0,0 +1,342 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.parser.extensions
+
+import java.util.Locale
+import org.antlr.v4.runtime._
+import org.antlr.v4.runtime.atn.PredictionMode
+import org.antlr.v4.runtime.misc.Interval
+import org.antlr.v4.runtime.misc.ParseCancellationException
+import org.antlr.v4.runtime.tree.TerminalNodeImpl
+import org.apache.iceberg.common.DynConstructors
+import org.apache.iceberg.spark.ExtendedParser
+import org.apache.iceberg.spark.ExtendedParser.RawOrderField
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.RewriteViewCommands
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.parser.ParserInterface
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser.NonReservedContext
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser.QuotedIdentifierContext
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.trees.Origin
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.VariableSubstitution
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.StructType
+import scala.jdk.CollectionConverters._
+
+class IcebergSparkSqlExtensionsParser(delegate: ParserInterface)
+ extends ParserInterface
+ with ExtendedParser {
+
+ import IcebergSparkSqlExtensionsParser._
+
+ private lazy val substitutor = substitutorCtor.newInstance(SQLConf.get)
+ private lazy val astBuilder = new IcebergSqlExtensionsAstBuilder(delegate)
+
+ /**
+ * Parse a string to a DataType.
+ */
+ override def parseDataType(sqlText: String): DataType = {
+ delegate.parseDataType(sqlText)
+ }
+
+ /**
+ * Parse a string to a raw DataType without CHAR/VARCHAR replacement.
+ */
+ def parseRawDataType(sqlText: String): DataType = throw new UnsupportedOperationException()
+
+ /**
+ * Parse a string to an Expression.
+ */
+ override def parseExpression(sqlText: String): Expression = {
+ delegate.parseExpression(sqlText)
+ }
+
+ /**
+ * Parse a string to a TableIdentifier.
+ */
+ override def parseTableIdentifier(sqlText: String): TableIdentifier = {
+ delegate.parseTableIdentifier(sqlText)
+ }
+
+ /**
+ * Parse a string to a FunctionIdentifier.
+ */
+ override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = {
+ delegate.parseFunctionIdentifier(sqlText)
+ }
+
+ /**
+ * Parse a string to a multi-part identifier.
+ */
+ override def parseMultipartIdentifier(sqlText: String): Seq[String] = {
+ delegate.parseMultipartIdentifier(sqlText)
+ }
+
+ /**
+ * Creates StructType for a given SQL string, which is a comma separated list of field
+ * definitions which will preserve the correct Hive metadata.
+ */
+ override def parseTableSchema(sqlText: String): StructType = {
+ delegate.parseTableSchema(sqlText)
+ }
+
+ override def parseSortOrder(sqlText: String): java.util.List[RawOrderField] = {
+ val fields = parse(sqlText) { parser => astBuilder.visitSingleOrder(parser.singleOrder()) }
+ fields.map { field =>
+ val (term, direction, order) = field
+ new RawOrderField(term, direction, order)
+ }.asJava
+ }
+
+ override def parseRoutineParam(sqlText: String): StructType =
+ throw new UnsupportedOperationException()
+
+ /**
+ * Parse a string to a LogicalPlan.
+ */
+ override def parsePlan(sqlText: String): LogicalPlan = {
+ val sqlTextAfterSubstitution = substitutor.substitute(sqlText)
+ if (isIcebergCommand(sqlTextAfterSubstitution)) {
+ parse(sqlTextAfterSubstitution) { parser => astBuilder.visit(parser.singleStatement()) }
+ .asInstanceOf[LogicalPlan]
+ } else {
+ RewriteViewCommands(SparkSession.active).apply(delegate.parsePlan(sqlText))
+ }
+ }
+
+ private def isIcebergCommand(sqlText: String): Boolean = {
+ val normalized = sqlText
+ .toLowerCase(Locale.ROOT)
+ .trim()
+ // Strip simple SQL comments that terminate a line, e.g. comments starting with `--` .
+ .replaceAll("--.*?\\n", " ")
+ // Strip newlines.
+ .replaceAll("\\s+", " ")
+ // Strip comments of the form /* ... */. This must come after stripping newlines so that
+ // comments that span multiple lines are caught.
+ .replaceAll("/\\*.*?\\*/", " ")
+      // Strip backticks so that `system`.`ancestors_of` becomes system.ancestors_of
+ .replaceAll("`", "")
+ .trim()
+
+ normalized.startsWith("alter table") && (normalized.contains("add partition field") ||
+ normalized.contains("drop partition field") ||
+ normalized.contains("replace partition field") ||
+ normalized.contains("write ordered by") ||
+ normalized.contains("write locally ordered by") ||
+ normalized.contains("write distributed by") ||
+ normalized.contains("write unordered") ||
+ normalized.contains("set identifier fields") ||
+ normalized.contains("drop identifier fields") ||
+ isSnapshotRefDdl(normalized))
+ }
+
+ private def isSnapshotRefDdl(normalized: String): Boolean = {
+ normalized.contains("create branch") ||
+ normalized.contains("replace branch") ||
+ normalized.contains("create tag") ||
+ normalized.contains("replace tag") ||
+ normalized.contains("drop branch") ||
+ normalized.contains("drop tag")
+ }
+
+ protected def parse[T](command: String)(toResult: IcebergSqlExtensionsParser => T): T = {
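+    // UpperCaseCharStream upper-cases characters as the lexer reads them so keywords match
+    // case-insensitively, while getText still returns the original input text.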
+ val lexer = new IcebergSqlExtensionsLexer(
+ new UpperCaseCharStream(CharStreams.fromString(command)))
+ lexer.removeErrorListeners()
+ lexer.addErrorListener(IcebergParseErrorListener)
+
+ val tokenStream = new CommonTokenStream(lexer)
+ val parser = new IcebergSqlExtensionsParser(tokenStream)
+ parser.addParseListener(IcebergSqlExtensionsPostProcessor)
+ parser.removeErrorListeners()
+ parser.addErrorListener(IcebergParseErrorListener)
+
+ // https://github.com/antlr/antlr4/issues/192#issuecomment-15238595
+ // Save a great deal of time on correct inputs by using a two-stage parsing strategy.
+ try {
+ try {
+ // first, try parsing with potentially faster SLL mode and BailErrorStrategy
+ parser.setErrorHandler(new BailErrorStrategy)
+ parser.getInterpreter.setPredictionMode(PredictionMode.SLL)
+ toResult(parser)
+ } catch {
+ case _: ParseCancellationException =>
+          // if SLL parsing fails, fall back to LL mode with the DefaultErrorStrategy
+ tokenStream.seek(0) // rewind input stream
+ parser.reset()
+
+          // Try again.
+ parser.setErrorHandler(new DefaultErrorStrategy)
+ parser.getInterpreter.setPredictionMode(PredictionMode.LL)
+ toResult(parser)
+ }
+ } catch {
+ case e: IcebergParseException if e.command.isDefined =>
+ throw e
+ case e: IcebergParseException =>
+ throw e.withCommand(command)
+ case e: AnalysisException =>
+ val position = Origin(e.line, e.startPosition)
+ throw new IcebergParseException(Option(command), e.message, position, position)
+ }
+ }
+
+ override def parseQuery(sqlText: String): LogicalPlan = {
+ parsePlan(sqlText)
+ }
+}
+
+object IcebergSparkSqlExtensionsParser {
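+  // Resolve VariableSubstitution reflectively since its constructor signature has differed across
+  // Spark releases: prefer the no-arg constructor and fall back to the one taking a SQLConf.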
+ private val substitutorCtor: DynConstructors.Ctor[VariableSubstitution] =
+ DynConstructors
+ .builder()
+ .impl(classOf[VariableSubstitution])
+ .impl(classOf[VariableSubstitution], classOf[SQLConf])
+ .build()
+}
+
+/* Copied from Apache Spark's parser to avoid a dependency on Spark internals */
+class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream {
+ override def consume(): Unit = wrapped.consume
+ override def getSourceName(): String = wrapped.getSourceName
+ override def index(): Int = wrapped.index
+ override def mark(): Int = wrapped.mark
+ override def release(marker: Int): Unit = wrapped.release(marker)
+ override def seek(where: Int): Unit = wrapped.seek(where)
+ override def size(): Int = wrapped.size
+
+ override def getText(interval: Interval): String = wrapped.getText(interval)
+
+ // scalastyle:off
+ override def LA(i: Int): Int = {
+ val la = wrapped.LA(i)
+ if (la == 0 || la == IntStream.EOF) la
+ else Character.toUpperCase(la)
+ }
+ // scalastyle:on
+}
+
+/**
+ * The post-processor validates and cleans up the parse tree during the parse process.
+ */
+case object IcebergSqlExtensionsPostProcessor extends IcebergSqlExtensionsBaseListener {
+
+ /** Remove the back ticks from an Identifier. */
+ override def exitQuotedIdentifier(ctx: QuotedIdentifierContext): Unit = {
+ replaceTokenByIdentifier(ctx, 1) { token =>
+ // Remove the double back ticks in the string.
+ token.setText(token.getText.replace("``", "`"))
+ token
+ }
+ }
+
+ /** Treat non-reserved keywords as Identifiers. */
+ override def exitNonReserved(ctx: NonReservedContext): Unit = {
+ replaceTokenByIdentifier(ctx, 0)(identity)
+ }
+
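+  // Rebuilds the matched token as an IDENTIFIER; stripMargins trims that many characters from
+  // each end of the token, which drops the surrounding backticks for quoted identifiers.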
+ private def replaceTokenByIdentifier(ctx: ParserRuleContext, stripMargins: Int)(
+ f: CommonToken => CommonToken = identity): Unit = {
+ val parent = ctx.getParent
+ parent.removeLastChild()
+ val token = ctx.getChild(0).getPayload.asInstanceOf[Token]
+ val newToken = new CommonToken(
+ new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream),
+ IcebergSqlExtensionsParser.IDENTIFIER,
+ token.getChannel,
+ token.getStartIndex + stripMargins,
+ token.getStopIndex - stripMargins)
+ parent.addChild(new TerminalNodeImpl(f(newToken)))
+ }
+}
+
+/* Partially copied from Apache Spark's Parser to avoid dependency on Spark Internals */
+case object IcebergParseErrorListener extends BaseErrorListener {
+ override def syntaxError(
+ recognizer: Recognizer[_, _],
+ offendingSymbol: scala.Any,
+ line: Int,
+ charPositionInLine: Int,
+ msg: String,
+ e: RecognitionException): Unit = {
+ val (start, stop) = offendingSymbol match {
+ case token: CommonToken =>
+ val start = Origin(Some(line), Some(token.getCharPositionInLine))
+ val length = token.getStopIndex - token.getStartIndex + 1
+ val stop = Origin(Some(line), Some(token.getCharPositionInLine + length))
+ (start, stop)
+ case _ =>
+ val start = Origin(Some(line), Some(charPositionInLine))
+ (start, start)
+ }
+ throw new IcebergParseException(None, msg, start, stop)
+ }
+}
+
+/**
+ * Copied from Apache Spark
+ * A [[ParseException]] is an [[AnalysisException]] that is thrown during the parse process. It
+ * contains fields and an extended error message that make reporting and diagnosing errors easier.
+ */
+class IcebergParseException(
+ val command: Option[String],
+ message: String,
+ val start: Origin,
+ val stop: Origin)
+ extends AnalysisException(message, start.line, start.startPosition) {
+
+ def this(message: String, ctx: ParserRuleContext) = {
+ this(
+ Option(IcebergParserUtils.command(ctx)),
+ message,
+ IcebergParserUtils.position(ctx.getStart),
+ IcebergParserUtils.position(ctx.getStop))
+ }
+
+ override def getMessage: String = {
+ val builder = new StringBuilder
+ builder ++= "\n" ++= message
+ start match {
+ case Origin(Some(l), Some(p), Some(_), Some(_), Some(_), Some(_), Some(_), _, _, _) =>
+ builder ++= s"(line $l, pos $p)\n"
+ command.foreach { cmd =>
+ val (above, below) = cmd.split("\n").splitAt(l)
+ builder ++= "\n== SQL ==\n"
+ above.foreach(builder ++= _ += '\n')
+ builder ++= (0 until p).map(_ => "-").mkString("") ++= "^^^\n"
+ below.foreach(builder ++= _ += '\n')
+ }
+ case _ =>
+ command.foreach { cmd =>
+ builder ++= "\n== SQL ==\n" ++= cmd
+ }
+ }
+ builder.toString
+ }
+
+ def withCommand(cmd: String): IcebergParseException = {
+ new IcebergParseException(Option(cmd), message, start, stop)
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala
new file mode 100644
index 000000000000..724101cfe11d
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/parser/extensions/IcebergSqlExtensionsAstBuilder.scala
@@ -0,0 +1,385 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.parser.extensions
+
+import java.util.Locale
+import java.util.concurrent.TimeUnit
+import org.antlr.v4.runtime._
+import org.antlr.v4.runtime.misc.Interval
+import org.antlr.v4.runtime.tree.ParseTree
+import org.antlr.v4.runtime.tree.TerminalNode
+import org.apache.iceberg.DistributionMode
+import org.apache.iceberg.NullOrder
+import org.apache.iceberg.SortDirection
+import org.apache.iceberg.expressions.Term
+import org.apache.iceberg.spark.Spark3Util
+import org.apache.spark.sql.catalyst.analysis.IcebergAnalysisException
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.catalyst.parser.ParserInterface
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergParserUtils.withOrigin
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergSqlExtensionsParser._
+import org.apache.spark.sql.catalyst.plans.logical.AddPartitionField
+import org.apache.spark.sql.catalyst.plans.logical.BranchOptions
+import org.apache.spark.sql.catalyst.plans.logical.CreateOrReplaceBranch
+import org.apache.spark.sql.catalyst.plans.logical.CreateOrReplaceTag
+import org.apache.spark.sql.catalyst.plans.logical.DropBranch
+import org.apache.spark.sql.catalyst.plans.logical.DropIdentifierFields
+import org.apache.spark.sql.catalyst.plans.logical.DropPartitionField
+import org.apache.spark.sql.catalyst.plans.logical.DropTag
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.ReplacePartitionField
+import org.apache.spark.sql.catalyst.plans.logical.SetIdentifierFields
+import org.apache.spark.sql.catalyst.plans.logical.SetWriteDistributionAndOrdering
+import org.apache.spark.sql.catalyst.plans.logical.TagOptions
+import org.apache.spark.sql.catalyst.trees.CurrentOrigin
+import org.apache.spark.sql.catalyst.trees.Origin
+import org.apache.spark.sql.connector.expressions
+import org.apache.spark.sql.connector.expressions.ApplyTransform
+import org.apache.spark.sql.connector.expressions.FieldReference
+import org.apache.spark.sql.connector.expressions.IdentityTransform
+import org.apache.spark.sql.connector.expressions.LiteralValue
+import org.apache.spark.sql.connector.expressions.Transform
+import scala.jdk.CollectionConverters._
+
+class IcebergSqlExtensionsAstBuilder(delegate: ParserInterface)
+ extends IcebergSqlExtensionsBaseVisitor[AnyRef] {
+
+ private def toBuffer[T](list: java.util.List[T]): scala.collection.mutable.Buffer[T] =
+ list.asScala
+ private def toSeq[T](list: java.util.List[T]): Seq[T] = toBuffer(list).toSeq
+
+ /**
+ * Create an ADD PARTITION FIELD logical command.
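+   * For example, `ALTER TABLE db.tbl ADD PARTITION FIELD bucket(16, id) AS shard`.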
+ */
+ override def visitAddPartitionField(ctx: AddPartitionFieldContext): AddPartitionField =
+ withOrigin(ctx) {
+ AddPartitionField(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ typedVisit[Transform](ctx.transform),
+ Option(ctx.name).map(_.getText))
+ }
+
+ /**
+ * Create a DROP PARTITION FIELD logical command.
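+   * For example, `ALTER TABLE db.tbl DROP PARTITION FIELD bucket(16, id)`.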
+ */
+ override def visitDropPartitionField(ctx: DropPartitionFieldContext): DropPartitionField =
+ withOrigin(ctx) {
+ DropPartitionField(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ typedVisit[Transform](ctx.transform))
+ }
+
+ /**
+ * Create a CREATE OR REPLACE BRANCH logical command.
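+   * For example, `ALTER TABLE db.tbl CREATE BRANCH IF NOT EXISTS b1 RETAIN 7 DAYS`.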
+ */
+ override def visitCreateOrReplaceBranch(
+ ctx: CreateOrReplaceBranchContext): CreateOrReplaceBranch = withOrigin(ctx) {
+ val createOrReplaceBranchClause = ctx.createReplaceBranchClause()
+
+ val branchName = createOrReplaceBranchClause.identifier()
+ val branchOptionsContext = Option(createOrReplaceBranchClause.branchOptions())
+ val snapshotId = branchOptionsContext
+ .flatMap(branchOptions => Option(branchOptions.snapshotId()))
+ .map(_.getText.toLong)
+ val snapshotRetention =
+ branchOptionsContext.flatMap(branchOptions => Option(branchOptions.snapshotRetention()))
+ val minSnapshotsToKeep = snapshotRetention
+ .flatMap(retention => Option(retention.minSnapshotsToKeep()))
+ .map(minSnapshots => minSnapshots.number().getText.toLong)
+ val maxSnapshotAgeMs = snapshotRetention
+ .flatMap(retention => Option(retention.maxSnapshotAge()))
+ .map(retention =>
+ TimeUnit
+ .valueOf(retention.timeUnit().getText.toUpperCase(Locale.ENGLISH))
+ .toMillis(retention.number().getText.toLong))
+ val branchRetention =
+ branchOptionsContext.flatMap(branchOptions => Option(branchOptions.refRetain()))
+ val branchRefAgeMs = branchRetention.map(retain =>
+ TimeUnit
+ .valueOf(retain.timeUnit().getText.toUpperCase(Locale.ENGLISH))
+ .toMillis(retain.number().getText.toLong))
+ val create = createOrReplaceBranchClause.CREATE() != null
+    val replace = createOrReplaceBranchClause.REPLACE() != null
+ val ifNotExists = createOrReplaceBranchClause.EXISTS() != null
+
+ val branchOptions =
+ BranchOptions(snapshotId, minSnapshotsToKeep, maxSnapshotAgeMs, branchRefAgeMs)
+
+ CreateOrReplaceBranch(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ branchName.getText,
+ branchOptions,
+ create,
+ replace,
+ ifNotExists)
+ }
+
+ /**
+   * Create a CREATE OR REPLACE TAG logical command.
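+   * For example, `ALTER TABLE db.tbl CREATE TAG v1 AS OF VERSION 1234 RETAIN 365 DAYS`.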
+ */
+ override def visitCreateOrReplaceTag(ctx: CreateOrReplaceTagContext): CreateOrReplaceTag =
+ withOrigin(ctx) {
+ val createTagClause = ctx.createReplaceTagClause()
+
+ val tagName = createTagClause.identifier().getText
+
+ val tagOptionsContext = Option(createTagClause.tagOptions())
+ val snapshotId = tagOptionsContext
+ .flatMap(tagOptions => Option(tagOptions.snapshotId()))
+ .map(_.getText.toLong)
+ val tagRetain = tagOptionsContext.flatMap(tagOptions => Option(tagOptions.refRetain()))
+ val tagRefAgeMs = tagRetain.map(retain =>
+ TimeUnit
+ .valueOf(retain.timeUnit().getText.toUpperCase(Locale.ENGLISH))
+ .toMillis(retain.number().getText.toLong))
+ val tagOptions = TagOptions(snapshotId, tagRefAgeMs)
+
+ val create = createTagClause.CREATE() != null
+ val replace = createTagClause.REPLACE() != null
+ val ifNotExists = createTagClause.EXISTS() != null
+
+ CreateOrReplaceTag(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ tagName,
+ tagOptions,
+ create,
+ replace,
+ ifNotExists)
+ }
+
+ /**
+   * Create a DROP BRANCH logical command.
+ */
+ override def visitDropBranch(ctx: DropBranchContext): DropBranch = withOrigin(ctx) {
+ DropBranch(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ ctx.identifier().getText,
+ ctx.EXISTS() != null)
+ }
+
+ /**
+   * Create a DROP TAG logical command.
+ */
+ override def visitDropTag(ctx: DropTagContext): DropTag = withOrigin(ctx) {
+ DropTag(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ ctx.identifier().getText,
+ ctx.EXISTS() != null)
+ }
+
+ /**
+   * Create a REPLACE PARTITION FIELD logical command.
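+   * For example, `ALTER TABLE db.tbl REPLACE PARTITION FIELD ts_day WITH days(ts) AS day_of_ts`.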
+ */
+ override def visitReplacePartitionField(
+ ctx: ReplacePartitionFieldContext): ReplacePartitionField = withOrigin(ctx) {
+ ReplacePartitionField(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ typedVisit[Transform](ctx.transform(0)),
+ typedVisit[Transform](ctx.transform(1)),
+ Option(ctx.name).map(_.getText))
+ }
+
+ /**
+   * Create a SET IDENTIFIER FIELDS logical command.
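+   * For example, `ALTER TABLE db.tbl SET IDENTIFIER FIELDS id, data`.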
+ */
+ override def visitSetIdentifierFields(ctx: SetIdentifierFieldsContext): SetIdentifierFields =
+ withOrigin(ctx) {
+ SetIdentifierFields(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ toSeq(ctx.fieldList.fields).map(_.getText))
+ }
+
+ /**
+   * Create a DROP IDENTIFIER FIELDS logical command.
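+   * For example, `ALTER TABLE db.tbl DROP IDENTIFIER FIELDS data`.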
+ */
+ override def visitDropIdentifierFields(ctx: DropIdentifierFieldsContext): DropIdentifierFields =
+ withOrigin(ctx) {
+ DropIdentifierFields(
+ typedVisit[Seq[String]](ctx.multipartIdentifier),
+ toSeq(ctx.fieldList.fields).map(_.getText))
+ }
+
+ /**
+ * Create a [[SetWriteDistributionAndOrdering]] for changing the write distribution and ordering.
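+   * For example, `ALTER TABLE db.tbl WRITE ORDERED BY category ASC, id DESC`
+   * or `ALTER TABLE db.tbl WRITE DISTRIBUTED BY PARTITION`.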
+ */
+ override def visitSetWriteDistributionAndOrdering(
+ ctx: SetWriteDistributionAndOrderingContext): SetWriteDistributionAndOrdering = {
+
+ val tableName = typedVisit[Seq[String]](ctx.multipartIdentifier)
+
+ val (distributionSpec, orderingSpec) = toDistributionAndOrderingSpec(ctx.writeSpec)
+
+ if (distributionSpec == null && orderingSpec == null) {
+ throw new IcebergAnalysisException(
+ "ALTER TABLE has no changes: missing both distribution and ordering clauses")
+ }
+
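+    // Map the parsed clauses to a distribution mode: an explicit DISTRIBUTED BY clause means HASH,
+    // WRITE UNORDERED means NONE, WRITE LOCALLY ORDERED BY leaves the mode unchanged (None),
+    // and a global WRITE ORDERED BY implies RANGE.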
+ val distributionMode = if (distributionSpec != null) {
+ Some(DistributionMode.HASH)
+ } else if (orderingSpec.UNORDERED != null) {
+ Some(DistributionMode.NONE)
+ } else if (orderingSpec.LOCALLY() != null) {
+ None
+ } else {
+ Some(DistributionMode.RANGE)
+ }
+
+ val ordering = if (orderingSpec != null && orderingSpec.order != null) {
+ toSeq(orderingSpec.order.fields).map(typedVisit[(Term, SortDirection, NullOrder)])
+ } else {
+ Seq.empty
+ }
+
+ SetWriteDistributionAndOrdering(tableName, distributionMode, ordering)
+ }
+
+ private def toDistributionAndOrderingSpec(
+ writeSpec: WriteSpecContext): (WriteDistributionSpecContext, WriteOrderingSpecContext) = {
+
+ if (writeSpec.writeDistributionSpec.size > 1) {
+ throw new IcebergAnalysisException("ALTER TABLE contains multiple distribution clauses")
+ }
+
+ if (writeSpec.writeOrderingSpec.size > 1) {
+ throw new IcebergAnalysisException("ALTER TABLE contains multiple ordering clauses")
+ }
+
+ val distributionSpec = toBuffer(writeSpec.writeDistributionSpec).headOption.orNull
+ val orderingSpec = toBuffer(writeSpec.writeOrderingSpec).headOption.orNull
+
+ (distributionSpec, orderingSpec)
+ }
+
+ /**
+ * Create an order field.
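+   * Defaults to ASC when no direction is given; null ordering defaults to NULLS FIRST for ASC
+   * and NULLS LAST for DESC.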
+ */
+ override def visitOrderField(ctx: OrderFieldContext): (Term, SortDirection, NullOrder) = {
+ val term = Spark3Util.toIcebergTerm(typedVisit[Transform](ctx.transform))
+ val direction = Option(ctx.ASC)
+ .map(_ => SortDirection.ASC)
+ .orElse(Option(ctx.DESC).map(_ => SortDirection.DESC))
+ .getOrElse(SortDirection.ASC)
+ val nullOrder = Option(ctx.FIRST)
+ .map(_ => NullOrder.NULLS_FIRST)
+ .orElse(Option(ctx.LAST).map(_ => NullOrder.NULLS_LAST))
+ .getOrElse(
+ if (direction == SortDirection.ASC) NullOrder.NULLS_FIRST else NullOrder.NULLS_LAST)
+ (term, direction, nullOrder)
+ }
+
+ /**
+ * Create an IdentityTransform for a column reference.
+ */
+ override def visitIdentityTransform(ctx: IdentityTransformContext): Transform = withOrigin(ctx) {
+ IdentityTransform(FieldReference(typedVisit[Seq[String]](ctx.multipartIdentifier())))
+ }
+
+ /**
+ * Create a named Transform from argument expressions.
+ */
+ override def visitApplyTransform(ctx: ApplyTransformContext): Transform = withOrigin(ctx) {
+ val args = toSeq(ctx.arguments).map(typedVisit[expressions.Expression])
+ ApplyTransform(ctx.transformName.getText, args)
+ }
+
+ /**
+ * Create a transform argument from a column reference or a constant.
+ */
+ override def visitTransformArgument(ctx: TransformArgumentContext): expressions.Expression =
+ withOrigin(ctx) {
+ val reference = Option(ctx.multipartIdentifier())
+ .map(typedVisit[Seq[String]])
+ .map(FieldReference(_))
+ val literal = Option(ctx.constant)
+ .map(visitConstant)
+ .map(lit => LiteralValue(lit.value, lit.dataType))
+ reference
+ .orElse(literal)
+ .getOrElse(throw new IcebergParseException(s"Invalid transform argument", ctx))
+ }
+
+ /**
+ * Return a multi-part identifier as Seq[String].
+ */
+ override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] =
+ withOrigin(ctx) {
+ toSeq(ctx.parts).map(_.getText)
+ }
+
+ override def visitSingleOrder(ctx: SingleOrderContext): Seq[(Term, SortDirection, NullOrder)] =
+ withOrigin(ctx) {
+ toSeq(ctx.order.fields).map(typedVisit[(Term, SortDirection, NullOrder)])
+ }
+
+ override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) {
+ visit(ctx.statement).asInstanceOf[LogicalPlan]
+ }
+
+ def visitConstant(ctx: ConstantContext): Literal = {
+ delegate.parseExpression(ctx.getText).asInstanceOf[Literal]
+ }
+
+ override def visitExpression(ctx: ExpressionContext): Expression = {
+    // Reconstruct the SQL string and parse it with the main Spark parser. This avoids duplicating
+    // the logic that builds Spark expressions, but the text still has to be parsed. We cannot call
+    // ctx.getText directly because it does not render spaces correctly, which is why
+    // reconstructSqlString recurses down the tree instead.
+ val sqlString = reconstructSqlString(ctx)
+ delegate.parseExpression(sqlString)
+ }
+
+ private def reconstructSqlString(ctx: ParserRuleContext): String = {
+ toBuffer(ctx.children)
+ .map {
+ case c: ParserRuleContext => reconstructSqlString(c)
+ case t: TerminalNode => t.getText
+ }
+ .mkString(" ")
+ }
+
+ private def typedVisit[T](ctx: ParseTree): T = {
+ ctx.accept(this).asInstanceOf[T]
+ }
+}
+
+/* Partially copied from Apache Spark's Parser to avoid dependency on Spark Internals */
+object IcebergParserUtils {
+
+ private[sql] def withOrigin[T](ctx: ParserRuleContext)(f: => T): T = {
+ val current = CurrentOrigin.get
+ CurrentOrigin.set(position(ctx.getStart))
+ try {
+ f
+ } finally {
+ CurrentOrigin.set(current)
+ }
+ }
+
+ private[sql] def position(token: Token): Origin = {
+ val opt = Option(token)
+ Origin(opt.map(_.getLine), opt.map(_.getCharPositionInLine))
+ }
+
+ /** Get the command which created the token. */
+ private[sql] def command(ctx: ParserRuleContext): String = {
+ val stream = ctx.getStart.getInputStream
+ stream.getText(Interval.of(0, stream.size() - 1))
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala
new file mode 100644
index 000000000000..0a830dbd4f6a
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AddPartitionField.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class AddPartitionField(table: Seq[String], transform: Transform, name: Option[String])
+ extends LeafCommand {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"AddPartitionField ${table.quoted} ${name.map(n => s"$n=").getOrElse("")}${transform.describe}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/BranchOptions.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/BranchOptions.scala
new file mode 100644
index 000000000000..15b908300213
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/BranchOptions.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
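+// Options parsed from branch DDL: numSnapshots is the minimum number of snapshots to keep,
+// while snapshotRetain (max snapshot age) and snapshotRefRetain (max ref age) are in milliseconds.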
+case class BranchOptions(
+ snapshotId: Option[Long],
+ numSnapshots: Option[Long],
+ snapshotRetain: Option[Long],
+ snapshotRefRetain: Option[Long])
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceBranch.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceBranch.scala
new file mode 100644
index 000000000000..6900f6e8cc50
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceBranch.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class CreateOrReplaceBranch(
+ table: Seq[String],
+ branch: String,
+ branchOptions: BranchOptions,
+ create: Boolean,
+ replace: Boolean,
+ ifNotExists: Boolean)
+ extends LeafCommand {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"CreateOrReplaceBranch branch: ${branch} for table: ${table.quoted}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceTag.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceTag.scala
new file mode 100644
index 000000000000..957c68e7a540
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/CreateOrReplaceTag.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class CreateOrReplaceTag(
+ table: Seq[String],
+ tag: String,
+ tagOptions: TagOptions,
+ create: Boolean,
+ replace: Boolean,
+ ifNotExists: Boolean)
+ extends LeafCommand {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"CreateOrReplaceTag tag: ${tag} for table: ${table.quoted}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropBranch.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropBranch.scala
new file mode 100644
index 000000000000..ed4f1f512b85
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropBranch.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class DropBranch(table: Seq[String], branch: String, ifExists: Boolean) extends LeafCommand {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropBranch branch: ${branch} for table: ${table.quoted}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala
new file mode 100644
index 000000000000..1a91806280b3
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropIdentifierFields.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class DropIdentifierFields(table: Seq[String], fields: Seq[String]) extends LeafCommand {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropIdentifierFields ${table.quoted} (${fields.quoted})"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala
new file mode 100644
index 000000000000..ec952c8c7118
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropPartitionField.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class DropPartitionField(table: Seq[String], transform: Transform) extends LeafCommand {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropPartitionField ${table.quoted} ${transform.describe}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropTag.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropTag.scala
new file mode 100644
index 000000000000..da69ca0383a1
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DropTag.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class DropTag(table: Seq[String], tag: String, ifExists: Boolean) extends LeafCommand {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropTag tag: ${tag} for table: ${table.quoted}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala
new file mode 100644
index 000000000000..c2525369e7c7
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ReplacePartitionField.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class ReplacePartitionField(
+ table: Seq[String],
+ transformFrom: Transform,
+ transformTo: Transform,
+ name: Option[String])
+ extends LeafCommand {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"ReplacePartitionField ${table.quoted} ${transformFrom.describe} " +
+ s"with ${name.map(n => s"$n=").getOrElse("")}${transformTo.describe}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala
new file mode 100644
index 000000000000..8cd2c0ddad05
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/SetIdentifierFields.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class SetIdentifierFields(table: Seq[String], fields: Seq[String]) extends LeafCommand {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override def simpleString(maxFields: Int): String = {
+ s"SetIdentifierFields ${table.quoted} (${fields.quoted})"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TagOptions.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TagOptions.scala
new file mode 100644
index 000000000000..6afe1478d747
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TagOptions.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical
+
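+// Options parsed from tag DDL: snapshotRefRetain is the tag's maximum ref age in milliseconds.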
+case class TagOptions(snapshotId: Option[Long], snapshotRefRetain: Option[Long])
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/CreateIcebergView.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/CreateIcebergView.scala
new file mode 100644
index 000000000000..84a00a4a9a88
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/CreateIcebergView.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical.views
+
+import org.apache.spark.sql.catalyst.analysis.AnalysisContext
+import org.apache.spark.sql.catalyst.plans.logical.AnalysisOnlyCommand
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+
+// Align Iceberg's CreateIcebergView with Spark's CreateViewCommand by extending AnalysisOnlyCommand.
+// The command's children are analyzed and then hidden, so the optimizer/planner won't traverse the view body.
+case class CreateIcebergView(
+ child: LogicalPlan,
+ queryText: String,
+ query: LogicalPlan,
+ columnAliases: Seq[String],
+ columnComments: Seq[Option[String]],
+ queryColumnNames: Seq[String] = Seq.empty,
+ comment: Option[String],
+ properties: Map[String, String],
+ allowExisting: Boolean,
+ replace: Boolean,
+ rewritten: Boolean = false,
+ isAnalyzed: Boolean = false)
+ extends AnalysisOnlyCommand {
+
+ override def childrenToAnalyze: Seq[LogicalPlan] = child :: query :: Nil
+
+ override def markAsAnalyzed(analysisContext: AnalysisContext): LogicalPlan = {
+ copy(isAnalyzed = true)
+ }
+
+ override protected def withNewChildrenInternal(
+ newChildren: IndexedSeq[LogicalPlan]): LogicalPlan = {
+ assert(!isAnalyzed)
+ copy(child = newChildren.head, query = newChildren.last)
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/DropIcebergView.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/DropIcebergView.scala
new file mode 100644
index 000000000000..092b6b33fb0c
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/DropIcebergView.scala
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical.views
+
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.UnaryCommand
+
+case class DropIcebergView(child: LogicalPlan, ifExists: Boolean) extends UnaryCommand {
+ override protected def withNewChildInternal(newChild: LogicalPlan): DropIcebergView =
+ copy(child = newChild)
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ResolvedV2View.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ResolvedV2View.scala
new file mode 100644
index 000000000000..4d384e857703
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ResolvedV2View.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical.views
+
+import org.apache.spark.sql.catalyst.analysis.LeafNodeWithoutStats
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+
+case class ResolvedV2View(catalog: ViewCatalog, identifier: Identifier)
+ extends LeafNodeWithoutStats {
+ override def output: Seq[Attribute] = Nil
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ShowIcebergViews.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ShowIcebergViews.scala
new file mode 100644
index 000000000000..cbfe23d94cbe
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/views/ShowIcebergViews.scala
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.catalyst.plans.logical.views
+
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.ShowViews
+import org.apache.spark.sql.catalyst.plans.logical.UnaryCommand
+
+case class ShowIcebergViews(
+ namespace: LogicalPlan,
+ pattern: Option[String],
+ override val output: Seq[Attribute] = ShowViews.getOutputAttrs)
+ extends UnaryCommand {
+ override def child: LogicalPlan = namespace
+
+ override protected def withNewChildInternal(newChild: LogicalPlan): ShowIcebergViews =
+ copy(namespace = newChild)
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala
new file mode 100644
index 000000000000..e28dcfb194b6
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AddPartitionFieldExec.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.Spark3Util
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class AddPartitionFieldExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ transform: Transform,
+ name: Option[String])
+ extends LeafV2CommandExec {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ iceberg.table
+ .updateSpec()
+ .addField(name.orNull, Spark3Util.toIcebergTerm(transform))
+ .commit()
+
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot add partition field to non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"AddPartitionField ${catalog.name}.${ident.quoted} ${name.map(n => s"$n=").getOrElse("")}${transform.describe}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewSetPropertiesExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewSetPropertiesExec.scala
new file mode 100644
index 000000000000..d6630e51ff5a
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewSetPropertiesExec.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.connector.catalog.ViewChange
+
+case class AlterV2ViewSetPropertiesExec(
+ catalog: ViewCatalog,
+ ident: Identifier,
+ properties: Map[String, String])
+ extends LeafV2CommandExec {
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ val changes = properties.map { case (property, value) =>
+ ViewChange.setProperty(property, value)
+ }.toSeq
+
+ catalog.alterView(ident, changes: _*)
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"AlterV2ViewSetProperties: ${ident}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewUnsetPropertiesExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewUnsetPropertiesExec.scala
new file mode 100644
index 000000000000..aa57842f58b5
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterV2ViewUnsetPropertiesExec.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.IcebergAnalysisException
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.connector.catalog.ViewChange
+
+case class AlterV2ViewUnsetPropertiesExec(
+ catalog: ViewCatalog,
+ ident: Identifier,
+ propertyKeys: Seq[String],
+ ifExists: Boolean)
+ extends LeafV2CommandExec {
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ if (!ifExists) {
+ propertyKeys.filterNot(catalog.loadView(ident).properties.containsKey).foreach { property =>
+ throw new IcebergAnalysisException(s"Cannot remove property that is not set: '$property'")
+ }
+ }
+
+ val changes = propertyKeys.map(ViewChange.removeProperty)
+ catalog.alterView(ident, changes: _*)
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"AlterV2ViewUnsetProperties: ${ident}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceBranchExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceBranchExec.scala
new file mode 100644
index 000000000000..baf985f53a22
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceBranchExec.scala
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.plans.logical.BranchOptions
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+
+case class CreateOrReplaceBranchExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ branch: String,
+ branchOptions: BranchOptions,
+ create: Boolean,
+ replace: Boolean,
+ ifNotExists: Boolean)
+ extends LeafV2CommandExec {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val snapshotId: java.lang.Long = branchOptions.snapshotId
+ .orElse(Option(iceberg.table.currentSnapshot()).map(_.snapshotId()))
+ .map(java.lang.Long.valueOf)
+ .orNull
+
+ val manageSnapshots = iceberg.table().manageSnapshots()
+ val refExists = null != iceberg.table().refs().get(branch)
+
+ def safeCreateBranch(): Unit = {
+ if (snapshotId == null) {
+ manageSnapshots.createBranch(branch)
+ } else {
+ manageSnapshots.createBranch(branch, snapshotId)
+ }
+ }
+
+ if (create && replace && !refExists) {
+ safeCreateBranch()
+ } else if (replace) {
+ Preconditions.checkArgument(
+ snapshotId != null,
+ "Cannot complete replace branch operation on %s, main has no snapshot",
+ ident)
+ manageSnapshots.replaceBranch(branch, snapshotId)
+ } else {
+ if (refExists && ifNotExists) {
+ return Nil
+ }
+
+ safeCreateBranch()
+ }
+
+ if (branchOptions.numSnapshots.nonEmpty) {
+ manageSnapshots.setMinSnapshotsToKeep(branch, branchOptions.numSnapshots.get.toInt)
+ }
+
+ if (branchOptions.snapshotRetain.nonEmpty) {
+ manageSnapshots.setMaxSnapshotAgeMs(branch, branchOptions.snapshotRetain.get)
+ }
+
+ if (branchOptions.snapshotRefRetain.nonEmpty) {
+ manageSnapshots.setMaxRefAgeMs(branch, branchOptions.snapshotRefRetain.get)
+ }
+
+ manageSnapshots.commit()
+
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot create or replace branch on non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"CreateOrReplace branch: $branch for table: ${ident.quoted}"
+ }
+}
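For context, this exec node backs Iceberg's branch DDL. A minimal sketch of the SQL it handles, assuming a table demo.db.events in an Iceberg catalog (names and snapshot id are illustrative):

  spark.sql("ALTER TABLE demo.db.events CREATE BRANCH IF NOT EXISTS audit")
  // replace an existing branch and set retention options in one statement
  spark.sql("ALTER TABLE demo.db.events CREATE OR REPLACE BRANCH audit AS OF VERSION 1234 " +
    "RETAIN 30 DAYS WITH SNAPSHOT RETENTION 2 SNAPSHOTS 3 DAYS")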
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceTagExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceTagExec.scala
new file mode 100644
index 000000000000..e486892614cb
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateOrReplaceTagExec.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.plans.logical.TagOptions
+import org.apache.spark.sql.connector.catalog._
+
+case class CreateOrReplaceTagExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ tag: String,
+ tagOptions: TagOptions,
+ create: Boolean,
+ replace: Boolean,
+ ifNotExists: Boolean)
+ extends LeafV2CommandExec {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val snapshotId: java.lang.Long = tagOptions.snapshotId
+ .orElse(Option(iceberg.table.currentSnapshot()).map(_.snapshotId()))
+ .map(java.lang.Long.valueOf)
+ .orNull
+
+ Preconditions.checkArgument(
+ snapshotId != null,
+ "Cannot complete create or replace tag operation on %s, main has no snapshot",
+ ident)
+
+ val manageSnapshot = iceberg.table.manageSnapshots()
+ val refExists = null != iceberg.table().refs().get(tag)
+
+ if (create && replace && !refExists) {
+ manageSnapshot.createTag(tag, snapshotId)
+ } else if (replace) {
+ manageSnapshot.replaceTag(tag, snapshotId)
+ } else {
+ if (refExists && ifNotExists) {
+ return Nil
+ }
+
+ manageSnapshot.createTag(tag, snapshotId)
+ }
+
+ if (tagOptions.snapshotRefRetain.nonEmpty) {
+ manageSnapshot.setMaxRefAgeMs(tag, tagOptions.snapshotRefRetain.get)
+ }
+
+ manageSnapshot.commit()
+
+ case table =>
+ throw new UnsupportedOperationException(s"Cannot create tag to non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"Create tag: $tag for table: ${ident.quoted}"
+ }
+}
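The tag variant follows the same pattern; a hedged sketch with illustrative names and snapshot ids:

  spark.sql("ALTER TABLE demo.db.events CREATE TAG `eoy-2024` AS OF VERSION 1234 RETAIN 365 DAYS")
  spark.sql("ALTER TABLE demo.db.events REPLACE TAG `eoy-2024` AS OF VERSION 5678")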
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateV2ViewExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateV2ViewExec.scala
new file mode 100644
index 000000000000..04f2b2af731d
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateV2ViewExec.scala
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.SupportsReplaceView
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.NoSuchViewException
+import org.apache.spark.sql.catalyst.analysis.ViewAlreadyExistsException
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.connector.catalog.ViewInfo
+import org.apache.spark.sql.types.StructType
+import scala.jdk.CollectionConverters._
+
+case class CreateV2ViewExec(
+ catalog: ViewCatalog,
+ ident: Identifier,
+ queryText: String,
+ viewSchema: StructType,
+ columnAliases: Seq[String],
+ columnComments: Seq[Option[String]],
+ queryColumnNames: Seq[String],
+ comment: Option[String],
+ properties: Map[String, String],
+ allowExisting: Boolean,
+ replace: Boolean)
+ extends LeafV2CommandExec {
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ val currentCatalogName = session.sessionState.catalogManager.currentCatalog.name
+ val currentCatalog =
+ if (!catalog.name().equals(currentCatalogName)) currentCatalogName else null
+ val currentNamespace = session.sessionState.catalogManager.currentNamespace
+
+ val engineVersion = "Spark " + org.apache.spark.SPARK_VERSION
+ val newProperties = properties ++
+ comment.map(ViewCatalog.PROP_COMMENT -> _) ++
+ Map(
+ ViewCatalog.PROP_CREATE_ENGINE_VERSION -> engineVersion,
+ ViewCatalog.PROP_ENGINE_VERSION -> engineVersion)
+
+ if (replace) {
+ // CREATE OR REPLACE VIEW
+ catalog match {
+ case c: SupportsReplaceView =>
+ try {
+ replaceView(c, currentCatalog, currentNamespace, newProperties)
+ } catch {
+ // view might have been concurrently dropped during replace
+ case _: NoSuchViewException =>
+ replaceView(c, currentCatalog, currentNamespace, newProperties)
+ }
+ case _ =>
+ if (catalog.viewExists(ident)) {
+ catalog.dropView(ident)
+ }
+
+ createView(currentCatalog, currentNamespace, newProperties)
+ }
+ } else {
+ try {
+ // CREATE VIEW [IF NOT EXISTS]
+ createView(currentCatalog, currentNamespace, newProperties)
+ } catch {
+ case _: ViewAlreadyExistsException if allowExisting => // Ignore
+ }
+ }
+
+ Nil
+ }
+
+ private def replaceView(
+ supportsReplaceView: SupportsReplaceView,
+ currentCatalog: String,
+ currentNamespace: Array[String],
+ newProperties: Map[String, String]) = {
+ supportsReplaceView.replaceView(
+ ident,
+ queryText,
+ currentCatalog,
+ currentNamespace,
+ viewSchema,
+ queryColumnNames.toArray,
+ columnAliases.toArray,
+ columnComments.map(c => c.orNull).toArray,
+ newProperties.asJava)
+ }
+
+ private def createView(
+ currentCatalog: String,
+ currentNamespace: Array[String],
+ newProperties: Map[String, String]) = {
+ val viewInfo: ViewInfo = new ViewInfo(
+ ident,
+ queryText,
+ currentCatalog,
+ currentNamespace,
+ viewSchema,
+ queryColumnNames.toArray,
+ columnAliases.toArray,
+ columnComments.map(c => c.orNull).toArray,
+ newProperties.asJava)
+ catalog.createView(viewInfo)
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"CreateV2ViewExec: ${ident}"
+ }
+}
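For context, CreateV2ViewExec is reached through ordinary Spark view DDL once the identifier resolves to an Iceberg ViewCatalog. A minimal sketch with hypothetical names:

  spark.sql(
    """CREATE OR REPLACE VIEW demo.db.event_agg (id COMMENT 'event id', total)
      |COMMENT 'daily rollup'
      |TBLPROPERTIES ('owner' = 'data-eng')
      |AS SELECT id, count(*) FROM demo.db.events GROUP BY id""".stripMargin)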
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeV2ViewExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeV2ViewExec.scala
new file mode 100644
index 000000000000..106734d2078f
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeV2ViewExec.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.util.escapeSingleQuotedString
+import org.apache.spark.sql.connector.catalog.View
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.execution.LeafExecNode
+import scala.jdk.CollectionConverters._
+
+case class DescribeV2ViewExec(output: Seq[Attribute], view: View, isExtended: Boolean)
+ extends V2CommandExec
+ with LeafExecNode {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override protected def run(): Seq[InternalRow] = {
+ if (isExtended) {
+ (describeSchema :+ emptyRow) ++ describeExtended
+ } else {
+ describeSchema
+ }
+ }
+
+ private def describeSchema: Seq[InternalRow] =
+ view.schema().map { column =>
+ toCatalystRow(column.name, column.dataType.simpleString, column.getComment().getOrElse(""))
+ }
+
+ private def emptyRow: InternalRow = toCatalystRow("", "", "")
+
+ private def describeExtended: Seq[InternalRow] = {
+ val outputColumns = view.queryColumnNames.mkString("[", ", ", "]")
+ val properties: Map[String, String] =
+ view.properties.asScala.toMap -- ViewCatalog.RESERVED_PROPERTIES.asScala
+ val viewCatalogAndNamespace: Seq[String] = view.name.split("\\.").take(2).toIndexedSeq
+ val viewProperties = properties.toSeq
+ .sortBy(_._1)
+ .map { case (key, value) =>
+ s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
+ }
+ .mkString("[", ", ", "]")
+
+ // omitting view text here because it is shown as
+ // part of SHOW CREATE TABLE and can result in weird formatting in the DESCRIBE output
+ toCatalystRow("# Detailed View Information", "", "") ::
+ toCatalystRow("Comment", view.properties.getOrDefault(ViewCatalog.PROP_COMMENT, ""), "") ::
+ toCatalystRow("View Catalog and Namespace", viewCatalogAndNamespace.quoted, "") ::
+ toCatalystRow("View Query Output Columns", outputColumns, "") ::
+ toCatalystRow("View Properties", viewProperties, "") ::
+ toCatalystRow(
+ "Created By",
+ view.properties.getOrDefault(ViewCatalog.PROP_CREATE_ENGINE_VERSION, ""),
+ "") ::
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DescribeV2ViewExec"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropBranchExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropBranchExec.scala
new file mode 100644
index 000000000000..e7d9c7b70d82
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropBranchExec.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+
+case class DropBranchExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ branch: String,
+ ifExists: Boolean)
+ extends LeafV2CommandExec {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val ref = iceberg.table().refs().get(branch)
+ if (ref != null || !ifExists) {
+ iceberg.table().manageSnapshots().removeBranch(branch).commit()
+ }
+
+ case table =>
+ throw new UnsupportedOperationException(s"Cannot drop branch on non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropBranch branch: ${branch} for table: ${ident.quoted}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala
new file mode 100644
index 000000000000..87b18594d573
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropIdentifierFieldsExec.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions
+import org.apache.iceberg.relocated.com.google.common.collect.Sets
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+
+case class DropIdentifierFieldsExec(catalog: TableCatalog, ident: Identifier, fields: Seq[String])
+ extends LeafV2CommandExec {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val schema = iceberg.table.schema
+ val identifierFieldNames = Sets.newHashSet(schema.identifierFieldNames)
+
+ for (name <- fields) {
+ Preconditions.checkArgument(
+ schema.findField(name) != null,
+ "Cannot complete drop identifier fields operation: field %s not found",
+ name)
+ Preconditions.checkArgument(
+ identifierFieldNames.contains(name),
+ "Cannot complete drop identifier fields operation: %s is not an identifier field",
+ name)
+ identifierFieldNames.remove(name)
+ }
+
+ iceberg.table
+ .updateSchema()
+ .setIdentifierFields(identifierFieldNames)
+ .commit();
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot drop identifier fields in non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropIdentifierFields ${catalog.name}.${ident.quoted} (${fields.quoted})";
+ }
+}
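For context, identifier-field maintenance is exposed through Iceberg's ALTER TABLE extensions; a sketch with hypothetical names:

  spark.sql("ALTER TABLE demo.db.events SET IDENTIFIER FIELDS id, ts")
  // DropIdentifierFieldsExec rejects fields that are unknown or not currently identifier fields
  spark.sql("ALTER TABLE demo.db.events DROP IDENTIFIER FIELDS ts")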
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala
new file mode 100644
index 000000000000..db43263e0e66
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropPartitionFieldExec.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.Spark3Util
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.connector.expressions.FieldReference
+import org.apache.spark.sql.connector.expressions.IdentityTransform
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class DropPartitionFieldExec(catalog: TableCatalog, ident: Identifier, transform: Transform)
+ extends LeafV2CommandExec {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val schema = iceberg.table.schema
+ transform match {
+ case IdentityTransform(FieldReference(parts))
+ if parts.size == 1 && schema.findField(parts.head) == null =>
+ // the name is not present in the Iceberg schema, so it must be a partition field name, not a column name
+ iceberg.table
+ .updateSpec()
+ .removeField(parts.head)
+ .commit()
+
+ case _ =>
+ iceberg.table
+ .updateSpec()
+ .removeField(Spark3Util.toIcebergTerm(transform))
+ .commit()
+ }
+
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot drop partition field in non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropPartitionField ${catalog.name}.${ident.quoted} ${transform.describe}"
+ }
+}
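The two match arms above correspond to dropping by source transform versus by partition field name; a hedged sketch with illustrative names:

  spark.sql("ALTER TABLE demo.db.events DROP PARTITION FIELD bucket(16, id)")
  // 'shard' is treated as a partition field name because no column with that name exists
  spark.sql("ALTER TABLE demo.db.events DROP PARTITION FIELD shard")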
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTagExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTagExec.scala
new file mode 100644
index 000000000000..79b4a1525591
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTagExec.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+
+case class DropTagExec(catalog: TableCatalog, ident: Identifier, tag: String, ifExists: Boolean)
+ extends LeafV2CommandExec {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val ref = iceberg.table().refs().get(tag)
+ if (ref != null || !ifExists) {
+ iceberg.table().manageSnapshots().removeTag(tag).commit()
+ }
+
+ case table =>
+ throw new UnsupportedOperationException(s"Cannot drop tag on non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropTag tag: ${tag} for table: ${ident.quoted}"
+ }
+}
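Both drop execs honor IF EXISTS by skipping the commit when the ref is absent; a sketch with illustrative names:

  spark.sql("ALTER TABLE demo.db.events DROP BRANCH IF EXISTS audit")
  spark.sql("ALTER TABLE demo.db.events DROP TAG IF EXISTS `eoy-2024`")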
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropV2ViewExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropV2ViewExec.scala
new file mode 100644
index 000000000000..6dd1188b78e8
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropV2ViewExec.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.NoSuchViewException
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+
+case class DropV2ViewExec(catalog: ViewCatalog, ident: Identifier, ifExists: Boolean)
+ extends LeafV2CommandExec {
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ val dropped = catalog.dropView(ident)
+ if (!dropped && !ifExists) {
+ throw new NoSuchViewException(ident)
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"DropV2View: ${ident}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala
new file mode 100644
index 000000000000..da540f5891b7
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ExtendedDataSourceV2Strategy.scala
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.Spark3Util
+import org.apache.iceberg.spark.SparkCatalog
+import org.apache.iceberg.spark.SparkSessionCatalog
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.analysis.IcebergAnalysisException
+import org.apache.spark.sql.catalyst.analysis.ResolvedIdentifier
+import org.apache.spark.sql.catalyst.analysis.ResolvedNamespace
+import org.apache.spark.sql.catalyst.expressions.PredicateHelper
+import org.apache.spark.sql.catalyst.plans.logical.AddPartitionField
+import org.apache.spark.sql.catalyst.plans.logical.CreateOrReplaceBranch
+import org.apache.spark.sql.catalyst.plans.logical.CreateOrReplaceTag
+import org.apache.spark.sql.catalyst.plans.logical.DescribeRelation
+import org.apache.spark.sql.catalyst.plans.logical.DropBranch
+import org.apache.spark.sql.catalyst.plans.logical.DropIdentifierFields
+import org.apache.spark.sql.catalyst.plans.logical.DropPartitionField
+import org.apache.spark.sql.catalyst.plans.logical.DropTag
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.OrderAwareCoalesce
+import org.apache.spark.sql.catalyst.plans.logical.RenameTable
+import org.apache.spark.sql.catalyst.plans.logical.ReplacePartitionField
+import org.apache.spark.sql.catalyst.plans.logical.SetIdentifierFields
+import org.apache.spark.sql.catalyst.plans.logical.SetViewProperties
+import org.apache.spark.sql.catalyst.plans.logical.SetWriteDistributionAndOrdering
+import org.apache.spark.sql.catalyst.plans.logical.ShowCreateTable
+import org.apache.spark.sql.catalyst.plans.logical.ShowTableProperties
+import org.apache.spark.sql.catalyst.plans.logical.UnsetViewProperties
+import org.apache.spark.sql.catalyst.plans.logical.views.CreateIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.DropIcebergView
+import org.apache.spark.sql.catalyst.plans.logical.views.ResolvedV2View
+import org.apache.spark.sql.catalyst.plans.logical.views.ShowIcebergViews
+import org.apache.spark.sql.classic.Strategy
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.execution.OrderAwareCoalesceExec
+import org.apache.spark.sql.execution.SparkPlan
+import scala.jdk.CollectionConverters._
+
+case class ExtendedDataSourceV2Strategy(spark: SparkSession) extends Strategy with PredicateHelper {
+
+ override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+ case AddPartitionField(IcebergCatalogAndIdentifier(catalog, ident), transform, name) =>
+ AddPartitionFieldExec(catalog, ident, transform, name) :: Nil
+
+ case CreateOrReplaceBranch(
+ IcebergCatalogAndIdentifier(catalog, ident),
+ branch,
+ branchOptions,
+ create,
+ replace,
+ ifNotExists) =>
+ CreateOrReplaceBranchExec(
+ catalog,
+ ident,
+ branch,
+ branchOptions,
+ create,
+ replace,
+ ifNotExists) :: Nil
+
+ case CreateOrReplaceTag(
+ IcebergCatalogAndIdentifier(catalog, ident),
+ tag,
+ tagOptions,
+ create,
+ replace,
+ ifNotExists) =>
+ CreateOrReplaceTagExec(catalog, ident, tag, tagOptions, create, replace, ifNotExists) :: Nil
+
+ case DropBranch(IcebergCatalogAndIdentifier(catalog, ident), branch, ifExists) =>
+ DropBranchExec(catalog, ident, branch, ifExists) :: Nil
+
+ case DropTag(IcebergCatalogAndIdentifier(catalog, ident), tag, ifExists) =>
+ DropTagExec(catalog, ident, tag, ifExists) :: Nil
+
+ case DropPartitionField(IcebergCatalogAndIdentifier(catalog, ident), transform) =>
+ DropPartitionFieldExec(catalog, ident, transform) :: Nil
+
+ case ReplacePartitionField(
+ IcebergCatalogAndIdentifier(catalog, ident),
+ transformFrom,
+ transformTo,
+ name) =>
+ ReplacePartitionFieldExec(catalog, ident, transformFrom, transformTo, name) :: Nil
+
+ case SetIdentifierFields(IcebergCatalogAndIdentifier(catalog, ident), fields) =>
+ SetIdentifierFieldsExec(catalog, ident, fields) :: Nil
+
+ case DropIdentifierFields(IcebergCatalogAndIdentifier(catalog, ident), fields) =>
+ DropIdentifierFieldsExec(catalog, ident, fields) :: Nil
+
+ case SetWriteDistributionAndOrdering(
+ IcebergCatalogAndIdentifier(catalog, ident),
+ distributionMode,
+ ordering) =>
+ SetWriteDistributionAndOrderingExec(catalog, ident, distributionMode, ordering) :: Nil
+
+ case OrderAwareCoalesce(numPartitions, coalescer, child) =>
+ OrderAwareCoalesceExec(numPartitions, coalescer, planLater(child)) :: Nil
+
+ case RenameTable(ResolvedV2View(oldCatalog: ViewCatalog, oldIdent), newName, isView @ true) =>
+ val newIdent = Spark3Util.catalogAndIdentifier(spark, newName.toList.asJava)
+ if (oldCatalog.name != newIdent.catalog().name()) {
+ throw new IcebergAnalysisException(
+ s"Cannot move view between catalogs: from=${oldCatalog.name} and to=${newIdent.catalog().name()}")
+ }
+ RenameV2ViewExec(oldCatalog, oldIdent, newIdent.identifier()) :: Nil
+
+ case DropIcebergView(ResolvedIdentifier(viewCatalog: ViewCatalog, ident), ifExists) =>
+ DropV2ViewExec(viewCatalog, ident, ifExists) :: Nil
+
+ case CreateIcebergView(
+ ResolvedIdentifier(viewCatalog: ViewCatalog, ident),
+ queryText,
+ query,
+ columnAliases,
+ columnComments,
+ queryColumnNames,
+ comment,
+ properties,
+ allowExisting,
+ replace,
+ _,
+ _) =>
+ CreateV2ViewExec(
+ catalog = viewCatalog,
+ ident = ident,
+ queryText = queryText,
+ columnAliases = columnAliases,
+ columnComments = columnComments,
+ queryColumnNames = queryColumnNames,
+ viewSchema = query.schema,
+ comment = comment,
+ properties = properties,
+ allowExisting = allowExisting,
+ replace = replace) :: Nil
+
+ case DescribeRelation(ResolvedV2View(catalog, ident), _, isExtended, output) =>
+ DescribeV2ViewExec(output, catalog.loadView(ident), isExtended) :: Nil
+
+ case ShowTableProperties(ResolvedV2View(catalog, ident), propertyKey, output) =>
+ ShowV2ViewPropertiesExec(output, catalog.loadView(ident), propertyKey) :: Nil
+
+ case ShowIcebergViews(ResolvedNamespace(catalog: ViewCatalog, namespace, _), pattern, output) =>
+ ShowV2ViewsExec(output, catalog, namespace, pattern) :: Nil
+
+ case ShowCreateTable(ResolvedV2View(catalog, ident), _, output) =>
+ ShowCreateV2ViewExec(output, catalog.loadView(ident)) :: Nil
+
+ case SetViewProperties(ResolvedV2View(catalog, ident), properties) =>
+ AlterV2ViewSetPropertiesExec(catalog, ident, properties) :: Nil
+
+ case UnsetViewProperties(ResolvedV2View(catalog, ident), propertyKeys, ifExists) =>
+ AlterV2ViewUnsetPropertiesExec(catalog, ident, propertyKeys, ifExists) :: Nil
+
+ case _ => Nil
+ }
+
+ private object IcebergCatalogAndIdentifier {
+ def unapply(identifier: Seq[String]): Option[(TableCatalog, Identifier)] = {
+ val catalogAndIdentifier = Spark3Util.catalogAndIdentifier(spark, identifier.asJava)
+ catalogAndIdentifier.catalog match {
+ case icebergCatalog: SparkCatalog =>
+ Some((icebergCatalog, catalogAndIdentifier.identifier))
+ case icebergCatalog: SparkSessionCatalog[_] =>
+ Some((icebergCatalog, catalogAndIdentifier.identifier))
+ case _ =>
+ None
+ }
+ }
+ }
+}
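For context, a strategy like this only takes effect once it is injected into the session planner. A minimal registration sketch, assuming the usual SparkSessionExtensions mechanism (the class name here is hypothetical; Iceberg wires this up in its own extensions class):

  import org.apache.spark.sql.SparkSessionExtensions

  class ExtensionsSketch extends (SparkSessionExtensions => Unit) {
    override def apply(extensions: SparkSessionExtensions): Unit = {
      // the builder receives the active SparkSession when the planner is created
      extensions.injectPlannerStrategy(spark => ExtendedDataSourceV2Strategy(spark))
    }
  }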
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameV2ViewExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameV2ViewExec.scala
new file mode 100644
index 000000000000..5dada1cab0bb
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameV2ViewExec.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+
+case class RenameV2ViewExec(catalog: ViewCatalog, oldIdent: Identifier, newIdent: Identifier)
+ extends LeafV2CommandExec {
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.renameView(oldIdent, newIdent)
+
+ Seq.empty
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"RenameV2View ${oldIdent} to {newIdent}"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala
new file mode 100644
index 000000000000..00b998c49e83
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplacePartitionFieldExec.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.Spark3Util
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.connector.expressions.FieldReference
+import org.apache.spark.sql.connector.expressions.IdentityTransform
+import org.apache.spark.sql.connector.expressions.Transform
+
+case class ReplacePartitionFieldExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ transformFrom: Transform,
+ transformTo: Transform,
+ name: Option[String])
+ extends LeafV2CommandExec {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val schema = iceberg.table.schema
+ transformFrom match {
+ case IdentityTransform(FieldReference(parts))
+ if parts.size == 1 && schema.findField(parts.head) == null =>
+ // the name is not present in the Iceberg schema, so it must be a partition field name, not a column name
+ iceberg.table
+ .updateSpec()
+ .removeField(parts.head)
+ .addField(name.orNull, Spark3Util.toIcebergTerm(transformTo))
+ .commit()
+
+ case _ =>
+ iceberg.table
+ .updateSpec()
+ .removeField(Spark3Util.toIcebergTerm(transformFrom))
+ .addField(name.orNull, Spark3Util.toIcebergTerm(transformTo))
+ .commit()
+ }
+
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot replace partition field in non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"ReplacePartitionField ${catalog.name}.${ident.quoted} ${transformFrom.describe} " +
+ s"with ${name.map(n => s"$n=").getOrElse("")}${transformTo.describe}"
+ }
+}
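REPLACE PARTITION FIELD is the one-statement form of dropping one field and adding another; a hedged sketch with illustrative names:

  spark.sql("ALTER TABLE demo.db.events REPLACE PARTITION FIELD day(ts) WITH hour(ts) AS ts_hour")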
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala
new file mode 100644
index 000000000000..50c53473ab60
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetIdentifierFieldsExec.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+import scala.jdk.CollectionConverters._
+
+case class SetIdentifierFieldsExec(catalog: TableCatalog, ident: Identifier, fields: Seq[String])
+ extends LeafV2CommandExec {
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ iceberg.table
+ .updateSchema()
+ .setIdentifierFields(fields.asJava)
+ .commit();
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot set identifier fields in non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"SetIdentifierFields ${catalog.name}.${ident.quoted} (${fields.quoted})";
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala
new file mode 100644
index 000000000000..9a10949d5e9e
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetWriteDistributionAndOrderingExec.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.iceberg.DistributionMode
+import org.apache.iceberg.NullOrder
+import org.apache.iceberg.SortDirection
+import org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE
+import org.apache.iceberg.expressions.Term
+import org.apache.iceberg.spark.SparkUtil
+import org.apache.iceberg.spark.source.SparkTable
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits
+import org.apache.spark.sql.connector.catalog.Identifier
+import org.apache.spark.sql.connector.catalog.TableCatalog
+
+case class SetWriteDistributionAndOrderingExec(
+ catalog: TableCatalog,
+ ident: Identifier,
+ distributionMode: Option[DistributionMode],
+ sortOrder: Seq[(Term, SortDirection, NullOrder)])
+ extends LeafV2CommandExec {
+
+ import CatalogV2Implicits._
+
+ override lazy val output: Seq[Attribute] = Nil
+
+ override protected def run(): Seq[InternalRow] = {
+ catalog.loadTable(ident) match {
+ case iceberg: SparkTable =>
+ val txn = iceberg.table.newTransaction()
+
+ val orderBuilder = txn.replaceSortOrder().caseSensitive(SparkUtil.caseSensitive(session))
+ sortOrder.foreach {
+ case (term, SortDirection.ASC, nullOrder) =>
+ orderBuilder.asc(term, nullOrder)
+ case (term, SortDirection.DESC, nullOrder) =>
+ orderBuilder.desc(term, nullOrder)
+ }
+ orderBuilder.commit()
+
+ distributionMode.foreach { mode =>
+ txn
+ .updateProperties()
+ .set(WRITE_DISTRIBUTION_MODE, mode.modeName())
+ .commit()
+ }
+
+ txn.commitTransaction()
+
+ case table =>
+ throw new UnsupportedOperationException(
+ s"Cannot set write order of non-Iceberg table: $table")
+ }
+
+ Nil
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ val tableIdent = s"${catalog.name}.${ident.quoted}"
+ val order = sortOrder
+ .map { case (term, direction, nullOrder) =>
+ s"$term $direction $nullOrder"
+ }
+ .mkString(", ")
+ s"SetWriteDistributionAndOrdering $tableIdent $distributionMode $order"
+ }
+}
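For context, the write distribution and ordering DDL handled here looks like the following (names illustrative):

  spark.sql("ALTER TABLE demo.db.events WRITE ORDERED BY category ASC NULLS LAST, id DESC")
  spark.sql("ALTER TABLE demo.db.events WRITE DISTRIBUTED BY PARTITION LOCALLY ORDERED BY id")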
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateV2ViewExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateV2ViewExec.scala
new file mode 100644
index 000000000000..07ac4aeda8fb
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCreateV2ViewExec.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.util.escapeSingleQuotedString
+import org.apache.spark.sql.connector.catalog.View
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.execution.LeafExecNode
+import scala.jdk.CollectionConverters._
+
+case class ShowCreateV2ViewExec(output: Seq[Attribute], view: View)
+ extends V2CommandExec
+ with LeafExecNode {
+
+ override protected def run(): Seq[InternalRow] = {
+ val builder = new StringBuilder
+ builder ++= s"CREATE VIEW ${view.name} "
+ showColumns(view, builder)
+ showComment(view, builder)
+ showProperties(view, builder)
+ builder ++= s"AS\n${view.query}\n"
+
+ Seq(toCatalystRow(builder.toString))
+ }
+
+ private def showColumns(view: View, builder: StringBuilder): Unit = {
+ val columns = concatByMultiLines(
+ view
+ .schema()
+ .fields
+ .map(x => s"${x.name}${x.getComment().map(c => s" COMMENT '$c'").getOrElse("")}"))
+ builder ++= columns
+ }
+
+ private def showComment(view: View, builder: StringBuilder): Unit = {
+ Option(view.properties.get(ViewCatalog.PROP_COMMENT))
+ .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n")
+ .foreach(builder.append)
+ }
+
+ private def showProperties(view: View, builder: StringBuilder): Unit = {
+ val showProps = view.properties.asScala.toMap -- ViewCatalog.RESERVED_PROPERTIES.asScala
+ if (showProps.nonEmpty) {
+ val props = conf.redactOptions(showProps).toSeq.sortBy(_._1).map { case (key, value) =>
+ s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
+ }
+
+ builder ++= "TBLPROPERTIES "
+ builder ++= concatByMultiLines(props)
+ }
+ }
+
+ private def concatByMultiLines(iter: Iterable[String]): String = {
+ iter.mkString("(\n ", ",\n ", ")\n")
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"ShowCreateV2ViewExec"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewPropertiesExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewPropertiesExec.scala
new file mode 100644
index 000000000000..ace43eb6c07b
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewPropertiesExec.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.connector.catalog.View
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.execution.LeafExecNode
+import scala.jdk.CollectionConverters._
+
+case class ShowV2ViewPropertiesExec(output: Seq[Attribute], view: View, propertyKey: Option[String])
+ extends V2CommandExec
+ with LeafExecNode {
+
+ override protected def run(): Seq[InternalRow] = {
+ propertyKey match {
+ case Some(p) =>
+ val propValue = properties.getOrElse(p, s"View ${view.name()} does not have property: $p")
+ Seq(toCatalystRow(p, propValue))
+ case None =>
+ properties.map { case (k, v) =>
+ toCatalystRow(k, v)
+ }.toSeq
+ }
+ }
+
+ private def properties = {
+ view.properties.asScala.toMap -- ViewCatalog.RESERVED_PROPERTIES.asScala
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"ShowV2ViewPropertiesExec"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewsExec.scala b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewsExec.scala
new file mode 100644
index 000000000000..4e7700b43978
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowV2ViewsExec.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.spark.sql.execution.datasources.v2
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.util.StringUtils
+import org.apache.spark.sql.connector.catalog.ViewCatalog
+import org.apache.spark.sql.execution.LeafExecNode
+import org.apache.spark.sql.internal.SQLConf
+import scala.collection.mutable.ArrayBuffer
+
+case class ShowV2ViewsExec(
+ output: Seq[Attribute],
+ catalog: ViewCatalog,
+ namespace: Seq[String],
+ pattern: Option[String])
+ extends V2CommandExec
+ with LeafExecNode {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override protected def run(): Seq[InternalRow] = {
+ val rows = new ArrayBuffer[InternalRow]()
+
+ // handle GLOBAL VIEWS
+ val globalTemp = SQLConf.get.globalTempDatabase
+ if (namespace.nonEmpty && globalTemp == namespace.head) {
+ pattern
+ .map(p => session.sessionState.catalog.globalTempViewManager.listViewNames(p))
+ .getOrElse(session.sessionState.catalog.globalTempViewManager.listViewNames("*"))
+ .map(name => rows += toCatalystRow(globalTemp, name, true))
+ } else {
+ val views = catalog.listViews(namespace: _*)
+ views.map { view =>
+ if (pattern.map(StringUtils.filterPattern(Seq(view.name()), _).nonEmpty).getOrElse(true)) {
+ rows += toCatalystRow(view.namespace().quoted, view.name(), false)
+ }
+ }
+ }
+
+ // include TEMP VIEWS
+ pattern
+ .map(p => session.sessionState.catalog.listLocalTempViews(p))
+ .getOrElse(session.sessionState.catalog.listLocalTempViews("*"))
+ .map(v => rows += toCatalystRow(v.database.toArray.quoted, v.table, true))
+
+ rows.toSeq
+ }
+
+ override def simpleString(maxFields: Int): String = {
+ s"ShowV2ViewsExec"
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java
new file mode 100644
index 000000000000..bfcb5af235d3
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.List;
+import org.apache.iceberg.NullOrder;
+import org.apache.iceberg.SortDirection;
+import org.apache.iceberg.expressions.Term;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.catalyst.parser.AbstractSqlParser;
+import org.apache.spark.sql.catalyst.parser.AstBuilder;
+import org.apache.spark.sql.catalyst.parser.ParserInterface;
+import org.apache.spark.sql.catalyst.parser.extensions.IcebergSparkSqlExtensionsParser;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+public class TestExtendedParser {
+
+ private static SparkSession spark;
+ private static final String SQL_PARSER_FIELD = "sqlParser";
+ private ParserInterface originalParser;
+
+ @BeforeAll
+ public static void before() {
+ spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate();
+ }
+
+ @AfterAll
+ public static void after() {
+ if (spark != null) {
+ spark.stop();
+ }
+ }
+
+ @BeforeEach
+ public void saveOriginalParser() throws Exception {
+ Class<?> clazz = spark.sessionState().getClass();
+ Field parserField = null;
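+ // The sqlParser field may be declared on a superclass of the concrete SessionState, so walk up the class hierarchy until it is found.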
+ while (clazz != null && parserField == null) {
+ try {
+ parserField = clazz.getDeclaredField(SQL_PARSER_FIELD);
+ } catch (NoSuchFieldException e) {
+ clazz = clazz.getSuperclass();
+ }
+ }
+ parserField.setAccessible(true);
+ originalParser = (ParserInterface) parserField.get(spark.sessionState());
+ }
+
+ @AfterEach
+ public void restoreOriginalParser() throws Exception {
+ setSessionStateParser(spark.sessionState(), originalParser);
+ }
+
+ /**
+ * Tests that the Iceberg extended SQL parser can correctly parse a sort order string and return
+ * the expected RawOrderField.
+ *
+ * @throws Exception if reflection access fails
+ */
+ @Test
+ public void testParseSortOrderWithRealIcebergExtendedParser() throws Exception {
+ ParserInterface origParser = null;
+ Class<?> clazz = spark.sessionState().getClass();
+ while (clazz != null && origParser == null) {
+ try {
+ Field parserField = clazz.getDeclaredField(SQL_PARSER_FIELD);
+ parserField.setAccessible(true);
+ origParser = (ParserInterface) parserField.get(spark.sessionState());
+ } catch (NoSuchFieldException e) {
+ clazz = clazz.getSuperclass();
+ }
+ }
+ assertThat(origParser).isNotNull();
+
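+ // Wrap the session's original parser with the Iceberg extensions parser and swap it in via reflection.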
+ IcebergSparkSqlExtensionsParser icebergParser = new IcebergSparkSqlExtensionsParser(origParser);
+
+ setSessionStateParser(spark.sessionState(), icebergParser);
+
+ List<ExtendedParser.RawOrderField> fields =
+ ExtendedParser.parseSortOrder(spark, "id ASC NULLS FIRST");
+
+ assertThat(fields).isNotEmpty();
+ ExtendedParser.RawOrderField first = fields.get(0);
+ assertThat(first.direction()).isEqualTo(SortDirection.ASC);
+ assertThat(first.nullOrder()).isEqualTo(NullOrder.NULLS_FIRST);
+ }
+
+ /**
+ * Tests that parseSortOrder can find and use an ExtendedParser that is wrapped inside another
+ * ParserInterface implementation.
+ *
+ * @throws Exception if reflection access fails
+ */
+ @Test
+ public void testParseSortOrderFindsNestedExtendedParser() throws Exception {
+ ExtendedParser icebergParser = mock(ExtendedParser.class);
+
+ ExtendedParser.RawOrderField field =
+ new ExtendedParser.RawOrderField(
+ mock(Term.class), SortDirection.ASC, NullOrder.NULLS_FIRST);
+ List<ExtendedParser.RawOrderField> expected = Collections.singletonList(field);
+
+ when(icebergParser.parseSortOrder("id ASC NULLS FIRST")).thenReturn(expected);
+
+ ParserInterface wrapper = new WrapperParser(icebergParser);
+
+ setSessionStateParser(spark.sessionState(), wrapper);
+
+ List<ExtendedParser.RawOrderField> result =
+ ExtendedParser.parseSortOrder(spark, "id ASC NULLS FIRST");
+ assertThat(result).isSameAs(expected);
+
+ verify(icebergParser).parseSortOrder("id ASC NULLS FIRST");
+ }
+
+ /**
+ * Tests that parseSortOrder throws an exception if no ExtendedParser instance can be found in the
+ * parser chain.
+ *
+ * @throws Exception if reflection access fails
+ */
+ @Test
+ public void testParseSortOrderThrowsWhenNoExtendedParserFound() throws Exception {
+ ParserInterface dummy = mock(ParserInterface.class);
+ setSessionStateParser(spark.sessionState(), dummy);
+
+ assertThatThrownBy(() -> ExtendedParser.parseSortOrder(spark, "id ASC"))
+ .isInstanceOf(IllegalStateException.class)
+ .hasMessageContaining("Iceberg ExtendedParser");
+ }
+
+ /**
+ * Tests that parseSortOrder can find an ExtendedParser in a parent class field of the parser.
+ *
+ * @throws Exception if reflection access fails
+ */
+ @Test
+ public void testParseSortOrderFindsExtendedParserInParentClassField() throws Exception {
+ ExtendedParser icebergParser = mock(ExtendedParser.class);
+ ExtendedParser.RawOrderField field =
+ new ExtendedParser.RawOrderField(
+ mock(Term.class), SortDirection.ASC, NullOrder.NULLS_FIRST);
+ List<ExtendedParser.RawOrderField> expected = Collections.singletonList(field);
+ when(icebergParser.parseSortOrder("id ASC NULLS FIRST")).thenReturn(expected);
+ ParserInterface parser = new GrandChildParser(icebergParser);
+ setSessionStateParser(spark.sessionState(), parser);
+
+ List<ExtendedParser.RawOrderField> result =
+ ExtendedParser.parseSortOrder(spark, "id ASC NULLS FIRST");
+ assertThat(result).isSameAs(expected);
+ verify(icebergParser).parseSortOrder("id ASC NULLS FIRST");
+ }
+
+ private static void setSessionStateParser(Object sessionState, ParserInterface parser)
+ throws Exception {
+ Class<?> clazz = sessionState.getClass();
+ Field targetField = null;
+ while (clazz != null && targetField == null) {
+ try {
+ targetField = clazz.getDeclaredField(SQL_PARSER_FIELD);
+ } catch (NoSuchFieldException e) {
+ clazz = clazz.getSuperclass();
+ }
+ }
+ if (targetField == null) {
+ throw new IllegalStateException(
+ "No suitable sqlParser field found in sessionState class hierarchy!");
+ }
+ targetField.setAccessible(true);
+ targetField.set(sessionState, parser);
+ }
+
+ private static class WrapperParser extends AbstractSqlParser {
+ private final ParserInterface delegate;
+ private String name;
+
+ WrapperParser(ParserInterface delegate) {
+ this.delegate = delegate;
+ this.name = "delegate";
+ }
+
+ public ParserInterface getDelegate() {
+ return delegate;
+ }
+
+ @Override
+ public AstBuilder astBuilder() {
+ return null;
+ }
+ }
+
+ private static class ChildParser extends WrapperParser {
+ ChildParser(ParserInterface parent) {
+ super(parent);
+ }
+ }
+
+ private static class GrandChildParser extends ChildParser {
+ GrandChildParser(ParserInterface parent) {
+ super(parent);
+ }
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java
new file mode 100644
index 000000000000..8918dfec6584
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import java.util.Objects;
+
+public class Employee {
+ private Integer id;
+ private String dep;
+
+ public Employee() {}
+
+ public Employee(Integer id, String dep) {
+ this.id = id;
+ this.dep = dep;
+ }
+
+ public Integer getId() {
+ return id;
+ }
+
+ public void setId(Integer id) {
+ this.id = id;
+ }
+
+ public String getDep() {
+ return dep;
+ }
+
+ public void setDep(String dep) {
+ this.dep = dep;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ } else if (other == null || getClass() != other.getClass()) {
+ return false;
+ }
+
+ Employee employee = (Employee) other;
+ return Objects.equals(id, employee.id) && Objects.equals(dep, employee.dep);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(id, dep);
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java
new file mode 100644
index 000000000000..796c47b545cc
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS;
+
+import java.net.InetAddress;
+import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
+import org.apache.iceberg.CatalogUtil;
+import org.apache.iceberg.hive.HiveCatalog;
+import org.apache.iceberg.hive.TestHiveMetastore;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.spark.CatalogTestBase;
+import org.apache.iceberg.spark.TestBase;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.internal.SQLConf;
+import org.junit.jupiter.api.BeforeAll;
+
+public abstract class ExtensionsTestBase extends CatalogTestBase {
+
+ private static final Random RANDOM = ThreadLocalRandom.current();
+
+ @BeforeAll
+ public static void startMetastoreAndSpark() {
+ TestBase.metastore = new TestHiveMetastore();
+ metastore.start();
+ TestBase.hiveConf = metastore.hiveConf();
+
+ TestBase.spark.stop();
+
+ TestBase.spark =
+ SparkSession.builder()
+ .master("local[2]")
+ .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress())
+ .config("spark.testing", "true")
+ .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
+ .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName())
+ .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname))
+ .config("spark.sql.shuffle.partitions", "4")
+ .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true")
+ .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true")
+ .config(
+ SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean()))
+ .enableHiveSupport()
+ .getOrCreate();
+
+ TestBase.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+ TestBase.catalog =
+ (HiveCatalog)
+ CatalogUtil.loadCatalog(
+ HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf);
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ProcedureUtil.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ProcedureUtil.java
new file mode 100644
index 000000000000..de4acd74a7ed
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ProcedureUtil.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.UUID;
+import org.apache.iceberg.ImmutableGenericPartitionStatisticsFile;
+import org.apache.iceberg.PartitionStatisticsFile;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.PositionOutputStream;
+
+public class ProcedureUtil {
+
+ private ProcedureUtil() {}
+
+ static PartitionStatisticsFile writePartitionStatsFile(
+ long snapshotId, String statsLocation, FileIO fileIO) {
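+ // Creates an empty file at the stats location and records a placeholder size; tests only need the file to exist.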
+ PositionOutputStream positionOutputStream;
+ try {
+ positionOutputStream = fileIO.newOutputFile(statsLocation).create();
+ positionOutputStream.close();
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+
+ return ImmutableGenericPartitionStatisticsFile.builder()
+ .snapshotId(snapshotId)
+ .fileSizeInBytes(42L)
+ .path(statsLocation)
+ .build();
+ }
+
+ static String statsFileLocation(String tableLocation) {
+ String statsFileName = "stats-file-" + UUID.randomUUID();
+ return tableLocation.replaceFirst("file:", "") + "/metadata/" + statsFileName;
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkPlanUtil.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkPlanUtil.java
new file mode 100644
index 000000000000..830d07d86eab
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkPlanUtil.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import static scala.collection.JavaConverters.seqAsJavaListConverter;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.spark.sql.catalyst.expressions.Expression;
+import org.apache.spark.sql.execution.CommandResultExec;
+import org.apache.spark.sql.execution.SparkPlan;
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper;
+import org.apache.spark.sql.execution.datasources.v2.BatchScanExec;
+import scala.PartialFunction;
+import scala.collection.Seq;
+
+public class SparkPlanUtil {
+
+ private static final AdaptiveSparkPlanHelper SPARK_HELPER = new AdaptiveSparkPlanHelper() {};
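+ // AdaptiveSparkPlanHelper is a Scala trait, so it is instantiated through an empty anonymous subclass.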
+
+ private SparkPlanUtil() {}
+
+ public static List<SparkPlan> collectLeaves(SparkPlan plan) {
+ return toJavaList(SPARK_HELPER.collectLeaves(actualPlan(plan)));
+ }
+
+ public static List<SparkPlan> collectBatchScans(SparkPlan plan) {
+ List<SparkPlan> leaves = collectLeaves(plan);
+ return leaves.stream()
+ .filter(scan -> scan instanceof BatchScanExec)
+ .collect(Collectors.toList());
+ }
+
+ private static SparkPlan actualPlan(SparkPlan plan) {
+ if (plan instanceof CommandResultExec) {
+ return ((CommandResultExec) plan).commandPhysicalPlan();
+ } else {
+ return plan;
+ }
+ }
+
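+ // Collects all expressions in the plan that match the predicate; isDefinedAt always returns true so every plan node is visited.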
+ public static List<Expression> collectExprs(
+ SparkPlan sparkPlan, Predicate<Expression> predicate) {
+ Seq<List<Expression>> seq =
+ SPARK_HELPER.collect(
+ sparkPlan,
+ new PartialFunction<SparkPlan, List<Expression>>() {
+ @Override
+ public List<Expression> apply(SparkPlan plan) {
+ List<Expression> exprs = Lists.newArrayList();
+
+ for (Expression expr : toJavaList(plan.expressions())) {
+ exprs.addAll(collectExprs(expr, predicate));
+ }
+
+ return exprs;
+ }
+
+ @Override
+ public boolean isDefinedAt(SparkPlan plan) {
+ return true;
+ }
+ });
+ return toJavaList(seq).stream().flatMap(Collection::stream).collect(Collectors.toList());
+ }
+
+ private static List<Expression> collectExprs(
+ Expression expression, Predicate<Expression> predicate) {
+ Seq<Expression> seq =
+ expression.collect(
+ new PartialFunction<Expression, Expression>() {
+ @Override
+ public Expression apply(Expression expr) {
+ return expr;
+ }
+
+ @Override
+ public boolean isDefinedAt(Expression expr) {
+ return predicate.test(expr);
+ }
+ });
+ return toJavaList(seq);
+ }
+
+ private static <T> List<T> toJavaList(Seq<T> seq) {
+ return seqAsJavaListConverter(seq).asJava();
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java
new file mode 100644
index 000000000000..b5d641576314
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java
@@ -0,0 +1,461 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import static org.apache.iceberg.DataOperations.DELETE;
+import static org.apache.iceberg.DataOperations.OVERWRITE;
+import static org.apache.iceberg.PlanningMode.DISTRIBUTED;
+import static org.apache.iceberg.PlanningMode.LOCAL;
+import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP;
+import static org.apache.iceberg.SnapshotSummary.ADDED_DVS_PROP;
+import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP;
+import static org.apache.iceberg.SnapshotSummary.ADD_POS_DELETE_FILES_PROP;
+import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP;
+import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP;
+import static org.apache.iceberg.TableProperties.DATA_PLANNING_MODE;
+import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
+import static org.apache.iceberg.TableProperties.DELETE_PLANNING_MODE;
+import static org.apache.iceberg.TableProperties.FORMAT_VERSION;
+import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED;
+import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED;
+import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.Collectors;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Parameter;
+import org.apache.iceberg.ParameterizedTestExtension;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.PlanningMode;
+import org.apache.iceberg.RowLevelOperationMode;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotRef;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.data.parquet.GenericParquetWriter;
+import org.apache.iceberg.deletes.DeleteGranularity;
+import org.apache.iceberg.io.DataWriter;
+import org.apache.iceberg.io.OutputFile;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.spark.SparkCatalog;
+import org.apache.iceberg.spark.SparkSessionCatalog;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoder;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.execution.SparkPlan;
+import org.junit.jupiter.api.extension.ExtendWith;
+
+@ExtendWith(ParameterizedTestExtension.class)
+public abstract class SparkRowLevelOperationsTestBase extends ExtensionsTestBase {
+
+ private static final Random RANDOM = ThreadLocalRandom.current();
+
+ @Parameter(index = 3)
+ protected FileFormat fileFormat;
+
+ @Parameter(index = 4)
+ protected boolean vectorized;
+
+ @Parameter(index = 5)
+ protected String distributionMode;
+
+ @Parameter(index = 6)
+ protected boolean fanoutEnabled;
+
+ @Parameter(index = 7)
+ protected String branch;
+
+ @Parameter(index = 8)
+ protected PlanningMode planningMode;
+
+ @Parameter(index = 9)
+ protected int formatVersion;
+
+ @Parameters(
+ name =
+ "catalogName = {0}, implementation = {1}, config = {2},"
+ + " format = {3}, vectorized = {4}, distributionMode = {5},"
+ + " fanout = {6}, branch = {7}, planningMode = {8}, formatVersion = {9}")
+ public static Object[][] parameters() {
+ return new Object[][] {
+ {
+ "testhive",
+ SparkCatalog.class.getName(),
+ ImmutableMap.of(
+ "type", "hive",
+ "default-namespace", "default"),
+ FileFormat.ORC,
+ true,
+ WRITE_DISTRIBUTION_MODE_NONE,
+ true,
+ SnapshotRef.MAIN_BRANCH,
+ LOCAL,
+ 2
+ },
+ {
+ "testhive",
+ SparkCatalog.class.getName(),
+ ImmutableMap.of(
+ "type", "hive",
+ "default-namespace", "default"),
+ FileFormat.PARQUET,
+ true,
+ WRITE_DISTRIBUTION_MODE_NONE,
+ false,
+ "test",
+ DISTRIBUTED,
+ 2
+ },
+ {
+ "testhadoop",
+ SparkCatalog.class.getName(),
+ ImmutableMap.of("type", "hadoop"),
+ FileFormat.PARQUET,
+ RANDOM.nextBoolean(),
+ WRITE_DISTRIBUTION_MODE_HASH,
+ true,
+ null,
+ LOCAL,
+ 2
+ },
+ {
+ "spark_catalog",
+ SparkSessionCatalog.class.getName(),
+ ImmutableMap.of(
+ "type", "hive",
+ "default-namespace", "default",
+ "clients", "1",
+ "parquet-enabled", "false",
+ "cache-enabled",
+ "false" // Spark will delete tables using v1, leaving the cache out of sync
+ ),
+ FileFormat.AVRO,
+ false,
+ WRITE_DISTRIBUTION_MODE_RANGE,
+ false,
+ "test",
+ DISTRIBUTED,
+ 2
+ },
+ {
+ "testhadoop",
+ SparkCatalog.class.getName(),
+ ImmutableMap.of("type", "hadoop"),
+ FileFormat.PARQUET,
+ RANDOM.nextBoolean(),
+ WRITE_DISTRIBUTION_MODE_HASH,
+ true,
+ null,
+ LOCAL,
+ 3
+ },
+ {
+ "spark_catalog",
+ SparkSessionCatalog.class.getName(),
+ ImmutableMap.of(
+ "type",
+ "hive",
+ "default-namespace",
+ "default",
+ "clients",
+ "1",
+ "parquet-enabled",
+ "false",
+ "cache-enabled",
+ "false" // Spark will delete tables using v1, leaving the cache out of sync
+ ),
+ FileFormat.AVRO,
+ false,
+ WRITE_DISTRIBUTION_MODE_RANGE,
+ false,
+ "test",
+ DISTRIBUTED,
+ 3
+ },
+ };
+ }
+
+ protected abstract Map<String, String> extraTableProperties();
+
+ protected void initTable() {
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s', '%s' '%s', '%s' '%s', '%s' '%s', '%s' '%s', '%s' '%s')",
+ tableName,
+ DEFAULT_FILE_FORMAT,
+ fileFormat,
+ WRITE_DISTRIBUTION_MODE,
+ distributionMode,
+ SPARK_WRITE_PARTITIONED_FANOUT_ENABLED,
+ String.valueOf(fanoutEnabled),
+ DATA_PLANNING_MODE,
+ planningMode.modeName(),
+ DELETE_PLANNING_MODE,
+ planningMode.modeName(),
+ FORMAT_VERSION,
+ formatVersion);
+
+ switch (fileFormat) {
+ case PARQUET:
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')",
+ tableName, PARQUET_VECTORIZATION_ENABLED, vectorized);
+ break;
+ case ORC:
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')",
+ tableName, ORC_VECTORIZATION_ENABLED, vectorized);
+ break;
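+ // Avro has no vectorized read path, so those parameter combinations must run with vectorization disabled.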
+ case AVRO:
+ assertThat(vectorized).isFalse();
+ break;
+ }
+
+ Map<String, String> props = extraTableProperties();
+ props.forEach(
+ (prop, value) -> {
+ sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value);
+ });
+ }
+
+ protected void createAndInitTable(String schema) {
+ createAndInitTable(schema, null);
+ }
+
+ protected void createAndInitTable(String schema, String jsonData) {
+ createAndInitTable(schema, "", jsonData);
+ }
+
+ protected void createAndInitTable(String schema, String partitioning, String jsonData) {
+ sql("CREATE TABLE %s (%s) USING iceberg %s", tableName, schema, partitioning);
+ initTable();
+
+ if (jsonData != null) {
+ try {
+ Dataset<Row> ds = toDS(schema, jsonData);
+ ds.coalesce(1).writeTo(tableName).append();
+ createBranchIfNeeded();
+ } catch (NoSuchTableException e) {
+ throw new RuntimeException("Failed to write data", e);
+ }
+ }
+ }
+
+ protected void append(String table, String jsonData) {
+ append(table, null, jsonData);
+ }
+
+ protected void append(String table, String schema, String jsonData) {
+ try {
+ Dataset<Row> ds = toDS(schema, jsonData);
+ ds.coalesce(1).writeTo(table).append();
+ } catch (NoSuchTableException e) {
+ throw new RuntimeException("Failed to write data", e);
+ }
+ }
+
+ protected void createOrReplaceView(String name, String jsonData) {
+ createOrReplaceView(name, null, jsonData);
+ }
+
+ protected void createOrReplaceView(String name, String schema, String jsonData) {
+ Dataset<Row> ds = toDS(schema, jsonData);
+ ds.createOrReplaceTempView(name);
+ }
+
+ protected <T> void createOrReplaceView(String name, List<T> data, Encoder<T> encoder) {
+ spark.createDataset(data, encoder).createOrReplaceTempView(name);
+ }
+
+ private Dataset<Row> toDS(String schema, String jsonData) {
+ List<String> jsonRows =
+ Arrays.stream(jsonData.split("\n"))
+ .filter(str -> !str.trim().isEmpty())
+ .collect(Collectors.toList());
+ Dataset<String> jsonDS = spark.createDataset(jsonRows, Encoders.STRING());
+
+ if (schema != null) {
+ return spark.read().schema(schema).json(jsonDS);
+ } else {
+ return spark.read().json(jsonDS);
+ }
+ }
+
+ protected void validateDelete(
+ Snapshot snapshot, String changedPartitionCount, String deletedDataFiles) {
+ validateSnapshot(snapshot, DELETE, changedPartitionCount, deletedDataFiles, null, null);
+ }
+
+ protected void validateCopyOnWrite(
+ Snapshot snapshot,
+ String changedPartitionCount,
+ String deletedDataFiles,
+ String addedDataFiles) {
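+ // A copy-on-write operation that only removes data files is committed as a delete; anything that adds rewritten files is an overwrite.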
+ String operation = null == addedDataFiles && null != deletedDataFiles ? DELETE : OVERWRITE;
+ validateSnapshot(
+ snapshot, operation, changedPartitionCount, deletedDataFiles, null, addedDataFiles);
+ }
+
+ protected void validateMergeOnRead(
+ Snapshot snapshot,
+ String changedPartitionCount,
+ String addedDeleteFiles,
+ String addedDataFiles) {
+ String operation = null == addedDataFiles && null != addedDeleteFiles ? DELETE : OVERWRITE;
+ validateSnapshot(
+ snapshot, operation, changedPartitionCount, null, addedDeleteFiles, addedDataFiles);
+ }
+
+ protected void validateSnapshot(
+ Snapshot snapshot,
+ String operation,
+ String changedPartitionCount,
+ String deletedDataFiles,
+ String addedDeleteFiles,
+ String addedDataFiles) {
+ assertThat(snapshot.operation()).as("Operation must match").isEqualTo(operation);
+ validateProperty(snapshot, CHANGED_PARTITION_COUNT_PROP, changedPartitionCount);
+ validateProperty(snapshot, DELETED_FILES_PROP, deletedDataFiles);
+ validateProperty(snapshot, ADDED_DELETE_FILES_PROP, addedDeleteFiles);
+ validateProperty(snapshot, ADDED_FILES_PROP, addedDataFiles);
+ if (formatVersion >= 3) {
+ validateProperty(snapshot, ADDED_DVS_PROP, addedDeleteFiles);
+ assertThat(snapshot.summary()).doesNotContainKey(ADD_POS_DELETE_FILES_PROP);
+ }
+ }
+
+ protected void validateProperty(Snapshot snapshot, String property, Set<String> expectedValues) {
+ String actual = snapshot.summary().get(property);
+ assertThat(actual)
+ .as(
+ "Snapshot property "
+ + property
+ + " has unexpected value, actual = "
+ + actual
+ + ", expected one of : "
+ + String.join(",", expectedValues))
+ .isIn(expectedValues);
+ }
+
+ protected void validateProperty(Snapshot snapshot, String property, String expectedValue) {
+ if (null == expectedValue) {
+ assertThat(snapshot.summary()).doesNotContainKey(property);
+ } else {
+ assertThat(snapshot.summary())
+ .as("Snapshot property " + property + " has unexpected value.")
+ .containsEntry(property, expectedValue);
+ }
+ }
+
+ protected void sleep(long millis) {
+ try {
+ Thread.sleep(millis);
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ protected DataFile writeDataFile(Table table, List<GenericRecord> records) {
+ try {
+ OutputFile file =
+ Files.localOutput(
+ temp.resolve(fileFormat.addExtension(UUID.randomUUID().toString())).toFile());
+
+ DataWriter<GenericRecord> dataWriter =
+ Parquet.writeData(file)
+ .forTable(table)
+ .createWriterFunc(GenericParquetWriter::create)
+ .overwrite()
+ .build();
+
+ try {
+ for (GenericRecord record : records) {
+ dataWriter.write(record);
+ }
+ } finally {
+ dataWriter.close();
+ }
+
+ return dataWriter.toDataFile();
+
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ @Override
+ protected String commitTarget() {
+ return branch == null ? tableName : String.format("%s.branch_%s", tableName, branch);
+ }
+
+ @Override
+ protected String selectTarget() {
+ return branch == null ? tableName : String.format("%s VERSION AS OF '%s'", tableName, branch);
+ }
+
+ protected void createBranchIfNeeded() {
+ if (branch != null && !branch.equals(SnapshotRef.MAIN_BRANCH)) {
+ sql("ALTER TABLE %s CREATE BRANCH %s", tableName, branch);
+ }
+ }
+
+ // ORC currently does not support vectorized reads with deletes
+ protected boolean supportsVectorization() {
+ return vectorized && (isParquet() || isCopyOnWrite());
+ }
+
+ private boolean isParquet() {
+ return fileFormat.equals(FileFormat.PARQUET);
+ }
+
+ private boolean isCopyOnWrite() {
+ return extraTableProperties().containsValue(RowLevelOperationMode.COPY_ON_WRITE.modeName());
+ }
+
+ protected void assertAllBatchScansVectorized(SparkPlan plan) {
+ List<SparkPlan> batchScans = SparkPlanUtil.collectBatchScans(plan);
+ assertThat(batchScans).hasSizeGreaterThan(0).allMatch(SparkPlan::supportsColumnar);
+ }
+
+ protected void createTableWithDeleteGranularity(
+ String schema, String partitionedBy, DeleteGranularity deleteGranularity) {
+ createAndInitTable(schema, partitionedBy, null /* empty */);
+ sql(
+ "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')",
+ tableName, TableProperties.DELETE_GRANULARITY, deleteGranularity);
+ }
+}
diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java
new file mode 100644
index 000000000000..29993380b50c
--- /dev/null
+++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java
@@ -0,0 +1,1479 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.extensions;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.assertj.core.api.Assumptions.assumeThat;
+
+import java.io.File;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumWriter;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.HasTableOperations;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.ManifestReader;
+import org.apache.iceberg.Parameter;
+import org.apache.iceberg.ParameterizedTestExtension;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkCatalogConfig;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.catalyst.parser.ParseException;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.joda.time.DateTime;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.TestTemplate;
+import org.junit.jupiter.api.extension.ExtendWith;
+
+@ExtendWith(ParameterizedTestExtension.class)
+public class TestAddFilesProcedure extends ExtensionsTestBase {
+
+ @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, formatVersion = {3}")
+ public static Object[][] parameters() {
+ return new Object[][] {
+ {
+ SparkCatalogConfig.HIVE.catalogName(),
+ SparkCatalogConfig.HIVE.implementation(),
+ SparkCatalogConfig.HIVE.properties(),
+ 1
+ },
+ {
+ SparkCatalogConfig.HADOOP.catalogName(),
+ SparkCatalogConfig.HADOOP.implementation(),
+ SparkCatalogConfig.HADOOP.properties(),
+ 2
+ },
+ {
+ SparkCatalogConfig.SPARK_SESSION.catalogName(),
+ SparkCatalogConfig.SPARK_SESSION.implementation(),
+ SparkCatalogConfig.SPARK_SESSION.properties(),
+ 2
+ }
+ };
+ }
+
+ @Parameter(index = 3)
+ private int formatVersion;
+
+ private final String sourceTableName = "source_table";
+ private File fileTableDir;
+
+ @BeforeEach
+ public void setupTempDirs() {
+ fileTableDir = temp.toFile();
+ }
+
+ @AfterEach
+ public void dropTables() {
+ sql("DROP TABLE IF EXISTS %s PURGE", sourceTableName);
+ sql("DROP TABLE IF EXISTS %s", tableName);
+ }
+
+ @TestTemplate
+ public void addDataUnpartitioned() {
+ createUnpartitionedFileTable("parquet");
+
+ createIcebergTable("id Integer, name String, dept String, subdept String");
+
+ List