delta-io · xupefei · May 3, 2023 · May 3, 2023 · May 4, 2023 · May 5, 2023
diff --git a/core/src/main/antlr4/io/delta/sql/parser/DeltaSqlBase.g4 b/core/src/main/antlr4/io/delta/sql/parser/DeltaSqlBase.g4
@@ -87,8 +87,11 @@ statement
     | ALTER TABLE table=qualifiedName
         DROP CONSTRAINT (IF EXISTS)? name=identifier                    #dropTableConstraint
     | OPTIMIZE (path=STRING | table=qualifiedName)
-        (WHERE partitionPredicate = predicateToken)?
+        (WHERE partitionPredicate=predicateToken)?
         (zorderSpec)?                                                   #optimizeTable
+    | REORG TABLE table=qualifiedName
+        (WHERE partitionPredicate=predicateToken)?
+        APPLY LEFT_PAREN PURGE RIGHT_PAREN                              #reorgTable
     | SHOW COLUMNS (IN | FROM) tableName=qualifiedName
         ((IN | FROM) schemaName=identifier)?                            #showColumns
     | cloneTableHeader SHALLOW CLONE source=qualifiedName clause=temporalClause?
@@ -210,6 +213,7 @@ nonReserved
     | CONVERT | TO | DELTA | PARTITIONED | BY
     | DESC | DESCRIBE | LIMIT | DETAIL
     | GENERATE | FOR | TABLE | CHECK | EXISTS | OPTIMIZE
+    | REORG | APPLY | PURGE
     | RESTORE | AS | OF
     | ZORDER | LEFT_PAREN | RIGHT_PAREN
     | SHOW | COLUMNS | IN | FROM | NO | STATISTICS
@@ -219,6 +223,7 @@ nonReserved
 // Define how the keywords above should appear in a user's SQL statement.
 ADD: 'ADD';
 ALTER: 'ALTER';
+APPLY: 'APPLY';
 AS: 'AS';
 BY: 'BY';
 CHECK: 'CHECK';
@@ -255,7 +260,9 @@ NULL: 'NULL';
 OF: 'OF';
 OR: 'OR';
 OPTIMIZE: 'OPTIMIZE';
+REORG: 'REORG';
 PARTITIONED: 'PARTITIONED';
+PURGE: 'PURGE';
 REPLACE: 'REPLACE';
 RESTORE: 'RESTORE';
 RETAIN: 'RETAIN';

diff --git a/core/src/main/scala/io/delta/sql/parser/DeltaSqlParser.scala b/core/src/main/scala/io/delta/sql/parser/DeltaSqlParser.scala
@@ -346,7 +346,28 @@ class DeltaSqlAstBuilder extends DeltaSqlBaseBaseVisitor[AnyRef] {
     OptimizeTableCommand(
       Option(ctx.path).map(string),
       Option(ctx.table).map(visitTableIdentifier),
-      Option(ctx.partitionPredicate).map(extractRawText(_)).toSeq, Map.empty)(interleaveBy)
+      Option(ctx.partitionPredicate).map(extractRawText(_)).toSeq,
+      Map.empty)(interleaveBy)
+  }
+
+  /**
+   * Builds the unresolved [[ReorgTable]] logical plan for a `REORG TABLE` statement.
+   * Syntax:
+   * {{{
+   *   -- Apply PURGE to the target table, optionally limited by a partition predicate
+   *   REORG TABLE (delta.`/path/to/table` | delta_table_name)
+   *    [WHERE partition_predicate] APPLY (PURGE)
+   * }}}
+   */
+  override def visitReorgTable(ctx: ReorgTableContext): AnyRef = withOrigin(ctx) {
+    val tableContext = Option(ctx.table).getOrElse {
+      throw new ParseException("REORG command requires a file path or table name.", ctx)
+    }
+    val unresolvedTarget =
+      createUnresolvedTable(visitMultipartIdentifier(tableContext), "REORG")
+
+    // The optional WHERE clause is forwarded as the raw text of the predicate.
+    ReorgTable(unresolvedTarget)(Option(ctx.partitionPredicate).map(extractRawText(_)).toSeq)
   }
 
   override def visitDescribeDeltaDetail(

diff --git a/core/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala b/core/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala
@@ -433,6 +433,14 @@ class DeltaAnalysis(session: SparkSession)
 
       DeltaMergeInto.resolveReferencesAndSchema(deltaMerge, conf)(tryResolveReferences(session))
 
+    // A resolved REORG target must be a Delta table; anything else is rejected.
+    case reorg @ ReorgTable(ResolvedTable(_, _, resolved, _)) =>
+      resolved match {
+        case deltaTable: DeltaTableV2 => ReorgTableCommand(deltaTable)(reorg.predicates)
+        case _ =>
+          throw DeltaErrors.notADeltaTable(resolved.name())
+      }
+
     case deltaMerge: DeltaMergeInto =>
       val d = if (deltaMerge.childrenResolved && !deltaMerge.resolved) {
         DeltaMergeInto.resolveReferencesAndSchema(deltaMerge, conf)(tryResolveReferences(session))

diff --git a/core/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala b/core/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala
@@ -109,7 +109,8 @@ case class OptimizeTableCommand(
     path: Option[String],
     tableId: Option[TableIdentifier],
     userPartitionPredicates: Seq[String],
-    options: Map[String, String])(val zOrderBy: Seq[UnresolvedAttribute])
+    options: Map[String, String],
+    isPurge: Boolean = false)(val zOrderBy: Seq[UnresolvedAttribute])
   extends OptimizeTableCommandBase with LeafRunnableCommand {
 
   override val otherCopyArgs: Seq[AnyRef] = zOrderBy :: Nil
@@ -138,7 +139,8 @@ case class OptimizeTableCommand(
     validateZorderByColumns(sparkSession, txn, zOrderBy)
     val zOrderByColumns = zOrderBy.map(_.name).toSeq
 
-    new OptimizeExecutor(sparkSession, txn, partitionPredicates, zOrderByColumns).optimize()
+    new OptimizeExecutor(sparkSession, txn, partitionPredicates, zOrderByColumns, isPurge)
+      .optimize()
   }
 }
 
@@ -154,7 +156,8 @@ class OptimizeExecutor(
     sparkSession: SparkSession,
     txn: OptimisticTransaction,
     partitionPredicate: Seq[Expression],
-    zOrderByColumns: Seq[String])
+    zOrderByColumns: Seq[String],
+    isPurge: Boolean = false)
   extends DeltaCommand with SQLMetricsReporting with Serializable {
 
   /** Timestamp to use in [[FileAction]] */
@@ -164,18 +167,22 @@ class OptimizeExecutor(
 
   def optimize(): Seq[Row] = {
     recordDeltaOperation(txn.deltaLog, "delta.optimize") {
-      val minFileSize = sparkSession.sessionState.conf.getConf(
-        DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE)
       val maxFileSize = sparkSession.sessionState.conf.getConf(
         DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE)
-      require(minFileSize > 0, "minFileSize must be > 0")
       require(maxFileSize > 0, "maxFileSize must be > 0")
-
+      val (minFileSize, maxDeletedRowsRatio) = if (isPurge) {
+        (0L, 0d) // Only selects files with DV
+      } else {
+        val minFileSize = sparkSession.sessionState.conf.getConf(
+          DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE)
+        val maxDeletedRowsRatio = sparkSession.sessionState.conf.getConf(
+          DeltaSQLConf.DELTA_OPTIMIZE_MAX_DELETED_ROWS_RATIO)
+        require(minFileSize > 0, "minFileSize must be > 0")
+        (minFileSize, maxDeletedRowsRatio)
+      }
       val candidateFiles = txn.filterFiles(partitionPredicate, keepNumRecords = true)
       val partitionSchema = txn.metadata.partitionSchema
 
-      val maxDeletedRowsRatio = sparkSession.sessionState.conf.getConf(
-        DeltaSQLConf.DELTA_OPTIMIZE_MAX_DELETED_ROWS_RATIO)
       val filesToProcess = pruneCandidateFileList(minFileSize, maxDeletedRowsRatio, candidateFiles)
       val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq
 

diff --git a/core/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableCommand.scala b/core/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableCommand.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright (2021) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta.commands
+
+import org.apache.spark.sql.delta.catalog.DeltaTableV2
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LeafCommand, LogicalPlan, UnaryCommand}
+import org.apache.spark.sql.catalyst.TableIdentifier
+
+/** Logical plan for REORG TABLE; `predicates` holds the raw WHERE-clause text, if any. */
+case class ReorgTable(target: LogicalPlan)(val predicates: Seq[String]) extends UnaryCommand {
+  // Curried constructor arguments are listed here so that tree copies keep them.
+  override val otherCopyArgs: Seq[AnyRef] = Seq(predicates)
+  override def child: LogicalPlan = target
+
+  override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan =
+    copy(target = newChild)(predicates)
+}
+
+/** Runnable form of [[ReorgTable]]: delegates to [[OptimizeTableCommand]] with `isPurge` set. */
+case class ReorgTableCommand(target: DeltaTableV2)(val predicates: Seq[String])
+  extends OptimizeTableCommandBase with LeafCommand with IgnoreCachedData {
+
+  override val otherCopyArgs: Seq[AnyRef] = Seq(predicates)
+
+  override def run(sparkSession: SparkSession): Seq[Row] =
+    OptimizeTableCommand(
+      path = Option(target.path.toString),
+      tableId = target.catalogTable.map(_.identifier),
+      userPartitionPredicates = predicates,
+      options = Map.empty,
+      isPurge = true)(zOrderBy = Nil)
+      .run(sparkSession)
+}
diff --git a/core/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala b/core/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala
@@ -19,11 +19,10 @@ package io.delta.sql.parser
 import io.delta.tables.execution.VacuumTableCommand
 
 import org.apache.spark.sql.delta.CloneTableSQLTestUtils
-import org.apache.spark.sql.delta.commands.OptimizeTableCommand
-
+import org.apache.spark.sql.delta.commands.{OptimizeTableCommand, ReorgTable}
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.{TableIdentifier, TimeTravel}
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedTable}
 import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.SQLHelper
@@ -120,6 +119,60 @@ class DeltaSqlParserSuite extends SparkFunSuite with SQLHelper {
         Seq(unresolvedAttr("optimize"), unresolvedAttr("zorder"))))
   }
 
+  private def targetPlanForTable(tableParts: String*): UnresolvedTable = // expected REORG target
+    UnresolvedTable(tableParts.toSeq, "REORG", relationTypeMismatchHint = None)
+
+  test("REORG command is parsed as expected") {
+    val parser = new DeltaSqlParser(null)
+
+    assert(parser.parsePlan("REORG TABLE tbl APPLY (PURGE)") ===
+      ReorgTable(targetPlanForTable("tbl"))(Seq.empty))
+
+    assert(parser.parsePlan("REORG TABLE tbl_${system:spark.testing} APPLY (PURGE)") ===
+      ReorgTable(targetPlanForTable("tbl_true"))(Seq.empty))
+
+    withSQLConf("tbl_var" -> "tbl") {
+      // Every supported variable-substitution prefix must resolve to the same table name.
+      Seq("", "spark:", "sparkconf:", "hiveconf:", "hivevar:").foreach { prefix =>
+        val sqlText = "REORG TABLE ${" + prefix + "tbl_var} APPLY (PURGE)"
+        withClue(sqlText) {
+          assert(parser.parsePlan(sqlText) ===
+            ReorgTable(targetPlanForTable("tbl"))(Seq.empty))
+        }
+      }
+    }
+
+    assert(parser.parsePlan("REORG TABLE delta.`/path/to/tbl` APPLY (PURGE)") ===
+      ReorgTable(targetPlanForTable("delta", "/path/to/tbl"))(Seq.empty))
+
+    // `predicates` sits in ReorgTable's second parameter list, which case-class equality
+    // ignores, so the extracted WHERE text is asserted on its own.
+    parser.parsePlan("REORG TABLE tbl WHERE part = 1 APPLY (PURGE)") match {
+      case reorg: ReorgTable =>
+        assert(reorg === ReorgTable(targetPlanForTable("tbl"))(Seq("part = 1")))
+        assert(reorg.predicates === Seq("part = 1"))
+      case other => fail(s"Expected a ReorgTable plan but got: $other")
+    }
+  }
+
+  test("REORG command new tokens are non-reserved keywords") {
+    // new keywords: REORG, APPLY, PURGE
+    val parser = new DeltaSqlParser(null)
+
+    // Use the new keywords in table name
+    Seq("reorg", "apply", "purge").foreach { name =>
+      assert(parser.parsePlan(s"REORG TABLE $name APPLY (PURGE)") ===
+        ReorgTable(targetPlanForTable(name))(Seq.empty))
+    }
+
+    // Use the new keywords in column name. The predicate text is asserted on its own because
+    // case-class equality ignores ReorgTable's second parameter list.
+    val predicate = "reorg = 1 AND apply = 2 AND purge = 3"
+    val plan = parser.parsePlan(s"REORG TABLE tbl WHERE $predicate APPLY (PURGE)")
+    assert(plan === ReorgTable(targetPlanForTable("tbl"))(Seq(predicate)))
+    assert(plan.collectFirst { case r: ReorgTable => r.predicates } === Some(Seq(predicate)))
+  }
+
   // scalastyle:off argcount
   private def checkCloneStmt(
       parser: DeltaSqlParser,

diff --git a/core/src/test/scala/org/apache/spark/sql/delta/DeletionVectorsTestUtils.scala b/core/src/test/scala/org/apache/spark/sql/delta/DeletionVectorsTestUtils.scala
@@ -161,6 +161,14 @@ trait DeletionVectorsTestUtils extends QueryTest with SharedSparkSession {
     txn.commit(actions, Truncate())
   }
 
+  /** Splits the actions of the log's latest version into its `AddFile`s and `RemoveFile`s. */
+  protected def getFileActionsInLastVersion(log: DeltaLog): (Seq[AddFile], Seq[RemoveFile]) = {
+    val latestVersion = log.update().version
+    val (_, actions) = log.getChanges(latestVersion).toSeq.head
+    val added = actions.collect { case add: AddFile => add }
+    (added, actions.collect { case remove: RemoveFile => remove })
+  }
+
   protected def serializeRoaringBitmapArrayWithDefaultFormat(
       dv: RoaringBitmapArray): Array[Byte] = {
     val serializationFormat = RoaringBitmapArrayFormat.Portable

diff --git a/core/src/test/scala/org/apache/spark/sql/delta/DeltaPurgeSuite.scala b/core/src/test/scala/org/apache/spark/sql/delta/DeltaPurgeSuite.scala
@@ -0,0 +1,101 @@
+/*
+ * Copyright (2021) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta
+
+import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.test.SharedSparkSession
+
+class DeltaPurgeSuite extends QueryTest
+  with SharedSparkSession
+  with DeltaSQLCommandTest
+  with DeletionVectorsTestUtils {
+
+  import testImplicits._
+
+  /** Runs `REORG TABLE ... APPLY (PURGE)` on a path-based table, optionally with a WHERE. */
+  def executePurge(table: String, condition: Option[String] = None): Unit = {
+    val whereClause = condition.map(cond => s" WHERE $cond").getOrElse("")
+    sql(s"REORG TABLE delta.`$table`$whereClause APPLY (PURGE)")
+  }
+
+  /**
+   * Writes ids 0 until 100 from five partitions as a Delta table in a temporary directory
+   * and passes the table path and its [[DeltaLog]] to `body`.
+   */
+  private def withTestTable(body: (String, DeltaLog) => Unit): Unit = {
+    withTempDir { tempDir =>
+      val path = tempDir.getCanonicalPath
+      val log = DeltaLog.forTable(spark, path)
+      spark
+        .range(0, 100, 1, numPartitions = 5)
+        .write
+        .format("delta")
+        .save(path)
+      body(path, log)
+    }
+  }
+
+  /** Deletes ids 0 and 99 and checks that exactly two files now carry a deletion vector. */
+  private def deleteFirstAndLastRow(path: String, log: DeltaLog): Unit = {
+    sql(s"DELETE FROM delta.`$path` WHERE id IN (0, 99)")
+    assert(log.update().allFiles.filter(_.deletionVector != null).count() === 2)
+  }
+
+  /** Purges the table and asserts that no new table version was committed. */
+  private def assertPurgeIsNoop(path: String, log: DeltaLog): Unit = {
+    val versionBefore = log.update().version
+    executePurge(path)
+    val versionAfter = log.update().version
+    assert(versionBefore === versionAfter)
+  }
+
+  testWithDVs("Purge DVs will combine small files") {
+    withTestTable { (path, log) =>
+      deleteFirstAndLastRow(path, log)
+      executePurge(path)
+      val (addFiles, _) = getFileActionsInLastVersion(log)
+      assert(addFiles.size === 1) // the two files with DVs are combined
+      assert(addFiles.forall(_.deletionVector === null))
+      checkAnswer(
+        sql(s"SELECT * FROM delta.`$path`"),
+        (1 to 98).toDF())
+    }
+  }
+
+  testWithDVs("Purge DVs") {
+    withTestTable { (path, log) =>
+      deleteFirstAndLastRow(path, log)
+
+      // First purge
+      executePurge(path)
+      val (addFiles, _) = getFileActionsInLastVersion(log)
+      assert(addFiles.size === 1) // two files are combined
+      assert(addFiles.forall(_.deletionVector === null))
+      checkAnswer(
+        sql(s"SELECT * FROM delta.`$path`"),
+        (1 to 98).toDF())
+
+      // Second purge is a noop
+      assertPurgeIsNoop(path, log)
+    }
+  }
+
+  test("Purge a non-DV table is a noop") {
+    withTestTable { (path, log) => assertPurgeIsNoop(path, log) }
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/sql/delta/deletionvectors/DeletionVectorsSuite.scala b/core/src/test/scala/org/apache/spark/sql/delta/deletionvectors/DeletionVectorsSuite.scala
@@ -551,14 +551,6 @@ class DeletionVectorsSuite extends QueryTest
     }
   }
 
-  private def getFileActionsInLastVersion(log: DeltaLog): (Seq[AddFile], Seq[RemoveFile]) = {
-    val version = log.update().version
-    val allFiles = log.getChanges(version).toSeq.head._2
-    val add = allFiles.collect { case a: AddFile => a }
-    val remove = allFiles.collect { case r: RemoveFile => r }
-    (add, remove)
-  }
-
   private def assertPlanContains(queryDf: DataFrame, expected: String): Unit = {
     val optimizedPlan = queryDf.queryExecution.analyzed.toString()
     assert(optimizedPlan.contains(expected), s"Plan is missing `$expected`: $optimizedPlan")