Add support for optimize (file compaction) on Delta tables #934

Closed
wants to merge 3 commits
core/src/main/antlr4/io/delta/sql/parser/DeltaSqlBase.g4 (10 changes: 7 additions & 3 deletions)
@@ -84,6 +84,8 @@ statement
constraint #addTableConstraint
| ALTER TABLE table=qualifiedName
DROP CONSTRAINT (IF EXISTS)? name=identifier #dropTableConstraint
| OPTIMIZE (path=STRING | table=qualifiedName)
(WHERE partitionPredicate = exprToken)? #optimizeTable
| .*? #passThrough
;

@@ -124,12 +126,12 @@ number
;

constraint
: CHECK '(' checkExprToken+ ')' #checkConstraint
: CHECK '(' exprToken+ ')' #checkConstraint
;

// We don't have an expression rule in our grammar here, so we just grab the tokens and defer
// parsing them to later.
checkExprToken
exprToken
: .+?
;

@@ -139,7 +141,7 @@ nonReserved
: VACUUM | RETAIN | HOURS | DRY | RUN
| CONVERT | TO | DELTA | PARTITIONED | BY
| DESC | DESCRIBE | LIMIT | DETAIL
| GENERATE | FOR | TABLE | CHECK | EXISTS
| GENERATE | FOR | TABLE | CHECK | EXISTS | OPTIMIZE
;

// Define how the keywords above should appear in a user's SQL statement.
@@ -165,13 +167,15 @@ LIMIT: 'LIMIT';
MINUS: '-';
NOT: 'NOT' | '!';
NULL: 'NULL';
OPTIMIZE: 'OPTIMIZE';
FOR: 'FOR';
TABLE: 'TABLE';
PARTITIONED: 'PARTITIONED';
RETAIN: 'RETAIN';
RUN: 'RUN';
TO: 'TO';
VACUUM: 'VACUUM';
WHERE: 'WHERE';

// Multi-character operator tokens need to be defined even though we don't explicitly reference
// them so that they can be recognized as single tokens when parsing. If we split them up and
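The new `optimizeTable` alternative accepts either a quoted path or a qualified table name, with an optional WHERE clause whose tokens are captured by the reused `exprToken` rule. A minimal sketch of the statements this grammar admits, assuming a Spark session with the Delta SQL extension registered; the table name, path, and predicate below are made up for illustration:

```scala
// Illustrative only: the table `events`, the path, and the predicate are assumed.
spark.sql("OPTIMIZE events")                             // catalog table name
spark.sql("OPTIMIZE '/data/events'")                     // quoted file path
spark.sql("OPTIMIZE delta.`/data/events`")               // delta.`path` identifier
spark.sql("OPTIMIZE events WHERE date = '2021-01-01'")   // optional partition predicate
```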
core/src/main/scala/io/delta/sql/parser/DeltaSqlParser.scala (35 changes: 33 additions & 2 deletions)
@@ -156,6 +156,30 @@ class DeltaSqlAstBuilder extends DeltaSqlBaseBaseVisitor[AnyRef] {
ctx.RUN != null)
}

/**
* Create an [[OptimizeTableCommand]] logical plan.
* Syntax:
* {{{
* OPTIMIZE <table-identifier> [WHERE predicate-using-partition-columns]
* }}}
* Examples:
* {{{
* OPTIMIZE '/path/to/delta/table';
* OPTIMIZE delta_table_name;
* OPTIMIZE delta.`/path/to/delta/table`;
* OPTIMIZE delta_table_name WHERE partCol = 25;
* }}}
*/
override def visitOptimizeTable(ctx: OptimizeTableContext): AnyRef = withOrigin(ctx) {
if (ctx.path == null && ctx.table == null) {
throw new ParseException("OPTIMIZE command requires a file path or table name.", ctx)
}
OptimizeTableCommand(
Option(ctx.path).map(string),
Option(ctx.table).map(visitTableIdentifier),
Option(ctx.partitionPredicate).map(extractRawText(_)))
}

override def visitDescribeDeltaDetail(
ctx: DescribeDeltaDetailContext): LogicalPlan = withOrigin(ctx) {
DescribeDeltaDetailCommand(
@@ -225,10 +249,17 @@ class DeltaSqlAstBuilder extends DeltaSqlBaseBaseVisitor[AnyRef] {
// space. This produces some strange spacing (e.g. `structCol . arr [ 0 ]`), but right now we
// think that's preferable to the additional complexity involved in trying to produce cleaner
// output.
private def buildCheckConstraintText(tokens: Seq[CheckExprTokenContext]): String = {
private def buildCheckConstraintText(tokens: Seq[ExprTokenContext]): String = {
tokens.map(_.getText).mkString(" ")
}

private def extractRawText(exprContext: ExprTokenContext): String = {
// Extract the raw expression which will be parsed later
exprContext.getStart.getInputStream.getText(new Interval(
exprContext.getStart.getStartIndex,
exprContext.getStop.getStopIndex))
}

override def visitAddTableConstraint(
ctx: AddTableConstraintContext): LogicalPlan = withOrigin(ctx) {
val checkConstraint = ctx.constraint().asInstanceOf[CheckConstraintContext]
@@ -237,7 +268,7 @@ class DeltaSqlAstBuilder extends DeltaSqlBaseBaseVisitor[AnyRef] {
createUnresolvedTable(ctx.table.identifier.asScala.map(_.getText).toSeq,
"ALTER TABLE ... ADD CONSTRAINT"),
ctx.name.getText,
buildCheckConstraintText(checkConstraint.checkExprToken().asScala.toSeq))
buildCheckConstraintText(checkConstraint.exprToken().asScala.toSeq))
}

override def visitDropTableConstraint(
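Two details of this parser change are worth calling out. First, `checkExprToken` is renamed to `exprToken` so the same token-capturing rule serves both CHECK constraints and the new OPTIMIZE predicate. Second, `extractRawText` reads the predicate back out of the character stream via an ANTLR `Interval` rather than using `ctx.getText`, because `getText` concatenates token texts and drops the original whitespace, and the raw string is re-parsed as an expression later. A standalone sketch of that technique follows; the `rawText` name is chosen here for illustration, and the real file is assumed to import `org.antlr.v4.runtime.misc.Interval`:

```scala
import org.antlr.v4.runtime.ParserRuleContext
import org.antlr.v4.runtime.misc.Interval

// ctx.getText on a rule matching `partCol = 25` yields "partCol=25" (whitespace
// lives on the hidden channel), whereas reading the character interval from the
// underlying input stream returns the exact source text "partCol = 25".
def rawText(ctx: ParserRuleContext): String =
  ctx.getStart.getInputStream.getText(
    new Interval(ctx.getStart.getStartIndex, ctx.getStop.getStopIndex))
```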
@@ -975,6 +975,10 @@ object DeltaErrors
new AnalysisException("Cannot describe the history of a view.")
}

def viewNotSupported(operationName: String): Throwable = {
new AnalysisException(s"Operation $operationName can not be performed on a view")
}

def copyIntoValidationRequireDeltaTableExists: Throwable = {
new AnalysisException("COPY INTO validation failed. Target table does not exist.")
}
@@ -330,6 +330,17 @@ object DeltaOperations {
override def changesData: Boolean = true
}

val OPTIMIZE_OPERATION_NAME = "OPTIMIZE"

/** Recorded when optimizing the table. */
case class Optimize(
predicate: Seq[String]
) extends Operation(OPTIMIZE_OPERATION_NAME) {
override val parameters: Map[String, Any] = Map(
"predicate" -> JsonUtils.toJson(predicate)
)
}


private def structFieldToMap(colPath: Seq[String], field: StructField): Map[String, Any] = {
Map(
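The `Optimize` operation records the optional partition predicate in the commit's operation parameters, so it surfaces in DESCRIBE HISTORY. A hedged sketch of how the compaction transaction might record it; `txn` (an `OptimisticTransaction`) and `actions` (the AddFile/RemoveFile actions produced by the bin-packing job) are assumed here and are not part of this diff:

```scala
// Sketch only: txn and actions come from the OPTIMIZE execution path, which is
// not shown in this diff; the predicate value is illustrative.
val predicates: Seq[String] = Seq("date = '2021-01-01'")
txn.commit(actions, DeltaOperations.Optimize(predicate = predicates))
// DESCRIBE HISTORY on the table would then report operation = "OPTIMIZE",
// with the predicate JSON-encoded under operationParameters.
```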
@@ -35,6 +35,7 @@ import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubqueryAliases, NoSuchTableException, UnresolvedAttribute, UnresolvedRelation}
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
@@ -392,11 +393,28 @@ trait DeltaCommand extends DeltaLogging {
path: Option[String],
tableIdentifier: Option[TableIdentifier],
operationName: String): DeltaLog = {
val tablePath = tableIdentifier.map { ti =>
new Path(spark.sessionState.catalog.getTableMetadata(ti).location)
}.orElse(path.map(new Path(_))).getOrElse {
throw DeltaErrors.missingTableIdentifierException(operationName)
}
val tablePath =
if (path.nonEmpty) {
new Path(path.get)
} else if (tableIdentifier.nonEmpty) {
val sessionCatalog = spark.sessionState.catalog
lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get)

DeltaTableIdentifier(spark, tableIdentifier.get) match {
case Some(id) if id.path.nonEmpty =>
new Path(id.path.get)
case Some(id) if id.table.nonEmpty =>
new Path(metadata.location)
case _ =>
if (metadata.tableType == CatalogTableType.VIEW) {
throw DeltaErrors.viewNotSupported(operationName)
}
throw DeltaErrors.notADeltaTableException(operationName)
}
} else {
throw DeltaErrors.missingTableIdentifierException(operationName)
}

val deltaLog = DeltaLog.forTable(spark, tablePath)
if (deltaLog.snapshot.version < 0) {
throw DeltaErrors.notADeltaTableException(
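The rewritten resolution prefers an explicit path, then consults `DeltaTableIdentifier` to distinguish delta.`/path` identifiers from catalog table names, and only falls back to catalog metadata for the latter; views are rejected with the new `viewNotSupported` error before the generic not-a-Delta-table error. A hedged, test-style sketch of the view case (names are illustrative, and it assumes a suite with `SharedSparkSession`/`SQLTestUtils` helpers and the OPTIMIZE command resolving its target through this helper):

```scala
// Illustrative sketch, not part of the PR: OPTIMIZE against a view should fail
// with the viewNotSupported error added above.
test("OPTIMIZE is rejected on views") {
  withTable("events") {
    withView("events_view") {
      spark.range(10).write.format("delta").saveAsTable("events")
      spark.sql("CREATE VIEW events_view AS SELECT * FROM events")
      val e = intercept[AnalysisException] {
        spark.sql("OPTIMIZE events_view")
      }
      assert(e.getMessage.contains("can not be performed on a view"))
    }
  }
}
```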