apache · flaming-archer · Aug 27, 2025 · Sep 9, 2025 · Sep 22, 2025 · pan3793
diff --git a/...nnector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/HiveTableCatalog.scala b/...nnector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/HiveTableCatalog.scala
@@ -20,11 +20,9 @@ package org.apache.kyuubi.spark.connector.hive
 import java.lang.{Boolean => JBoolean, Long => JLong}
 import java.net.URI
 import java.util
-
 import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.Try
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.spark.SparkConf
 import org.apache.spark.internal.Logging
@@ -45,10 +43,10 @@ import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.internal.StaticSQLConf.{CATALOG_IMPLEMENTATION, GLOBAL_TEMP_DATABASE}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
-
 import org.apache.kyuubi.spark.connector.hive.HiveConnectorUtils.withSparkSQLConf
-import org.apache.kyuubi.spark.connector.hive.HiveTableCatalog.{getStorageFormatAndProvider, toCatalogDatabase, CatalogDatabaseHelper, IdentifierHelper, NamespaceHelper}
+import org.apache.kyuubi.spark.connector.hive.HiveTableCatalog.{CatalogDatabaseHelper, IdentifierHelper, NamespaceHelper, getStorageFormatAndProvider, toCatalogDatabase}
 import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorDelegationTokenProvider.metastoreTokenSignature
+import org.apache.kyuubi.spark.connector.hive.read.HiveFileStatusCache
 import org.apache.kyuubi.util.reflect.{DynClasses, DynConstructors}
 
 /**
@@ -388,18 +386,20 @@ class HiveTableCatalog(sparkSession: SparkSession)
         case _: NoSuchTableException =>
           throw new NoSuchTableException(ident)
       }
-
+      invalidateTable(ident)
       loadTable(ident)
     }
 
   override def dropTable(ident: Identifier): Boolean =
     withSparkSQLConf(LEGACY_NON_IDENTIFIER_OUTPUT_CATALOG_NAME -> "true") {
       try {
-        if (loadTable(ident) != null) {
+        val table = loadTable(ident)
+        if (table != null) {
           catalog.dropTable(
             ident.asTableIdentifier,
             ignoreIfNotExists = true,
             purge = true /* skip HDFS trash */ )
+          invalidateTable(ident)
           true
         } else {
           false
@@ -417,10 +417,16 @@ class HiveTableCatalog(sparkSession: SparkSession)
       }
 
       // Load table to make sure the table exists
-      loadTable(oldIdent)
+      val table = loadTable(oldIdent)
       catalog.renameTable(oldIdent.asTableIdentifier, newIdent.asTableIdentifier)
+      invalidateTable(oldIdent)
     }
 
+  /** Invalidates every file status cached under the catalog-qualified name of `ident`. */
+  override def invalidateTable(ident: Identifier): Unit = HiveFileStatusCache.getOrCreate(
+    sparkSession,
+    Seq(catalogName, ident.namespace().mkString("."), ident.name()).mkString(".")).invalidateAll()
+
   private def toOptions(properties: Map[String, String]): Map[String, String] = {
     properties.filterKeys(_.startsWith(TableCatalog.OPTION_PREFIX)).map {
       case (key, value) => key.drop(TableCatalog.OPTION_PREFIX.length) -> value

diff --git a/...ector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/read/HiveFileIndex.scala b/...ector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/read/HiveFileIndex.scala
@@ -48,7 +48,8 @@ class HiveCatalogFileIndex(
   private val partPathToBindHivePart: mutable.Map[PartitionPath, CatalogTablePartition] =
     mutable.Map()
 
-  private val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
+  private val fileStatusCache = HiveFileStatusCache.getOrCreate(sparkSession,
+    hiveCatalog.name() + "." + catalogTable.qualifiedName)
 
   private val baseLocation: Option[URI] = table.storage.locationUri
 

diff --git a/...hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/read/HiveFileStatusCache.scala b/...hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/read/HiveFileStatusCache.scala
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.read
+
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.AtomicBoolean
+
+import scala.collection.JavaConverters._
+
+import com.google.common.cache._
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources.{FileStatusCache, NoopCache}
+import org.apache.spark.util.SizeEstimator
+
+/**
+ * Entry point for file status caches that are all backed by one process-wide in-memory store.
+ */
+object HiveFileStatusCache {
+  // Created on first use by getOrCreate(); only accessed while holding this object's monitor.
+  private var sharedCache: HiveSharedInMemoryCache = _
+
+  /**
+   * @param qualifiedName key under which the returned cache stores and looks up its entries
+   * @return a view over the shared store keyed by `qualifiedName`, or [[NoopCache]] when
+   *         `manageFilesourcePartitions` is off or the configured cache size is not positive
+   */
+  def getOrCreate(session: SparkSession, qualifiedName: String): FileStatusCache =
+    synchronized {
+      val conf = session.sessionState.conf
+      if (!conf.manageFilesourcePartitions || conf.filesourcePartitionFileCacheSize <= 0) {
+        NoopCache
+      } else {
+        if (sharedCache == null) {
+          sharedCache = new HiveSharedInMemoryCache(
+            conf.filesourcePartitionFileCacheSize, conf.metadataCacheTTL)
+        }
+        sharedCache.createForNewClient(qualifiedName)
+      }
+    }
+
+  /** Discards the shared store so that the next getOrCreate() builds a fresh one. */
+  def resetForTesting(): Unit = synchronized { sharedCache = null }
+}
+
+/**
+ * In-memory store of partition file statuses whose weight budget is shared by every
+ * [[FileStatusCache]] handed out by `createForNewClient`.
+ *
+ * @param maxSizeInBytes max allowable cache size before entries start getting evicted
+ * @param cacheTTL entries expire this many seconds after being written; ignored unless positive
+ */
+private class HiveSharedInMemoryCache(maxSizeInBytes: Long, cacheTTL: Long) extends Logging {
+
+  // First half of every cache key; entries are visible to all clients holding an equal id.
+  private type ClientId = Object
+
+  // Flipped by the first size-based eviction so that the warning is logged only once.
+  private val warnedAboutEviction = new AtomicBoolean(false)
+
+  // Entries are keyed by (client id, path) so entries stored under different ids stay distinct.
+  private val cache: Cache[(ClientId, Path), Array[FileStatus]] = {
+    // [[Weigher]].weigh returns an Int, which on its own would cap a cached object at 2GB.
+    // Weights are therefore divided by this factor (smaller than the size of one
+    // [[FileStatus]]), which raises the cap to 64GB.
+    val weightScale = 32
+    val weigher = new Weigher[(ClientId, Path), Array[FileStatus]] {
+      override def weigh(key: (ClientId, Path), value: Array[FileStatus]): Int = {
+        val scaled = (SizeEstimator.estimate(key) + SizeEstimator.estimate(value)) / weightScale
+        if (scaled <= Int.MaxValue) {
+          scaled.toInt
+        } else {
+          logWarning(s"Cached table partition metadata size is too big. Approximating to " +
+            s"${Int.MaxValue.toLong * weightScale}.")
+          Int.MaxValue
+        }
+      }
+    }
+    val removalListener = new RemovalListener[(ClientId, Path), Array[FileStatus]]() {
+      override def onRemoval(
+          removed: RemovalNotification[(ClientId, Path), Array[FileStatus]]): Unit = {
+        val evictedForSize = removed.getCause == RemovalCause.SIZE
+        if (evictedForSize && warnedAboutEviction.compareAndSet(false, true)) {
+          logWarning(
+            "Evicting cached table partition metadata from memory due to size constraints " +
+              "(spark.sql.hive.filesourcePartitionFileCacheSize = "
+              + maxSizeInBytes + " bytes). This may impact query planning performance.")
+        }
+      }
+    }
+
+    val sizeBounded = CacheBuilder.newBuilder()
+      .weigher(weigher)
+      .removalListener(removalListener)
+      .maximumWeight(maxSizeInBytes / weightScale)
+
+    // A non-positive TTL leaves write-expiry unset on the builder.
+    val configured =
+      if (cacheTTL > 0) sizeBounded.expireAfterWrite(cacheTTL, TimeUnit.SECONDS) else sizeBounded
+
+    configured.build[(ClientId, Path), Array[FileStatus]]()
+  }
+
+  /**
+   * @return a FileStatusCache whose entries are stored under `clientId`; it shares entries
+   *         only with caches created for an equal id, and the weight budget with all caches.
+   */
+  def createForNewClient(clientId: Object): HiveFileStatusCache = new HiveFileStatusCache {
+
+    override def getLeafFiles(path: Path): Option[Array[FileStatus]] =
+      Option(cache.getIfPresent((clientId, path)))
+
+    override def putLeafFiles(path: Path, leafFiles: Array[FileStatus]): Unit =
+      cache.put((clientId, path), leafFiles)
+
+    // Removes only the entries stored under this client's id; other clients keep theirs.
+    override def invalidateAll(): Unit = {
+      for (key <- cache.asMap.keySet.asScala if key._1 == clientId) {
+        cache.invalidate(key)
+      }
+    }
+  }
+
+  /** Type returned by `createForNewClient`; adds nothing to [[FileStatusCache]]. */
+  abstract class HiveFileStatusCache extends FileStatusCache {}
+}
diff --git a/...tor-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/write/HiveBatchWrite.scala b/...tor-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/write/HiveBatchWrite.scala
@@ -18,7 +18,6 @@
 package org.apache.kyuubi.spark.connector.hive.write
 
 import scala.util.control.NonFatal
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.conf.HiveConf
@@ -32,8 +31,8 @@ import org.apache.spark.sql.execution.datasources.{WriteJobDescription, WriteTas
 import org.apache.spark.sql.execution.datasources.v2.FileBatchWrite
 import org.apache.spark.sql.hive.kyuubi.connector.HiveBridgeHelper.toSQLValue
 import org.apache.spark.sql.types.StringType
-
 import org.apache.kyuubi.spark.connector.hive.{HiveConnectorUtils, HiveTableCatalog, KyuubiHiveConnectorException}
+import org.apache.spark.sql.connector.catalog.Identifier
 
 class HiveBatchWrite(
     sparkSession: SparkSession,
@@ -69,6 +68,9 @@ class HiveBatchWrite(
 
     // un-cache this table.
     hiveTableCatalog.catalog.invalidateCachedTable(table.identifier)
+    hiveTableCatalog.invalidateTable(
+      Identifier.of(Array(table.identifier.database.getOrElse("")), table.identifier.table)
+    )
 
     val catalog = hiveTableCatalog.catalog
     if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {

diff --git a/...nnector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/HiveCatalogSuite.scala b/...nnector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/HiveCatalogSuite.scala
@@ -39,7 +39,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
 
 import org.apache.kyuubi.spark.connector.hive.HiveTableCatalog.IdentifierHelper
 import org.apache.kyuubi.spark.connector.hive.KyuubiHiveConnectorConf.{READ_CONVERT_METASTORE_ORC, READ_CONVERT_METASTORE_PARQUET}
-import org.apache.kyuubi.spark.connector.hive.read.HiveScan
+import org.apache.kyuubi.spark.connector.hive.read.{HiveFileStatusCache, HiveScan}
 
 class HiveCatalogSuite extends KyuubiHiveTest {
 
@@ -284,16 +284,29 @@ class HiveCatalogSuite extends KyuubiHiveTest {
   }
 
   test("invalidateTable") {
-    val table = catalog.createTable(testIdent, schema, Array.empty[Transform], emptyProps)
-    // Hive v2 don't cache table
-    catalog.invalidateTable(testIdent)
-
-    val loaded = catalog.loadTable(testIdent)
-
-    assert(table.name == loaded.name)
-    assert(table.schema == loaded.schema)
-    assert(table.properties == loaded.properties)
-    catalog.dropTable(testIdent)
+    withSparkSession() { spark =>
+      val table = catalog.createTable(testIdent, schema, Array.empty[Transform], emptyProps)
+      val qualifiedName = s"$catalogName.${testIdent.namespace().mkString(".")}.${testIdent.name()}"
+      val tablePath = new Path(table.asInstanceOf[HiveTable].catalogTable.location)
+      def cachedLeafFiles =
+        HiveFileStatusCache.getOrCreate(spark, qualifiedName).getLeafFiles(tablePath)
+
+      // A scan is expected to populate the cache, and invalidation to empty it again.
+      spark.sql(s"select * from $qualifiedName").collect()
+      assert(cachedLeafFiles.isDefined)
+      catalog.invalidateTable(testIdent)
+      assert(cachedLeafFiles.isEmpty)
+
+      // The next scan repopulates the cache.
+      spark.sql(s"select * from $qualifiedName").collect()
+      assert(cachedLeafFiles.isDefined)
+
+      val loaded = catalog.loadTable(testIdent)
+      assert(table.name == loaded.name)
+      assert(table.schema == loaded.schema)
+      assert(table.properties == loaded.properties)
+      catalog.dropTable(testIdent)
+    }
   }
 
   test("listNamespaces: fail if missing namespace") {