apache · chong0929 · May 26, 2021 · zhengchenyu · Oct 22, 2025 · catalinii
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3210,6 +3210,13 @@ object SQLConf {
     .intConf
     .createWithDefault(0)
 
+  val READ_PARTITION_WITH_SUBDIRECTORY_ENABLED =
+    buildConf("spark.sql.sources.readPartitionWithSubdirectory.enabled")
+      .doc("When set to true, Spark SQL can read the files of a partitioned Hive table " +
+        "from subdirectories under the root path of the table.")
+      .booleanConf
+      .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
@@ -3908,6 +3915,9 @@ class SQLConf extends Serializable with Logging {
 
   def maxConcurrentOutputFileWriters: Int = getConf(SQLConf.MAX_CONCURRENT_OUTPUT_FILE_WRITERS)
 
+  def readPartitionWithSubdirectoryEnabled: Boolean =
+    getConf(SQLConf.READ_PARTITION_WITH_SUBDIRECTORY_ENABLED)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
@@ -59,6 +59,9 @@ class InMemoryFileIndex(
   override val rootPaths =
     rootPathsSpecified.filterNot(FileStreamSink.ancestorIsMetadataDirectory(_, hadoopConf))
 
+  val readPartitionWithSubdirectoryEnabled: Boolean =
+    sparkSession.sessionState.conf.readPartitionWithSubdirectoryEnabled
+
   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
   @volatile private var cachedPartitionSpec: PartitionSpec = _
@@ -94,10 +97,23 @@ class InMemoryFileIndex(
     val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
-    cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
+    cachedLeafDirToChildrenFiles =
+      if (readPartitionWithSubdirectoryEnabled) {
+        files.toArray.groupBy(file => getRootPathsLeafDir(file.getPath.getParent))
+      } else {
+        files.toArray.groupBy(_.getPath.getParent)
+      }
     cachedPartitionSpec = null
   }
 
+  /** Returns the nearest ancestor of `path` (inclusive) in `rootPaths`, or `path` if none. */
+  private def getRootPathsLeafDir(path: Path): Path = {
+    // Path.getParent returns null at the filesystem root; stop there and fall back to `path`
+    // rather than recursing into a NullPointerException when no root path is an ancestor.
+    val ancestors = Iterator.iterate(path)(_.getParent).takeWhile(_ != null)
+    ancestors.find(p => rootPaths.contains(p)).getOrElse(path)
+  }
+
   override def equals(other: Any): Boolean = other match {
     case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet
     case _ => false

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
@@ -520,6 +520,22 @@ class FileIndexSuite extends SharedSparkSession {
       SQLConf.get.setConf(StaticSQLConf.METADATA_CACHE_TTL_SECONDS, previousValue)
     }
   }
+
+  test("SPARK-28098 - supporting read partitioned Hive tables with subdirectories") {
+    withTempPath { dir =>
+      // Lay out partitions p=0 and p=1 two directory levels below the path handed to the index.
+      // NOTE(review): the readPartitionWithSubdirectory flag is not set in this test, so this
+      // presumably exercises the default listing path only -- confirm that is intended.
+      val nestedDir = s"${dir.getAbsolutePath}/sub1/sub2"
+      spark.range(2).select(col("id").as("p"), col("id")).write.partitionBy("p").orc(nestedDir)
+      val rootPath = new Path(dir.getAbsolutePath)
+      val fileIndex = new InMemoryFileIndex(spark, Seq(rootPath), Map.empty, None)
+      val partitionValues = fileIndex.partitionSpec().partitions.map(_.values)
+      // Expect exactly two partitions, each with a single partition column value.
+      assert(partitionValues.length == 2)
+      assert(partitionValues.forall(_.numFields == 1))
+    }
+  }
 }
 
 object DeletionRaceFileSystem {