Commit 87efa59

Fix more CI issues
1 parent: 5484dcd

8 files changed, +91 -73 lines changed


hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java

Lines changed: 2 additions & 2 deletions
@@ -72,7 +72,7 @@ public class TestTableSchemaEvolution extends HoodieClientTestBase {
   public static final String EXTRA_FIELD_WITHOUT_DEFAULT_SCHEMA =
       "{\"name\": \"new_field_without_default\", \"type\": \"boolean\"},";
   public static final String EXTRA_FIELD_NULLABLE_SCHEMA =
-      ",{\"name\": \"new_field_without_default\", \"type\": [\"boolean\", \"null\"]}";
+      "{\"name\": \"new_field_without_default\", \"type\": [\"null\", \"boolean\"], \"default\": null},";

   // TRIP_EXAMPLE_SCHEMA with a new_field added
   public static final String TRIP_EXAMPLE_SCHEMA_EVOLVED_COL_ADDED = TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA

@@ -152,7 +152,7 @@ public void testSchemaCompatibilityBasic() {
         "Added field without default and not nullable is not compatible (Evolved Schema)");

     assertTrue(isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA
-        + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + TRIP_SCHEMA_SUFFIX + EXTRA_FIELD_NULLABLE_SCHEMA, false),
+        + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + EXTRA_FIELD_NULLABLE_SCHEMA + TRIP_SCHEMA_SUFFIX, false),
         "Added nullable field is compatible (Evolved Schema)");
   }
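A note on the fix above: Avro only accepts a null default for a union field when "null" is the first branch of the union, so the nullable field has to be declared as ["null", "boolean"] with "default": null; the constant also now ends with a comma and is concatenated before TRIP_SCHEMA_SUFFIX instead of after it. A minimal standalone sketch of a valid nullable field (record name is illustrative, not taken from the test):

    import org.apache.avro.Schema

    // "null" is listed first so the null default resolves against the first union branch
    val recordWithNullableField = new Schema.Parser().parse(
      """{"type": "record", "name": "example", "fields": [
        |  {"name": "new_field_without_default", "type": ["null", "boolean"], "default": null}
        |]}""".stripMargin)
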

hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,8 @@ public class JsonUtils {
     MAPPER.setVisibility(PropertyAccessor.IS_GETTER, JsonAutoDetect.Visibility.NONE);
     MAPPER.setVisibility(PropertyAccessor.SETTER, JsonAutoDetect.Visibility.NONE);
     MAPPER.setVisibility(PropertyAccessor.CREATOR, JsonAutoDetect.Visibility.NONE);
+    // NOTE: Registering [[JavaTimeModule]] is required for Jackson >= 2.11 (Spark >= 3.3)
+    MAPPER.registerModule(new JavaTimeModule());
   }

   public static ObjectMapper getObjectMapper() {
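For context on the change above: a plain ObjectMapper has no serializers for java.time types, so writing values such as Instant fails with an InvalidDefinitionException until the jsr310 JavaTimeModule is registered. A minimal standalone sketch (not Hudi code):

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule

    val mapper = new ObjectMapper()
    mapper.registerModule(new JavaTimeModule()) // same registration JsonUtils now performs
    // With the module registered, java.time.Instant serializes instead of throwing
    val json = mapper.writeValueAsString(java.time.Instant.parse("2020-01-01T00:00:00Z"))
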

hudi-hadoop-common/src/main/java/org/apache/hudi/io/hadoop/HoodieAvroParquetReader.java

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ protected ClosableIterator<IndexedRecord> getIndexedRecordIterator(Schema schema

   @Override
   public ClosableIterator<IndexedRecord> getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException {
-    return getIndexedRecordIteratorInternal(readerSchema, Option.of(requestedSchema));
+    return getIndexedRecordIteratorInternal(requestedSchema, Option.empty());
   }

   @Override
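The change above passes the pruned requestedSchema as the primary schema and drops the separate projection option. For readers unfamiliar with requested-schema projection in general, a rough standalone sketch using the public parquet-avro API (not Hudi's internal reader; filePath, requestedSchema and conf are assumed inputs):

    import org.apache.avro.Schema
    import org.apache.avro.generic.GenericRecord
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}

    def readProjected(filePath: Path, requestedSchema: Schema, conf: Configuration): Unit = {
      // Decode only the columns present in the requested (projected) schema
      AvroReadSupport.setRequestedProjection(conf, requestedSchema)
      val reader = AvroParquetReader.builder[GenericRecord](filePath).withConf(conf).build()
      try {
        var record = reader.read()
        while (record != null) {
          println(record)
          record = reader.read()
        }
      } finally {
        reader.close()
      }
    }
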

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroSchemaResolutionSupport.scala

Lines changed: 11 additions & 6 deletions
@@ -23,11 +23,12 @@ import org.apache.hudi.common.model.HoodieTableType
 import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.exception.SchemaCompatibilityException
 import org.apache.hudi.testutils.HoodieClientTestBase
-
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 import org.apache.spark.sql.types._
 import org.junit.jupiter.api.{AfterEach, BeforeEach}
+import org.junit.jupiter.api.Assertions.assertDoesNotThrow
+import org.junit.jupiter.api.function.Executable
 import org.junit.jupiter.params.ParameterizedTest
 import org.junit.jupiter.params.provider.{CsvSource, ValueSource}

@@ -818,8 +819,8 @@ class TestAvroSchemaResolutionSupport extends HoodieClientTestBase with ScalaAss
   }

   @ParameterizedTest
-  @ValueSource(booleans = Array(true, false))
-  def testNestedTypeVectorizedReadWithTypeChange(isCow: Boolean): Unit = {
+  @ValueSource(strings = Array("COPY_ON_WRITE", "MERGE_ON_READ"))
+  def testNestedTypeVectorizedReadWithTypeChange(tableType: String): Unit = {
     // test to change the value type of a MAP in a column of ARRAY< MAP<k,v> > type
     val tempRecordPath = basePath + "/record_tbl/"
     val arrayMapData = Seq(

@@ -836,7 +837,7 @@ class TestAvroSchemaResolutionSupport extends HoodieClientTestBase with ScalaAss
     df1.show(false)

     // recreate table
-    initialiseTable(df1, tempRecordPath, isCow)
+    initialiseTable(df1, tempRecordPath, tableType.equals("COPY_ON_WRITE"))

     // read out the table, will not throw any exception
     readTable(tempRecordPath)

@@ -855,15 +856,19 @@ class TestAvroSchemaResolutionSupport extends HoodieClientTestBase with ScalaAss
     df2.printSchema()
     df2.show(false)
     // upsert
-    upsertData(df2, tempRecordPath, isCow)
+    upsertData(df2, tempRecordPath, tableType.equals("COPY_ON_WRITE"))

     // after implicit type change, read the table with vectorized read enabled
     if (HoodieSparkUtils.gteqSpark3_3) {
-      assertThrows(classOf[SparkException]){
+      assertThrows(classOf[SparkException]) {
         withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "true") {
           readTable(tempRecordPath)
         }
       }
+    } else {
+      withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "true") {
+        readTable(tempRecordPath)
+      }
     }

     withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "false") {
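With the changes above, the test covers both branches: on Spark 3.3+ the nested-column vectorized Parquet reader is expected to fail on the implicit type change (hence assertThrows), while on older Spark versions the flag does not exist and the read should succeed. Toggling the reader is only a session config; a minimal sketch with an assumed table path:

    // The nested-column vectorized Parquet reader was introduced in Spark 3.3;
    // on earlier versions this flag has no effect.
    spark.conf.set("spark.sql.parquet.enableNestedColumnVectorizedReader", "true")
    val df = spark.read.format("hudi").load("/tmp/hudi/record_tbl") // assumed path
    df.show(false)
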

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala

Lines changed: 63 additions & 60 deletions
@@ -1132,70 +1132,73 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin
     assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count())
   }

+  @ParameterizedTest
   @CsvSource(Array("avro, 6", "parquet, 6"))
   def testLogicalTypesReadRepair(logBlockFormat: String, tableVersion: Int): Unit = {
-    val logBlockString = if (logBlockFormat == "avro") {
-      ""
-    } else {
-      "_parquet_log"
-    }
-    val prevTimezone = spark.conf.get("spark.sql.session.timeZone")
-    val propertyValue: String = System.getProperty("spark.testing")
-    try {
-      if (HoodieSparkUtils.isSpark3_3) {
-        System.setProperty("spark.testing", "true")
+    if (HoodieSparkUtils.gteqSpark3_4) {
+      val logBlockString = if (logBlockFormat == "avro") {
+        ""
+      } else {
+        "_parquet_log"
       }
-      spark.conf.set("spark.sql.session.timeZone", "UTC")
-      val tableName = "trips_logical_types_json_mor_read_v" + tableVersion + logBlockString
-      val dataPath = "file://" + basePath + "/" + tableName
-      val zipOutput = Paths.get(new URI(dataPath))
-      HoodieTestUtils.extractZipToDirectory("/" + tableName + ".zip", zipOutput, getClass)
-      val tableBasePath = zipOutput.toString
-
-      val df = spark.read.format("hudi").load(tableBasePath)
-
-      val rows = df.collect()
-      assertEquals(20, rows.length)
-      for (row <- rows) {
-        val hash = row.get(6).asInstanceOf[String].hashCode()
-        if ((hash & 1)== 0) {
-          assertEquals("2020-01-01T00:00:00.001Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString)
-          assertEquals("2020-06-01T12:00:00.000001Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString)
-          assertEquals("2015-05-20T12:34:56.001", row.get(17).toString)
-          assertEquals("2017-07-07T07:07:07.000001", row.get(18).toString)
-        } else {
-          assertEquals("2019-12-31T23:59:59.999Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString)
-          assertEquals("2020-06-01T11:59:59.999999Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString)
-          assertEquals("2015-05-20T12:34:55.999", row.get(17).toString)
-          assertEquals("2017-07-07T07:07:06.999999", row.get(18).toString)
+      val prevTimezone = spark.conf.get("spark.sql.session.timeZone")
+      val propertyValue: String = System.getProperty("spark.testing")
+      try {
+        if (HoodieSparkUtils.isSpark3_3) {
+          System.setProperty("spark.testing", "true")
         }
-      }
-      assertEquals(10, df.filter("ts_millis > timestamp('2020-01-01 00:00:00Z')").count())
-      assertEquals(10, df.filter("ts_millis < timestamp('2020-01-01 00:00:00Z')").count())
-      assertEquals(0, df.filter("ts_millis > timestamp('2020-01-01 00:00:00.001Z')").count())
-      assertEquals(0, df.filter("ts_millis < timestamp('2019-12-31 23:59:59.999Z')").count())
-
-      assertEquals(10, df.filter("ts_micros > timestamp('2020-06-01 12:00:00Z')").count())
-      assertEquals(10, df.filter("ts_micros < timestamp('2020-06-01 12:00:00Z')").count())
-      assertEquals(0, df.filter("ts_micros > timestamp('2020-06-01 12:00:00.000001Z')").count())
-      assertEquals(0, df.filter("ts_micros < timestamp('2020-06-01 11:59:59.999999Z')").count())
-
-      assertEquals(10, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count())
-      assertEquals(10, df.filter("local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count())
-      assertEquals(0, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)").count())
-      assertEquals(0, df.filter("local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)").count())
-
-      assertEquals(10, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count())
-      assertEquals(10, df.filter("local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count())
-      assertEquals(0, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)").count())
-      assertEquals(0, df.filter("local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)").count())
-    } finally {
-      spark.conf.set("spark.sql.session.timeZone", prevTimezone)
-      if (HoodieSparkUtils.isSpark3_3) {
-        if (propertyValue == null) {
-          System.clearProperty("spark.testing")
-        } else {
-          System.setProperty("spark.testing", propertyValue)
+        spark.conf.set("spark.sql.session.timeZone", "UTC")
+        val tableName = "trips_logical_types_json_mor_read_v" + tableVersion + logBlockString
+        val dataPath = "file://" + basePath + "/" + tableName
+        val zipOutput = Paths.get(new URI(dataPath))
+        HoodieTestUtils.extractZipToDirectory("/" + tableName + ".zip", zipOutput, getClass)
+        val tableBasePath = zipOutput.toString
+
+        val df = spark.read.format("hudi").load(tableBasePath)
+
+        val rows = df.collect()
+        assertEquals(20, rows.length)
+        for (row <- rows) {
+          val hash = row.get(6).asInstanceOf[String].hashCode()
+          if ((hash & 1) == 0) {
+            assertEquals("2020-01-01T00:00:00.001Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString)
+            assertEquals("2020-06-01T12:00:00.000001Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString)
+            assertEquals("2015-05-20T12:34:56.001", row.get(17).toString)
+            assertEquals("2017-07-07T07:07:07.000001", row.get(18).toString)
+          } else {
+            assertEquals("2019-12-31T23:59:59.999Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString)
+            assertEquals("2020-06-01T11:59:59.999999Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString)
+            assertEquals("2015-05-20T12:34:55.999", row.get(17).toString)
+            assertEquals("2017-07-07T07:07:06.999999", row.get(18).toString)
+          }
+        }
+        assertEquals(10, df.filter("ts_millis > timestamp('2020-01-01 00:00:00Z')").count())
+        assertEquals(10, df.filter("ts_millis < timestamp('2020-01-01 00:00:00Z')").count())
+        assertEquals(0, df.filter("ts_millis > timestamp('2020-01-01 00:00:00.001Z')").count())
+        assertEquals(0, df.filter("ts_millis < timestamp('2019-12-31 23:59:59.999Z')").count())
+
+        assertEquals(10, df.filter("ts_micros > timestamp('2020-06-01 12:00:00Z')").count())
+        assertEquals(10, df.filter("ts_micros < timestamp('2020-06-01 12:00:00Z')").count())
+        assertEquals(0, df.filter("ts_micros > timestamp('2020-06-01 12:00:00.000001Z')").count())
+        assertEquals(0, df.filter("ts_micros < timestamp('2020-06-01 11:59:59.999999Z')").count())
+
+        assertEquals(10, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count())
+        assertEquals(10, df.filter("local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count())
+        assertEquals(0, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)").count())
+        assertEquals(0, df.filter("local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)").count())
+
+        assertEquals(10, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count())
+        assertEquals(10, df.filter("local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count())
+        assertEquals(0, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)").count())
+        assertEquals(0, df.filter("local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)").count())
+      } finally {
+        spark.conf.set("spark.sql.session.timeZone", prevTimezone)
+        if (HoodieSparkUtils.isSpark3_3) {
+          if (propertyValue == null) {
+            System.clearProperty("spark.testing")
+          } else {
+            System.setProperty("spark.testing", propertyValue)
+          }
         }
       }
     }
   }
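The body of testLogicalTypesReadRepair is unchanged apart from the new HoodieSparkUtils.gteqSpark3_4 guard; its TIMESTAMP_NTZ assertions rely on local-timestamp support that older Spark versions lack, which appears to be why the test is now skipped below Spark 3.4. For reference, the four logical types the test exercises can be declared with the standard Avro API (standalone sketch; the local-timestamp variants need Avro 1.10+):

    import org.apache.avro.{LogicalTypes, Schema}

    // timestamp-millis / timestamp-micros surface in Spark as TIMESTAMP,
    // local-timestamp-millis / local-timestamp-micros as TIMESTAMP_NTZ.
    val tsMillis      = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))
    val tsMicros      = LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))
    val localTsMillis = LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))
    val localTsMicros = LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))
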

hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark34LegacyHoodieParquetFileFormat.scala

Lines changed: 6 additions & 2 deletions
@@ -78,7 +78,11 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu
       )
     }
   }
-  private lazy val hasTimestampMillisFieldInTableSchema = true
+  private lazy val hasTimestampMillisFieldInTableSchema = if (avroTableSchema == null) {
+    true
+  } else {
+    AvroSchemaRepair.hasTimestampMillisField(avroTableSchema)
+  }
   private lazy val supportBatchWithTableSchema = !hasTimestampMillisFieldInTableSchema

   def supportsColumnar(sparkSession: SparkSession, schema: StructType): Boolean = {

@@ -95,7 +99,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu
    */
   override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = {
     val conf = sparkSession.sessionState.conf
-    ParquetUtils.isBatchReadSupportedForSchema(conf, schema)
+    ParquetUtils.isBatchReadSupportedForSchema(conf, schema) && supportBatchWithTableSchema
   }

   override def buildReaderWithPartitionValues(sparkSession: SparkSession,
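These two edits make the legacy file format fall back from vectorized batch reads whenever the table's Avro schema contains a timestamp-millis field (the Spark 3.5 variant below keeps batch reads on Spark 3.5+ regardless). AvroSchemaRepair.hasTimestampMillisField itself is not part of this commit; purely to illustrate what such a check typically involves, a recursive walk over an Avro schema might look like the following (hypothetical sketch, not the actual implementation, and it does not guard against recursive record definitions):

    import org.apache.avro.{LogicalTypes, Schema}
    import scala.collection.JavaConverters._

    // Hypothetical: true if any (possibly nested) field uses the timestamp-millis logical type.
    def containsTimestampMillis(schema: Schema): Boolean = schema.getType match {
      case Schema.Type.RECORD => schema.getFields.asScala.exists(f => containsTimestampMillis(f.schema()))
      case Schema.Type.UNION  => schema.getTypes.asScala.exists(containsTimestampMillis)
      case Schema.Type.ARRAY  => containsTimestampMillis(schema.getElementType)
      case Schema.Type.MAP    => containsTimestampMillis(schema.getValueType)
      case _                  => schema.getLogicalType.isInstanceOf[LogicalTypes.TimestampMillis]
    }
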

hudi-spark-datasource/hudi-spark3.5.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark35LegacyHoodieParquetFileFormat.scala

Lines changed: 5 additions & 1 deletion
@@ -80,7 +80,11 @@ class Spark35LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu
       )
     }
   }
-  private lazy val hasTimestampMillisFieldInTableSchema = true
+  private lazy val hasTimestampMillisFieldInTableSchema = if (avroTableSchema == null) {
+    true
+  } else {
+    AvroSchemaRepair.hasTimestampMillisField(avroTableSchema)
+  }
   private lazy val supportBatchWithTableSchema = HoodieSparkUtils.gteqSpark3_5 || !hasTimestampMillisFieldInTableSchema

   def supportsColumnar(sparkSession: SparkSession, schema: StructType): Boolean = {

hudi-utilities/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -354,7 +354,7 @@
     <dependency>
       <groupId>org.apache.kafka</groupId>
       <artifactId>kafka_${scala.binary.version}</artifactId>
-      <version>${kafka.version}</version>
+      <version>${kafka.spark3.version}</version>
       <scope>test</scope>
     </dependency>
