Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
baca3e9
Fix timestamp_millis issue
linliu-code Jan 14, 2026
112d3f6
Solve more compiling errors
linliu-code Jan 14, 2026
b132c9b
Fix some bugs
linliu-code Jan 21, 2026
b0cc6a2
Fix incremental queries for auto repair
linliu-code Jan 26, 2026
81c4d2b
Fix data skipping support
linliu-code Jan 28, 2026
19796f1
Pass schema from option instead of global configuration for thread sa…
linliu-code Jan 28, 2026
35a2995
Address partial comments
linliu-code Jan 28, 2026
6b97dce
Address more comments
linliu-code Jan 29, 2026
5d472f2
changing code parts around HoodieStorage (introduced in 0.15.0.) to u…
Feb 5, 2026
eee51be
Addressed comments
linliu-code Jan 30, 2026
ba8c90e
address wiring comments
linliu-code Jan 30, 2026
435bac2
Fix hive related tests
Feb 2, 2026
7be4046
Fix tests
Feb 3, 2026
94526f4
Fix tests
Feb 4, 2026
d3b7897
Fix tests
Feb 4, 2026
ac14313
Fix test and address review comments
Feb 4, 2026
bd20879
Cherry pick bug fixes
Feb 5, 2026
94dff70
Fix build
Feb 4, 2026
d17b0e8
Fix build
Feb 5, 2026
808d4a4
Revert "Fix tests"
Feb 6, 2026
4f3563a
Revert "Fix build"
Feb 6, 2026
4e5ed98
Fix build issues - 2.11 depending on 2.12 version
Feb 6, 2026
cf421c8
Fix build issues - Column stream explicty collection.2
Feb 6, 2026
2f0ae3b
Fix build issues - Use projection schema instead of repairFileSchema
Feb 6, 2026
a10b077
Fix build issues - remove spark3.5 profiles from github workflow bot
Feb 7, 2026
ac619b6
Fix build issues - Fix IT test pom.xml update
Feb 7, 2026
0f39b53
Fix build issues - Use projection schema fields from repaired schema
Feb 8, 2026
e2af120
Fix build issues - throw exception only for spark 3.4 gt
Feb 8, 2026
7508f66
Fix build issues - throw exception only for spark 3.3
Feb 8, 2026
6a7a5fb
Fix build issues - remove exceptions
Feb 9, 2026
6bea797
Fix build issues - remove exceptions for spark33
Feb 9, 2026
281228a
Fix build issues - remove exceptions for spark33
Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 73 additions & 9 deletions .github/workflows/bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -177,29 +177,93 @@ jobs:
java-version: '17'
distribution: 'adopt'
architecture: x64
cache: maven
- name: Verify Java 17 version
run: |
echo "JAVA_HOME: $JAVA_HOME"
java -version
which java
- name: Quickstart Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
run:
mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-examples/hudi-examples-spark $MVN_ARGS
- name: UT - Common & Spark
run: |
export PATH="$JAVA_HOME/bin:$PATH"
mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl hudi-examples/hudi-examples-spark $MVN_ARGS
- name: Java UT - Common & Spark
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SPARK_MODULES: ${{ matrix.sparkModules }}
if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI
run:
mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS
- name: FT - Spark
run: |
export PATH="$JAVA_HOME/bin:$PATH"
mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS
- name: Java FT - Spark
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SPARK_MODULES: ${{ matrix.sparkModules }}
if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI
run:
if: ${{ !endsWith(env.SPARK_PROFILE, '3.4') }} # skip test spark 3.4 as it's covered by Azure CI
run: |
export PATH="$JAVA_HOME/bin:$PATH"
mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS

test-spark-java17-scala-tests:
runs-on: ubuntu-latest
strategy:
matrix:
include:
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
sparkModules: "hudi-spark-datasource/hudi-spark3.3.x"
- scalaProfile: "scala-2.12"
sparkProfile: "spark3.4"
sparkModules: "hudi-spark-datasource/hudi-spark3.4.x"

steps:
- uses: actions/checkout@v3
- name: Set up JDK 8
uses: actions/setup-java@v3
with:
java-version: '8'
distribution: 'temurin'
architecture: x64
cache: maven
- name: Build Project
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
run:
mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES"
- name: Set up JDK 17
uses: actions/setup-java@v3
with:
java-version: '17'
distribution: 'temurin'
architecture: x64
cache: maven
- name: Verify Java 17 version
run: |
echo "JAVA_HOME: $JAVA_HOME"
java -version
which java
- name: Scala UT - Common & Spark
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SPARK_MODULES: ${{ matrix.sparkModules }}
run: |
export PATH="$JAVA_HOME/bin:$PATH"
mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS
- name: Scala FT - Spark
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SPARK_MODULES: ${{ matrix.sparkModules }}
run: |
export PATH="$JAVA_HOME/bin:$PATH"
mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS

test-flink:
runs-on: ubuntu-latest
strategy:
Expand Down
6 changes: 3 additions & 3 deletions azure-pipelines-20230430.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ parameters:
default:
- 'hudi-spark-datasource'
- 'hudi-spark-datasource/hudi-spark'
- 'hudi-spark-datasource/hudi-spark3.2.x'
- 'hudi-spark-datasource/hudi-spark3.4.x'
- 'hudi-spark-datasource/hudi-spark3.2plus-common'
- 'hudi-spark-datasource/hudi-spark3-common'
- 'hudi-spark-datasource/hudi-spark-common'
Expand All @@ -73,7 +73,7 @@ parameters:
- '!hudi-flink-datasource/hudi-flink1.18.x'
- '!hudi-spark-datasource'
- '!hudi-spark-datasource/hudi-spark'
- '!hudi-spark-datasource/hudi-spark3.2.x'
- '!hudi-spark-datasource/hudi-spark3.4.x'
- '!hudi-spark-datasource/hudi-spark3.2plus-common'
- '!hudi-spark-datasource/hudi-spark3-common'
- '!hudi-spark-datasource/hudi-spark-common'
Expand All @@ -97,7 +97,7 @@ parameters:
- '!hudi-flink-datasource/hudi-flink1.18.x'

variables:
BUILD_PROFILES: '-Dscala-2.12 -Dspark3.2 -Dflink1.18'
BUILD_PROFILES: '-Dscala-2.12 -Dspark3.4 -Dflink1.18'
PLUGIN_OPTS: '-Dcheckstyle.skip=true -Drat.skip=true -Djacoco.skip=true -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn'
MVN_OPTS_INSTALL: '-Phudi-platform-service -DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS) -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5'
MVN_OPTS_TEST: '-fae -Pwarn-log $(BUILD_PROFILES) $(PLUGIN_OPTS)'
Expand Down
7 changes: 7 additions & 0 deletions hudi-cli/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@
</exclusions>
</dependency>

<!-- Validation API - override old 1.1.0 version from transitive dependencies -->
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>2.0.1.Final</version>
</dependency>

<!-- Scala -->
<dependency>
<groupId>org.scala-lang</groupId>
Expand Down
46 changes: 46 additions & 0 deletions hudi-client/hudi-client-common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,48 @@
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-timeline-service</artifactId>
<version>${project.version}</version>
<!-- Exclude Jetty from timeline-service to use our managed version -->
<exclusions>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>

<!-- Jetty: Explicitly declare all Jetty dependencies to ensure version alignment -->
<!-- This is critical when running in Spark/Hadoop environments that may have older Jetty versions -->
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-http</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-io</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-xml</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-security</artifactId>
</dependency>

<dependency>
Expand Down Expand Up @@ -188,6 +230,10 @@
<groupId>org.pentaho</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
</exclusion>
</exclusions>
</dependency>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.hudi.io;

import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;
Expand Down Expand Up @@ -59,9 +60,12 @@ public HoodieMergedReadHandle(HoodieWriteConfig config,
HoodieTable<T, I, K, O> hoodieTable,
Pair<String, String> partitionPathFileIDPair) {
super(config, instantTime, hoodieTable, partitionPathFileIDPair);
readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
Schema orignalReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
// config.getSchema is not canonicalized, while config.getWriteSchema is canonicalized. So, we have to use the canonicalized schema to read the existing data.
baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField());
// Repair reader schema.
// Assume writer schema should be correct. If not, no repair happens.
readerSchema = AvroSchemaUtils.getRepairedSchema(orignalReaderSchema, baseFileReaderSchema);
}

public List<HoodieRecord<T>> getMergedRecords() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;

import static java.util.concurrent.TimeUnit.MICROSECONDS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.DATE_TIME_PARSER;
Expand All @@ -54,7 +55,7 @@
*/
public class TimestampBasedAvroKeyGenerator extends SimpleAvroKeyGenerator {
public enum TimestampType implements Serializable {
UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, SCALAR
UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, EPOCHMICROSECONDS, SCALAR
}

private final TimeUnit timeUnit;
Expand Down Expand Up @@ -93,6 +94,9 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException
case EPOCHMILLISECONDS:
timeUnit = MILLISECONDS;
break;
case EPOCHMICROSECONDS:
timeUnit = MICROSECONDS;
break;
case UNIX_TIMESTAMP:
timeUnit = SECONDS;
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package org.apache.hudi.table.action.commit;

import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
Expand Down Expand Up @@ -86,7 +87,8 @@ public void runMerge(HoodieTable<?, ?, ?, ?> table,
HoodieFileReader bootstrapFileReader = null;

Schema writerSchema = mergeHandle.getWriterSchemaWithMetaFields();
Schema readerSchema = baseFileReader.getSchema();
Schema readerSchema = AvroSchemaUtils.getRepairedSchema(baseFileReader.getSchema(), writerSchema);


// In case Advanced Schema Evolution is enabled we might need to rewrite currently
// persisted records to adhere to an evolved schema
Expand Down
24 changes: 24 additions & 0 deletions hudi-client/hudi-spark-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@
<groupId>org.pentaho</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
Expand Down Expand Up @@ -253,6 +257,26 @@
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.5.0</version>
<executions>
<execution>
<id>add-spark32plus-parquet-sources</id>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<skipAddSource>${spark31orEarlier}</skipAddSource>
<sources>
<source>src/parquet/scala</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>

<resources>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.spark.sql.types.UserDefinedType;
import org.apache.spark.sql.types.VarcharType;

import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.util.ArrayList;
Expand All @@ -80,6 +81,21 @@ private SparkInternalSchemaConverter() {
public static final String HOODIE_TABLE_PATH = "hoodie.tablePath";
public static final String HOODIE_VALID_COMMITS_LIST = "hoodie.valid.commits.list";

/**
* Get TimestampNTZType$ using reflection, as it's only available in Spark 3.3+.
* Falls back to TimestampType$ if TimestampNTZType is not available.
*/
private static DataType getTimestampNTZType() {
try {
Class<?> timestampNTZTypeClass = Class.forName("org.apache.spark.sql.types.TimestampNTZType$");
Field moduleField = timestampNTZTypeClass.getField("MODULE$");
return (DataType) moduleField.get(null);
} catch (ClassNotFoundException | NoSuchFieldException | IllegalAccessException e) {
// TimestampNTZType is not available in this Spark version, fall back to TimestampType
return TimestampType$.MODULE$;
}
}

public static Type buildTypeFromStructType(DataType sparkType, Boolean firstVisitRoot, AtomicInteger nextId) {
if (sparkType instanceof StructType) {
StructField[] fields = ((StructType) sparkType).fields();
Expand Down Expand Up @@ -265,10 +281,14 @@ private static DataType constructSparkSchemaFromType(Type type) {
case DATE:
return DateType$.MODULE$;
case TIME:
case TIME_MILLIS:
throw new UnsupportedOperationException(String.format("cannot convert %s type to Spark", type));
case TIMESTAMP:
// todo support TimeStampNTZ
case TIMESTAMP_MILLIS:
return TimestampType$.MODULE$;
case LOCAL_TIMESTAMP_MILLIS:
case LOCAL_TIMESTAMP_MICROS:
return getTimestampNTZType();
case STRING:
return StringType$.MODULE$;
case UUID:
Expand Down
Loading
Loading