diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index fd3cc67976a16..5c851b8041c34 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -177,29 +177,197 @@ jobs: java-version: '17' distribution: 'adopt' architecture: x64 + cache: maven + - name: Verify Java 17 version + run: | + echo "JAVA_HOME: $JAVA_HOME" + java -version + which java - name: Quickstart Test + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: | + export PATH="$JAVA_HOME/bin:$PATH" + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl hudi-examples/hudi-examples-spark $MVN_ARGS + - name: Java UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + run: | + export PATH="$JAVA_HOME/bin:$PATH" + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Java FT - Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + if: ${{ !endsWith(env.SPARK_PROFILE, '3.4') }} # skip test spark 3.4 as it's covered by Azure CI + run: | + export PATH="$JAVA_HOME/bin:$PATH" + mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + + test-spark-java17-scala-tests: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.3" + sparkModules: "hudi-spark-datasource/hudi-spark3.3.x" + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.4" + sparkModules: "hudi-spark-datasource/hudi-spark3.4.x" + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + 
java-version: '8' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Build Project env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} run: - mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl hudi-examples/hudi-examples-spark $MVN_ARGS - - name: UT - Common & Spark + mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + java-version: '17' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Verify Java 17 version + run: | + echo "JAVA_HOME: $JAVA_HOME" + java -version + which java + - name: Scala UT - Common & Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} SPARK_MODULES: ${{ matrix.sparkModules }} - if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI + run: | + export PATH="$JAVA_HOME/bin:$PATH" + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Scala FT - Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + run: | + export PATH="$JAVA_HOME/bin:$PATH" + mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + + test-spark-java11-17-java-tests: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.5" + sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + - scalaProfile: "scala-2.13" + sparkProfile: "spark3.5" + sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + + steps: + - uses: 
actions/checkout@v3 + - name: Set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Build Project + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} run: - mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS - - name: FT - Spark + mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + java-version: '17' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Quickstart Test + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl hudi-examples/hudi-examples-spark $MVN_ARGS + - name: Java UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + run: + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DwildcardSuites=skipScalaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Java FT - Spark env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} SPARK_MODULES: ${{ matrix.sparkModules }} - if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 as it's covered by Azure CI run: mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + test-spark-java11-17-scala-tests: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scalaProfile: "scala-2.12" + sparkProfile: "spark3.5" + 
sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + - scalaProfile: "scala-2.13" + sparkProfile: "spark3.5" + sparkModules: "hudi-spark-datasource/hudi-spark3.5.x" + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Build Project + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + run: + mvn clean install -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -DskipTests=true $MVN_ARGS -am -pl "hudi-examples/hudi-examples-spark,hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" + - name: Set up JDK 17 + uses: actions/setup-java@v3 + with: + java-version: '17' + distribution: 'temurin' + architecture: x64 + cache: maven + - name: Scala UT - Common & Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + run: + mvn test -Punit-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "hudi-common,$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + - name: Scala FT - Spark + env: + SCALA_PROFILE: ${{ matrix.scalaProfile }} + SPARK_PROFILE: ${{ matrix.sparkProfile }} + SPARK_MODULES: ${{ matrix.sparkModules }} + run: + mvn test -Pfunctional-tests -Pjava17 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -Dtest=skipJavaTests -DfailIfNoTests=false -pl "$SPARK_COMMON_MODULES,$SPARK_MODULES" $MVN_ARGS + test-flink: runs-on: ubuntu-latest strategy: diff --git a/azure-pipelines-20230430.yml b/azure-pipelines-20230430.yml index 85d185fbc2c5c..59d48de4271e7 100644 --- a/azure-pipelines-20230430.yml +++ b/azure-pipelines-20230430.yml @@ -47,7 +47,7 @@ parameters: default: - 'hudi-spark-datasource' - 'hudi-spark-datasource/hudi-spark' - - 'hudi-spark-datasource/hudi-spark3.2.x' + - 'hudi-spark-datasource/hudi-spark3.4.x' - 'hudi-spark-datasource/hudi-spark3.2plus-common' - 
'hudi-spark-datasource/hudi-spark3-common' - 'hudi-spark-datasource/hudi-spark-common' @@ -73,7 +73,7 @@ parameters: - '!hudi-flink-datasource/hudi-flink1.18.x' - '!hudi-spark-datasource' - '!hudi-spark-datasource/hudi-spark' - - '!hudi-spark-datasource/hudi-spark3.2.x' + - '!hudi-spark-datasource/hudi-spark3.4.x' - '!hudi-spark-datasource/hudi-spark3.2plus-common' - '!hudi-spark-datasource/hudi-spark3-common' - '!hudi-spark-datasource/hudi-spark-common' @@ -97,7 +97,7 @@ parameters: - '!hudi-flink-datasource/hudi-flink1.18.x' variables: - BUILD_PROFILES: '-Dscala-2.12 -Dspark3.2 -Dflink1.18' + BUILD_PROFILES: '-Dscala-2.12 -Dspark3.4 -Dflink1.18' PLUGIN_OPTS: '-Dcheckstyle.skip=true -Drat.skip=true -Djacoco.skip=true -ntp -B -V -Pwarn-log -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.shade=warn -Dorg.slf4j.simpleLogger.log.org.apache.maven.plugins.dependency=warn' MVN_OPTS_INSTALL: '-Phudi-platform-service -DskipTests $(BUILD_PROFILES) $(PLUGIN_OPTS) -Dmaven.wagon.httpconnectionManager.ttlSeconds=25 -Dmaven.wagon.http.retryHandler.count=5' MVN_OPTS_TEST: '-fae -Pwarn-log $(BUILD_PROFILES) $(PLUGIN_OPTS)' diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml index 7a10ae12c35dd..e38327283df9f 100644 --- a/hudi-client/hudi-client-common/pom.xml +++ b/hudi-client/hudi-client-common/pom.xml @@ -53,6 +53,48 @@ org.apache.hudi hudi-timeline-service ${project.version} + + + + org.eclipse.jetty + * + + + + + + + + org.eclipse.jetty + jetty-server + + + org.eclipse.jetty + jetty-servlet + + + org.eclipse.jetty + jetty-http + + + org.eclipse.jetty + jetty-io + + + org.eclipse.jetty + jetty-util + + + org.eclipse.jetty + jetty-webapp + + + org.eclipse.jetty + jetty-xml + + + org.eclipse.jetty + jetty-security @@ -188,6 +230,10 @@ org.pentaho * + + org.codehaus.janino + janino + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergedReadHandle.java 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergedReadHandle.java index 738688c62193a..15ef4367d2d0d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergedReadHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergedReadHandle.java @@ -19,6 +19,7 @@ package org.apache.hudi.io; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieLogFile; @@ -59,9 +60,12 @@ public HoodieMergedReadHandle(HoodieWriteConfig config, HoodieTable hoodieTable, Pair partitionPathFileIDPair) { super(config, instantTime, hoodieTable, partitionPathFileIDPair); - readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField()); + Schema orignalReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField()); // config.getSchema is not canonicalized, while config.getWriteSchema is canonicalized. So, we have to use the canonicalized schema to read the existing data. baseFileReaderSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getWriteSchema()), config.allowOperationMetadataField()); + // Repair reader schema. + // Assume writer schema should be correct. If not, no repair happens. 
+ readerSchema = AvroSchemaUtils.getRepairedSchema(orignalReaderSchema, baseFileReaderSchema); } public List> getMergedRecords() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index 1990b2dab44ef..ea2e0911d3010 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -41,6 +41,7 @@ import java.util.TimeZone; import java.util.concurrent.TimeUnit; +import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.DATE_TIME_PARSER; @@ -54,7 +55,7 @@ */ public class TimestampBasedAvroKeyGenerator extends SimpleAvroKeyGenerator { public enum TimestampType implements Serializable { - UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, SCALAR + UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS, EPOCHMICROSECONDS, SCALAR } private final TimeUnit timeUnit; @@ -93,6 +94,9 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException case EPOCHMILLISECONDS: timeUnit = MILLISECONDS; break; + case EPOCHMICROSECONDS: + timeUnit = MICROSECONDS; + break; case UNIX_TIMESTAMP: timeUnit = SECONDS; break; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index dfa464d8af8b5..080479fc417d6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -45,6 +45,7 @@ import 
org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java index c1523d564e480..8b8848fa95d6e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.commit; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; @@ -86,7 +87,8 @@ public void runMerge(HoodieTable table, HoodieFileReader bootstrapFileReader = null; Schema writerSchema = mergeHandle.getWriterSchemaWithMetaFields(); - Schema readerSchema = baseFileReader.getSchema(); + Schema readerSchema = AvroSchemaUtils.getRepairedSchema(baseFileReader.getSchema(), writerSchema); + // In case Advanced Schema Evolution is enabled we might need to rewrite currently // persisted records to adhere to an evolved schema diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml index fa437494fd9f5..10b98bfb1dbba 100644 --- a/hudi-client/hudi-spark-client/pom.xml +++ b/hudi-client/hudi-spark-client/pom.xml @@ -169,6 +169,10 @@ org.pentaho * + + org.codehaus.janino + janino + @@ -253,6 +257,26 @@ org.apache.rat apache-rat-plugin + + org.codehaus.mojo + build-helper-maven-plugin + 
3.5.0 + + + add-spark32plus-parquet-sources + generate-sources + + add-source + + + ${spark31orEarlier} + + src/parquet/scala + + + + + diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java index 2b14bb3a0665b..7b7c18dd95c66 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkInternalSchemaConverter.java @@ -61,6 +61,7 @@ import org.apache.spark.sql.types.UserDefinedType; import org.apache.spark.sql.types.VarcharType; +import java.lang.reflect.Field; import java.nio.charset.StandardCharsets; import java.sql.Date; import java.util.ArrayList; @@ -80,6 +81,21 @@ private SparkInternalSchemaConverter() { public static final String HOODIE_TABLE_PATH = "hoodie.tablePath"; public static final String HOODIE_VALID_COMMITS_LIST = "hoodie.valid.commits.list"; + /** + * Get TimestampNTZType$ using reflection, as it's only available in Spark 3.3+. + * Falls back to TimestampType$ if TimestampNTZType is not available. 
+ */ + private static DataType getTimestampNTZType() { + try { + Class timestampNTZTypeClass = Class.forName("org.apache.spark.sql.types.TimestampNTZType$"); + Field moduleField = timestampNTZTypeClass.getField("MODULE$"); + return (DataType) moduleField.get(null); + } catch (ClassNotFoundException | NoSuchFieldException | IllegalAccessException e) { + // TimestampNTZType is not available in this Spark version, fall back to TimestampType + return TimestampType$.MODULE$; + } + } + public static Type buildTypeFromStructType(DataType sparkType, Boolean firstVisitRoot, AtomicInteger nextId) { if (sparkType instanceof StructType) { StructField[] fields = ((StructType) sparkType).fields(); @@ -265,10 +281,14 @@ private static DataType constructSparkSchemaFromType(Type type) { case DATE: return DateType$.MODULE$; case TIME: + case TIME_MILLIS: throw new UnsupportedOperationException(String.format("cannot convert %s type to Spark", type)); case TIMESTAMP: - // todo support TimeStampNTZ + case TIMESTAMP_MILLIS: return TimestampType$.MODULE$; + case LOCAL_TIMESTAMP_MILLIS: + case LOCAL_TIMESTAMP_MICROS: + return getTimestampNTZType(); case STRING: return StringType$.MODULE$; case UUID: diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java index 2a22eacea8c5a..45b0093b8ffd7 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java @@ -18,22 +18,25 @@ package org.apache.hudi.io.storage; -import org.apache.avro.Schema; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hudi.SparkAdapterSupport$; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.avro.HoodieAvroUtils; -import 
org.apache.hudi.common.model.HoodieSparkRecord; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieSparkRecord; import org.apache.hudi.common.util.BaseFileUtils; -import org.apache.hudi.common.util.collection.ClosableIterator; -import org.apache.hudi.common.util.collection.CloseableMappingIterator; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetReaderIterator; import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.ClosableIterator; +import org.apache.hudi.common.util.collection.CloseableMappingIterator; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.api.ReadSupport; import org.apache.parquet.schema.MessageType; @@ -53,6 +56,7 @@ import static org.apache.hudi.common.util.TypeUtils.unsafeCast; import static org.apache.parquet.avro.AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; public class HoodieSparkParquetReader implements HoodieSparkFileReader { @@ -60,6 +64,10 @@ public class HoodieSparkParquetReader implements HoodieSparkFileReader { private final Configuration conf; private final BaseFileUtils parquetUtils; private List readerIterators = new ArrayList<>(); + public static final String ENABLE_LOGICAL_TIMESTAMP_REPAIR = "spark.hudi.logicalTimestampField.repair.enable"; + private Option fileSchemaOption = Option.empty(); + private Option structTypeOption = Option.empty(); + private Option schemaOption = Option.empty(); public HoodieSparkParquetReader(Configuration conf, Path path) { this.path = path; @@ 
-118,35 +126,62 @@ private ClosableIterator getInternalRowIterator(Schema readerSchema if (requestedSchema == null) { requestedSchema = readerSchema; } - StructType readerStructType = HoodieInternalRowUtils.getCachedSchema(readerSchema); - StructType requestedStructType = HoodieInternalRowUtils.getCachedSchema(requestedSchema); - conf.set(ParquetReadSupport.PARQUET_READ_SCHEMA, readerStructType.json()); - conf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA(), requestedStructType.json()); - conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING().key(), (Boolean) SQLConf.get().getConf(SQLConf.PARQUET_BINARY_AS_STRING())); - conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), (Boolean) SQLConf.get().getConf(SQLConf.PARQUET_INT96_AS_TIMESTAMP())); - ParquetReader reader = ParquetReader.builder((ReadSupport) new ParquetReadSupport(), path) - .withConf(conf) + MessageType fileSchema = getFileSchema(); + Schema nonNullSchema = AvroSchemaUtils.getNonNullTypeFromUnion(requestedSchema); + Option messageSchema = Option.of(getAvroSchemaConverter(conf).convert(nonNullSchema)); + Pair readerSchemas = + SparkAdapterSupport$.MODULE$.sparkAdapter().getReaderSchemas(conf, readerSchema, requestedSchema, fileSchema); + conf.set(ParquetReadSupport.PARQUET_READ_SCHEMA, readerSchemas.getLeft().json()); + conf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA(), readerSchemas.getRight().json()); + conf.set(SQLConf.PARQUET_BINARY_AS_STRING().key(), SQLConf.get().getConf(SQLConf.PARQUET_BINARY_AS_STRING()).toString()); + conf.set(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), SQLConf.get().getConf(SQLConf.PARQUET_INT96_AS_TIMESTAMP()).toString()); + ParquetReader reader = ParquetReader.builder( + (ReadSupport) SparkAdapterSupport$.MODULE$.sparkAdapter().getParquetReadSupport(messageSchema), + new Path(path.toUri())).withConf(conf) .build(); ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader); readerIterators.add(parquetReaderIterator); return 
parquetReaderIterator; } + private MessageType getFileSchema() { + if (!fileSchemaOption.isPresent()) { + MessageType messageType = ((ParquetUtils) parquetUtils).readSchema(conf, path); + fileSchemaOption = Option.of(messageType); + } + return fileSchemaOption.get(); + } + @Override public Schema getSchema() { - // Some types in avro are not compatible with parquet. - // Avro only supports representing Decimals as fixed byte array - // and therefore if we convert to Avro directly we'll lose logical type-info. - MessageType messageType = ((ParquetUtils) parquetUtils).readSchema(conf, path); - StructType structType = new ParquetToSparkSchemaConverter(conf).convert(messageType); - return SparkAdapterSupport$.MODULE$.sparkAdapter() - .getAvroSchemaConverters() - .toAvroType(structType, true, messageType.getName(), StringUtils.EMPTY_STRING); + if (!schemaOption.isPresent()) { + // Some types in avro are not compatible with parquet. + // Avro only supports representing Decimals as fixed byte array + // and therefore if we convert to Avro directly we'll lose logical type-info. 
+ MessageType messageType = getFileSchema(); + StructType structType = getStructSchema(); + schemaOption = Option.of(SparkAdapterSupport$.MODULE$.sparkAdapter() + .getAvroSchemaConverters() + .toAvroType(structType, true, messageType.getName(), StringUtils.EMPTY_STRING)); + } + return schemaOption.get(); + } + + protected StructType getStructSchema() { + if (!structTypeOption.isPresent()) { + MessageType messageType = getFileSchema(); + structTypeOption = Option.of(convertToStruct(messageType)); + } + return structTypeOption.get(); + } + + private StructType convertToStruct(MessageType messageType) { + return new ParquetToSparkSchemaConverter(conf).convert(messageType); } @Override public void close() { - readerIterators.forEach(ParquetReaderIterator::close); + readerIterators.forEach(it -> it.close()); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java index 3a1b6d000becc..ebf77cbd7fde7 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowParquetWriteSupport.java @@ -18,11 +18,12 @@ package org.apache.hudi.io.storage.row; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.avro.HoodieBloomFilterWriteSupport; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.util.Option; + +import org.apache.hadoop.conf.Configuration; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; import org.apache.spark.sql.types.StructType; @@ -88,5 +89,4 @@ protected UTF8String dereference(UTF8String key) { return key.clone(); } } - } diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java index 2c3ddfdcda2ce..f6874e28d5d95 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java @@ -40,7 +40,6 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.Path; -import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -56,6 +55,7 @@ import java.util.function.Function; import static org.apache.hudi.io.HoodieBootstrapHandle.METADATA_BOOTSTRAP_RECORD_SCHEMA; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; class ParquetBootstrapMetadataHandler extends BaseBootstrapMetadataHandler { @@ -68,7 +68,7 @@ Schema getAvroSchema(Path sourceFilePath) throws IOException { ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath, ParquetMetadataConverter.NO_FILTER); MessageType parquetSchema = readFooter.getFileMetaData().getSchema(); - return new AvroSchemaConverter().convert(parquetSchema); + return getAvroSchemaConverter(table.getHadoopConf()).convert(parquetSchema); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala index d84679eaf923a..7948b1a4baa05 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala +++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala @@ -84,7 +84,7 @@ object AvroConversionUtils { recordNamespace: String): Row => GenericRecord = { val serde = sparkAdapter.createSparkRowSerDe(sourceSqlType) val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(sourceSqlType, structName, recordNamespace) - val nullable = AvroSchemaUtils.resolveNullableSchema(avroSchema) != avroSchema + val nullable = AvroSchemaUtils.getNonNullTypeFromUnion(avroSchema) != avroSchema val converter = AvroConversionUtils.createInternalRowToAvroConverter(sourceSqlType, avroSchema, nullable) @@ -97,8 +97,11 @@ object AvroConversionUtils { * TODO convert directly from GenericRecord into InternalRow instead */ def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = { - if (rdd.isEmpty()) { - ss.emptyDataFrame + // Avoid calling isEmpty() which can cause serialization issues with Ordering$Reverse + // Check partition count instead, which doesn't require task serialization + val structType = convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr)) + if (rdd.getNumPartitions == 0) { + ss.createDataFrame(ss.sparkContext.emptyRDD[Row], structType) } else { ss.createDataFrame(rdd.mapPartitions { records => if (records.isEmpty) Iterator.empty @@ -108,7 +111,7 @@ object AvroConversionUtils { val converter = createConverterToRow(schema, dataType) records.map { r => converter(r) } } - }, convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr))) + }, structType) } } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index a0fe879b3dbea..efd32bae76487 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -93,7 +93,7 @@ 
object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport wi // making Spark deserialize its internal representation [[InternalRow]] into [[Row]] for subsequent conversion // (and back) val sameSchema = writerAvroSchema.equals(readerAvroSchema) - val nullable = AvroSchemaUtils.resolveNullableSchema(writerAvroSchema) != writerAvroSchema + val nullable = AvroSchemaUtils.getNonNullTypeFromUnion(writerAvroSchema) != writerAvroSchema // NOTE: We have to serialize Avro schema, and then subsequently parse it on the executor node, since Spark // serializer is not able to digest it @@ -152,7 +152,7 @@ object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport wi // making Spark deserialize its internal representation [[InternalRow]] into [[Row]] for subsequent conversion // (and back) val sameSchema = writerAvroSchema.equals(readerAvroSchema) - val nullable = AvroSchemaUtils.resolveNullableSchema(writerAvroSchema) != writerAvroSchema + val nullable = AvroSchemaUtils.getNonNullTypeFromUnion(writerAvroSchema) != writerAvroSchema // NOTE: We have to serialize Avro schema, and then subsequently parse it on the executor node, since Spark // serializer is not able to digest it diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala index 799bda2f8e5ba..736f69fd17157 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkConversionUtils.scala @@ -27,7 +27,9 @@ import org.apache.spark.sql.{DataFrameUtil, Dataset, Row, SparkSession} object SparkConversionUtils { def createDataFrame[T](rdd: RDD[HoodieRecord[T]], ss: SparkSession, structType: StructType): Dataset[Row] = { - if (rdd.isEmpty()) { + // Avoid calling isEmpty() which can cause serialization issues with Ordering$Reverse + // Check 
partition count instead, which doesn't require task serialization + if (rdd.getNumPartitions == 0) { ss.emptyDataFrame } else { DataFrameUtil.createFromInternalRows(ss, structType, rdd.map(_.getData.asInstanceOf[InternalRow])) diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/HoodieSparkKryoRegistrar.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/HoodieSparkKryoRegistrar.scala index dd98227d4407c..b22a0f070364f 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/HoodieSparkKryoRegistrar.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/HoodieSparkKryoRegistrar.scala @@ -89,6 +89,8 @@ object HoodieSparkKryoRegistrar { private val KRYO_USER_REGISTRATORS = "spark.kryo.registrator" def register(conf: SparkConf): SparkConf = { - conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[HoodieSparkKryoRegistrar].getName).mkString(",")) + // Use class name directly to avoid Scala collection binary compatibility issues + // when compiled with Scala 2.13 but running with Spark 3.5 (Scala 2.12) + conf.set(KRYO_USER_REGISTRATORS, classOf[HoodieSparkKryoRegistrar].getName) } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala similarity index 78% rename from hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala rename to hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala index 599bbebe4f6c4..3e13e3bf7f9de 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala +++ 
b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormatHelper.scala @@ -1,34 +1,41 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.metadata.FileMetaData -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType} +import org.apache.spark.sql.types._ object HoodieParquetFileFormatHelper { - def buildImplicitSchemaChangeInfo(hadoopConf: Configuration, parquetFileMetaData: FileMetaData, requiredSchema: StructType): (java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]], StructType) = { - val implicitTypeChangeInfo: java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]] = new java.util.HashMap() val convert = new ParquetToSparkSchemaConverter(hadoopConf) val fileStruct = convert.convert(parquetFileMetaData.getSchema) + buildImplicitSchemaChangeInfo(fileStruct, requiredSchema) + } + + def buildImplicitSchemaChangeInfo(fileStruct: StructType, + requiredSchema: StructType): (java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]], StructType) = { + val implicitTypeChangeInfo: java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]] = new java.util.HashMap() + val fileStructMap = fileStruct.fields.map(f => (f.name, f.dataType)).toMap // if there are missing fields or if field's data type needs to be changed while reading, we handle it here. 
val sparkRequestStructFields = requiredSchema.map(f => { @@ -45,6 +52,7 @@ object HoodieParquetFileFormatHelper { } def isDataTypeEqual(requiredType: DataType, fileType: DataType): Boolean = (requiredType, fileType) match { + case (requiredType, fileType) if requiredType == fileType => true case (ArrayType(rt, _), ArrayType(ft, _)) => diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala index 1c6111afe47f3..e8058aa1f2248 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala @@ -19,14 +19,18 @@ package org.apache.spark.sql.hudi import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.table.HoodieTableMetaClient + +import org.apache.parquet.schema.MessageType import org.apache.spark.sql._ import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSchemaConverters, HoodieAvroSerializer} import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, InterpretedPredicate} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} @@ -36,6 +40,7 @@ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.parser.HoodieExtendedParserInterface import 
org.apache.spark.sql.sources.{BaseRelation, Filter} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, Metadata, StructType} import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} import org.apache.spark.storage.StorageLevel @@ -48,10 +53,21 @@ import java.util.{Locale, TimeZone} trait SparkAdapter extends Serializable { /** - * Checks whether provided instance of [[InternalRow]] is actually an instance of [[ColumnarBatchRow]] + * Checks whether provided instance of [[InternalRow]] is actually an instance of [[org.apache.spark.sql.vectorized.ColumnarBatchRow]] */ def isColumnarBatchRow(r: InternalRow): Boolean + def isTimestampNTZType(dataType: DataType): Boolean + + def getParquetReadSupport(messageSchema: org.apache.hudi.common.util.Option[MessageType]): org.apache.parquet.hadoop.api.ReadSupport[_] + + def repairSchemaIfSpecified(shouldRepair: Boolean, + fileSchema: MessageType, + tableSchemaOpt: org.apache.hudi.common.util.Option[MessageType]): MessageType + + def getReaderSchemas(conf: Configuration, readerSchema: Schema, requestedSchema: Schema, fileSchema: MessageType): + org.apache.hudi.common.util.collection.Pair[StructType, StructType] + /** * Creates Catalyst [[Metadata]] for Hudi's meta-fields (designating these w/ * [[METADATA_COL_ATTR_KEY]] if available (available in Spark >= 3.2) @@ -65,7 +81,7 @@ trait SparkAdapter extends Serializable { /** * Returns an instance of [[HoodieCatalogUtils]] providing for common utils operating on Spark's - * [[TableCatalog]]s + * [[org.apache.spark.sql.connector.catalog.TableCatalog]]s */ def getCatalogUtils: HoodieCatalogUtils @@ -169,7 +185,7 @@ trait SparkAdapter extends Serializable { /** * Create instance of [[ParquetFileFormat]] */ - def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] + def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] def 
makeColumnarBatch(vectors: Array[ColumnVector], numRows: Int): ColumnarBatch @@ -200,7 +216,7 @@ trait SparkAdapter extends Serializable { metadataColumns: Seq[AttributeReference] = Seq.empty): FileScanRDD /** - * Extract condition in [[DeleteFromTable]] + * Extract condition in [[org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable]] * SPARK-38626 condition is no longer Option in Spark 3.3 */ def extractDeleteCondition(deleteFromTable: Command): Expression @@ -214,4 +230,23 @@ trait SparkAdapter extends Serializable { * Tries to translate a Catalyst Expression into data source Filter */ def translateFilter(predicate: Expression, supportNestedPredicatePushdown: Boolean = false): Option[Filter] + + /** + * @param sparkSession Spark session (required for Spark 3.5 to access Analyzer) + * @param tableName table name + * @param expected expected attributes + * @param query query logical plan + * @param byName whether to match by name + * @param conf SQL configuration + * @return resolved logical plan + */ + def resolveOutputColumns(sparkSession: SparkSession, + tableName: String, + expected: Seq[Attribute], + query: LogicalPlan, + byName: Boolean, + conf: SQLConf): LogicalPlan = { + // Default implementation delegates to CatalystPlanUtils (for Spark < 3.5) + getCatalystPlanUtils.resolveOutputColumns(tableName, expected, query, byName, conf) + } } diff --git a/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieLegacyParquetFileFormatHelper.scala b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieLegacyParquetFileFormatHelper.scala new file mode 100644 index 0000000000000..b5669019bd066 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieLegacyParquetFileFormatHelper.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.spark.sql.HoodieSchemaUtils +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{ArrayTransform, Attribute, Cast, CreateNamedStruct, CreateStruct, Expression, GetStructField, LambdaFunction, Literal, MapEntries, MapFromEntries, NamedLambdaVariable, UnsafeProjection} +import org.apache.spark.sql.types._ + +object HoodieLegacyParquetFileFormatHelper { + def generateUnsafeProjection(fullSchema: Seq[Attribute], + timeZoneId: Option[String], + typeChangeInfos: java.util.Map[Integer, org.apache.hudi.common.util.collection.Pair[DataType, DataType]], + requiredSchema: StructType, + partitionSchema: StructType, + schemaUtils: HoodieSchemaUtils): UnsafeProjection = { + val addedCastCache = scala.collection.mutable.HashMap.empty[(DataType, DataType), Boolean] + + def hasUnsupportedConversion(src: DataType, dst: DataType): Boolean = { + addedCastCache.getOrElseUpdate((src, dst), { + (src, dst) match { + case (FloatType, DoubleType) => true + case (IntegerType, DecimalType()) => true + case (LongType, DecimalType()) => true + case (FloatType, DecimalType()) => true + case (DoubleType, DecimalType()) => 
true + case (StringType, DecimalType()) => true + case (StringType, DateType) => true + case (StructType(srcFields), StructType(dstFields)) => + srcFields.zip(dstFields).exists { case (sf, df) => hasUnsupportedConversion(sf.dataType, df.dataType) } + case (ArrayType(sElem, _), ArrayType(dElem, _)) => + hasUnsupportedConversion(sElem, dElem) + case (MapType(sKey, sVal, _), MapType(dKey, dVal, _)) => + hasUnsupportedConversion(sKey, dKey) || hasUnsupportedConversion(sVal, dVal) + case _ => false + } + }) + } + + def recursivelyCastExpressions(expr: Expression, srcType: DataType, dstType: DataType): Expression = { + lazy val needTimeZone = Cast.needsTimeZone(srcType, dstType) + (srcType, dstType) match { + case (FloatType, DoubleType) => + val toStr = Cast(expr, StringType, if (needTimeZone) timeZoneId else None) + Cast(toStr, dstType, if (needTimeZone) timeZoneId else None) + case (IntegerType | LongType | FloatType | DoubleType, dec: DecimalType) => + val toStr = Cast(expr, StringType, if (needTimeZone) timeZoneId else None) + Cast(toStr, dec, if (needTimeZone) timeZoneId else None) + case (StringType, dec: DecimalType) => + Cast(expr, dec, if (needTimeZone) timeZoneId else None) + case (StringType, DateType) => + Cast(expr, DateType, if (needTimeZone) timeZoneId else None) + case (s: StructType, d: StructType) if hasUnsupportedConversion(s, d) => + val structFields = s.fields.zip(d.fields).zipWithIndex.map { + case ((srcField, dstField), i) => + val child = GetStructField(expr, i, Some(dstField.name)) + recursivelyCastExpressions(child, srcField.dataType, dstField.dataType) + } + CreateNamedStruct(d.fields.zip(structFields).flatMap { + case (f, c) => Seq(Literal(f.name), c) + }) + case (ArrayType(sElementType, containsNull), ArrayType(dElementType, _)) if hasUnsupportedConversion(sElementType, dElementType) => + val lambdaVar = NamedLambdaVariable("element", sElementType, containsNull) + val body = recursivelyCastExpressions(lambdaVar, sElementType, dElementType) + 
val func = LambdaFunction(body, Seq(lambdaVar)) + ArrayTransform(expr, func) + case (MapType(sKeyType, sValType, vnull), MapType(dKeyType, dValType, _)) + if hasUnsupportedConversion(sKeyType, dKeyType) || hasUnsupportedConversion(sValType, dValType) => + val kv = NamedLambdaVariable("kv", new StructType() + .add("key", sKeyType, nullable = false) + .add("value", sValType, nullable = vnull), nullable = false) + val newKey = recursivelyCastExpressions(GetStructField(kv, 0), sKeyType, dKeyType) + val newVal = recursivelyCastExpressions(GetStructField(kv, 1), sValType, dValType) + val entry = CreateStruct(Seq(newKey, newVal)) + val func = LambdaFunction(entry, Seq(kv)) + val transformed = ArrayTransform(MapEntries(expr), func) + MapFromEntries(transformed) + case _ => + // most cases should be covered here we only need to do the recursive work for float to double + Cast(expr, dstType, if (needTimeZone) timeZoneId else None) + } + } + + if (typeChangeInfos.isEmpty) { + GenerateUnsafeProjection.generate(fullSchema, fullSchema) + } else { + // find type changed. 
+ val newSchema = new StructType(requiredSchema.fields.zipWithIndex.map { case (f, i) => + if (typeChangeInfos.containsKey(i)) { + StructField(f.name, typeChangeInfos.get(i).getRight, f.nullable, f.metadata) + } else f + }) + val newFullSchema = newSchema.toAttributes ++ partitionSchema.toAttributes + val castSchema = newFullSchema.zipWithIndex.map { case (attr, i) => + if (typeChangeInfos.containsKey(i)) { + val srcType = typeChangeInfos.get(i).getRight + val dstType = typeChangeInfos.get(i).getLeft + recursivelyCastExpressions(attr, srcType, dstType) + } else attr + } + GenerateUnsafeProjection.generate(castSchema, newFullSchema) + } + } +} diff --git a/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetReadSupport.scala b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetReadSupport.scala new file mode 100644 index 0000000000000..f19a2952a6637 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetReadSupport.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.common.util.{Option => HOption} +import org.apache.hudi.common.util.ValidationUtils +import org.apache.parquet.hadoop.api.InitContext +import org.apache.parquet.hadoop.api.ReadSupport.ReadContext +import org.apache.parquet.schema.{GroupType, MessageType, Type, Types} +import org.apache.spark.sql.catalyst.util.RebaseDateTime.RebaseSpec + +import java.time.ZoneId +import scala.collection.JavaConverters._ + +class HoodieParquetReadSupport(convertTz: Option[ZoneId], + enableVectorizedReader: Boolean, + val enableTimestampFieldRepair: Boolean, + datetimeRebaseSpec: RebaseSpec, + int96RebaseSpec: RebaseSpec, + tableSchemaOpt: HOption[MessageType] = HOption.empty()) + extends ParquetReadSupport(convertTz, enableVectorizedReader, datetimeRebaseSpec, int96RebaseSpec) with SparkAdapterSupport { + + override def init(context: InitContext): ReadContext = { + val readContext = super.init(context) + // Repairing is needed here because this is the schema that is used by the reader to decide what + // conversions are necessary + val requestedParquetSchema = if (enableTimestampFieldRepair) { + HoodieParquetReadSupport.getRepairedSchema(readContext.getRequestedSchema, tableSchemaOpt) + } else { + readContext.getRequestedSchema + } + val trimmedParquetSchema = HoodieParquetReadSupport.trimParquetSchema(requestedParquetSchema, context.getFileSchema) + new ReadContext(trimmedParquetSchema, readContext.getReadSupportMetadata) + } +} + +object HoodieParquetReadSupport { + /** + * Removes any fields from the parquet schema that do not have any child fields in the actual file schema after the + * schema is trimmed down to the requested fields. This can happen when the table schema evolves and only a subset of + * the nested fields are required by the query. 
+ * + * @param requestedSchema the initial parquet schema requested by Spark + * @param fileSchema the actual parquet schema of the file + * @return a potentially updated schema with empty struct fields removed + */ + def trimParquetSchema(requestedSchema: MessageType, fileSchema: MessageType): MessageType = { + val trimmedFields = requestedSchema.getFields.asScala.map(field => { + if (fileSchema.containsField(field.getName)) { + trimParquetType(field, fileSchema.asGroupType().getType(field.getName)) + } else { + Some(field) + } + }).filter(_.isDefined).map(_.get).toArray[Type] + Types.buildMessage().addFields(trimmedFields: _*).named(requestedSchema.getName) + } + + private def trimParquetType(requestedType: Type, fileType: Type): Option[Type] = { + if (requestedType.equals(fileType)) { + Some(requestedType) + } else { + requestedType match { + case groupType: GroupType => + ValidationUtils.checkState(!fileType.isPrimitive, + "Group type provided by requested schema but existing type in the file is a primitive") + val fileTypeGroup = fileType.asGroupType() + var hasMatchingField = false + val fields = groupType.getFields.asScala.map(field => { + if (fileTypeGroup.containsField(field.getName)) { + hasMatchingField = true + trimParquetType(field, fileType.asGroupType().getType(field.getName)) + } else { + Some(field) + } + }).filter(_.isDefined).map(_.get).asJava + if (hasMatchingField && !fields.isEmpty) { + Some(groupType.withNewFields(fields)) + } else { + None + } + case _ => Some(requestedType) + } + } + } + + def getRepairedSchema(fileSchema: MessageType, + tableSchema: org.apache.hudi.common.util.Option[MessageType]): MessageType = { + try { + val schemaRepairClass = Class.forName("org.apache.parquet.schema.SchemaRepair") + val repairMethod = schemaRepairClass.getMethod( + "repairLogicalTypes", classOf[MessageType], classOf[org.apache.hudi.common.util.Option[MessageType]]) + repairMethod.invoke(null, fileSchema, tableSchema).asInstanceOf[MessageType] + } 
catch { + case _: Exception => fileSchema + } + } +} diff --git a/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/SparkBasicSchemaEvolution.scala b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/SparkBasicSchemaEvolution.scala new file mode 100644 index 0000000000000..6a6a094bbac3c --- /dev/null +++ b/hudi-client/hudi-spark-client/src/parquet/scala/org/apache/spark/sql/execution/datasources/parquet/SparkBasicSchemaEvolution.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.spark.sql.types.StructType + +/** + * Intended to be used just with HoodieSparkParquetReader to avoid any java/scala issues + */ +class SparkBasicSchemaEvolution(fileSchema: StructType, + requiredSchema: StructType, + sessionLocalTimeZone: String) { + + val (implicitTypeChangeInfo, sparkRequestSchema) = HoodieParquetFileFormatHelper.buildImplicitSchemaChangeInfo(fileSchema, requiredSchema) + + def getRequestSchema: StructType = { + if (implicitTypeChangeInfo.isEmpty) { + requiredSchema + } else { + sparkRequestSchema + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java index 1a0d5a95f9a0f..856d8a3c9e88e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java @@ -72,7 +72,7 @@ public class TestTableSchemaEvolution extends HoodieClientTestBase { public static final String EXTRA_FIELD_WITHOUT_DEFAULT_SCHEMA = "{\"name\": \"new_field_without_default\", \"type\": \"boolean\"},"; public static final String EXTRA_FIELD_NULLABLE_SCHEMA = - ",{\"name\": \"new_field_without_default\", \"type\": [\"boolean\", \"null\"]}"; + "{\"name\": \"new_field_without_default\", \"type\": [\"null\", \"boolean\"], \"default\": null},"; // TRIP_EXAMPLE_SCHEMA with a new_field added public static final String TRIP_EXAMPLE_SCHEMA_EVOLVED_COL_ADDED = TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA @@ -152,7 +152,7 @@ public void testSchemaCompatibilityBasic() { "Added field without default and not nullable is not compatible (Evolved Schema)"); assertTrue(isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA - + FARE_NESTED_SCHEMA + 
TIP_NESTED_SCHEMA + TRIP_SCHEMA_SUFFIX + EXTRA_FIELD_NULLABLE_SCHEMA, false), + + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + EXTRA_FIELD_NULLABLE_SCHEMA + TRIP_SCHEMA_SUFFIX, false), "Added nullable field is compatible (Evolved Schema)"); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index a7808ea938248..b47a763b6b245 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -107,9 +107,16 @@ public static SparkConf getSparkConfForTest(String appName) { sparkConf.set("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"); } - if (canLoadClass("org.apache.spark.sql.hudi.catalog.HoodieCatalog") && HoodieSparkUtils.gteqSpark3_2()) { - sparkConf.set("spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.hudi.catalog.HoodieCatalog"); + try { + if (canLoadClass("org.apache.spark.sql.hudi.catalog.HoodieCatalog") && HoodieSparkUtils.gteqSpark3_2()) { + sparkConf.set("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.hudi.catalog.HoodieCatalog"); + } + } catch (LinkageError e) { + // Handle Scala version compatibility issues (e.g., Scala 2.12 vs 2.13) + // If we can't determine the Spark version, skip setting the catalog + // This can happen when Scala code compiled with 2.12 references types removed in 2.13 + // LinkageError catches both NoClassDefFoundError and other linkage errors } String evlogDir = System.getProperty("SPARK_EVLOG_DIR"); @@ -342,6 +349,12 @@ private static boolean canLoadClass(String className) { return ReflectionUtils.getClass(className) != null; } catch (Exception e) { return false; + } catch (NoClassDefFoundError e) { + // Handle cases where class exists but dependencies are missing (e.g., Scala 
2.12 vs 2.13 compatibility) + return false; + } catch (Throwable e) { + // Catch any other errors (LinkageError, etc.) that might occur during class loading + return false; } } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java index 511613d904438..9546cb5349530 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java @@ -112,8 +112,15 @@ public static Map getSparkSqlConf() { Map sqlConf = new HashMap<>(); sqlConf.put("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"); - if (HoodieSparkUtils.gteqSpark3_2()) { - sqlConf.put("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog"); + try { + if (HoodieSparkUtils.gteqSpark3_2()) { + sqlConf.put("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog"); + } + } catch (LinkageError e) { + // Handle Scala version compatibility issues (e.g., Scala 2.12 vs 2.13) + // If we can't determine the Spark version, skip setting the catalog + // This can happen when Scala code compiled with 2.12 references types removed in 2.13 + // LinkageError catches both NoClassDefFoundError and other linkage errors } return sqlConf; diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 1d5cda0f75e82..9014d1a056325 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -54,6 +54,41 @@ false + + org.codehaus.mojo + build-helper-maven-plugin + 3.5.0 + + + add-spark34plus-avro-sources + generate-sources + + add-source + + + ${spark33orEarlier} + + src/avro/java + src/parquet/java + + + + + add-spark34plus-avro-test-sources + generate-test-sources + + add-test-source + + + ${spark33orEarlier} + + 
src/avro/test/java + src/parquet/test/java + + + + + org.apache.rat apache-rat-plugin diff --git a/hudi-common/src/avro/java/org/apache/parquet/avro/AvroSchemaConverterWithTimestampNTZ.java b/hudi-common/src/avro/java/org/apache/parquet/avro/AvroSchemaConverterWithTimestampNTZ.java new file mode 100644 index 0000000000000..c1bda6ef0ffb7 --- /dev/null +++ b/hudi-common/src/avro/java/org/apache/parquet/avro/AvroSchemaConverterWithTimestampNTZ.java @@ -0,0 +1,598 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.avro; + +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.schema.ConversionPatterns; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.avro.JsonProperties.NULL_VALUE; +import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED; +import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED_DEFAULT; +import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE; +import static org.apache.parquet.avro.AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE_DEFAULT; +import static org.apache.parquet.avro.AvroWriteSupport.WRITE_PARQUET_UUID; +import static org.apache.parquet.avro.AvroWriteSupport.WRITE_PARQUET_UUID_DEFAULT; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static 
org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Type.Repetition.REPEATED; + +/** + *

+ * Converts an Avro schema into a Parquet schema, or vice versa. See package + * documentation for details of the mapping. + *

+ * This was taken from parquet-java 1.13.1 AvroSchemaConverter and modified + * to support local timestamp types by copying a few methods from 1.14.0 AvroSchemaConverter. + */ +@SuppressWarnings("all") +public class AvroSchemaConverterWithTimestampNTZ extends HoodieAvroParquetSchemaConverter { + + public static final String ADD_LIST_ELEMENT_RECORDS = + "parquet.avro.add-list-element-records"; + private static final boolean ADD_LIST_ELEMENT_RECORDS_DEFAULT = true; + + private final boolean assumeRepeatedIsListElement; + private final boolean writeOldListStructure; + private final boolean writeParquetUUID; + private final boolean readInt96AsFixed; + private final Set pathsToInt96; + + public AvroSchemaConverterWithTimestampNTZ() { + this(ADD_LIST_ELEMENT_RECORDS_DEFAULT); + } + + /** + * Constructor used by {@link AvroRecordConverter#isElementType}, which always + * uses the 2-level list conversion. + * + * @param assumeRepeatedIsListElement whether to assume 2-level lists + */ + AvroSchemaConverterWithTimestampNTZ(boolean assumeRepeatedIsListElement) { + this.assumeRepeatedIsListElement = assumeRepeatedIsListElement; + this.writeOldListStructure = WRITE_OLD_LIST_STRUCTURE_DEFAULT; + this.writeParquetUUID = WRITE_PARQUET_UUID_DEFAULT; + this.readInt96AsFixed = READ_INT96_AS_FIXED_DEFAULT; + this.pathsToInt96 = Collections.emptySet(); + } + + public AvroSchemaConverterWithTimestampNTZ(Configuration conf) { + this.assumeRepeatedIsListElement = conf.getBoolean( + ADD_LIST_ELEMENT_RECORDS, ADD_LIST_ELEMENT_RECORDS_DEFAULT); + this.writeOldListStructure = conf.getBoolean( + WRITE_OLD_LIST_STRUCTURE, WRITE_OLD_LIST_STRUCTURE_DEFAULT); + this.writeParquetUUID = conf.getBoolean(WRITE_PARQUET_UUID, WRITE_PARQUET_UUID_DEFAULT); + this.readInt96AsFixed = conf.getBoolean(READ_INT96_AS_FIXED, READ_INT96_AS_FIXED_DEFAULT); + this.pathsToInt96 = new HashSet<>(Arrays.asList(conf.getStrings("parquet.avro.writeFixedAsInt96", new String[0]))); + } + + /** + * Given a schema, check to 
see if it is a union of a null type and a regular schema, + * and then return the non-null sub-schema. Otherwise, return the given schema. + * + * @param schema The schema to check + * @return The non-null portion of a union schema, or the given schema + */ + public static Schema getNonNull(Schema schema) { + if (schema.getType().equals(Schema.Type.UNION)) { + List schemas = schema.getTypes(); + if (schemas.size() == 2) { + if (schemas.get(0).getType().equals(Schema.Type.NULL)) { + return schemas.get(1); + } else if (schemas.get(1).getType().equals(Schema.Type.NULL)) { + return schemas.get(0); + } else { + return schema; + } + } else { + return schema; + } + } else { + return schema; + } + } + + @Override + public MessageType convert(Schema avroSchema) { + if (!avroSchema.getType().equals(Schema.Type.RECORD)) { + throw new IllegalArgumentException("Avro schema must be a record."); + } + return new MessageType(avroSchema.getFullName(), convertFields(avroSchema.getFields(), "")); + } + + private List convertFields(List fields, String schemaPath) { + List types = new ArrayList(); + for (Schema.Field field : fields) { + if (field.schema().getType().equals(Schema.Type.NULL)) { + continue; // Avro nulls are not encoded, unless they are null unions + } + types.add(convertField(field, appendPath(schemaPath, field.name()))); + } + return types; + } + + private Type convertField(String fieldName, Schema schema, String schemaPath) { + return convertField(fieldName, schema, Type.Repetition.REQUIRED, schemaPath); + } + + @SuppressWarnings("deprecation") + private Type convertField(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) { + Types.PrimitiveBuilder builder; + Schema.Type type = schema.getType(); + LogicalType logicalType = schema.getLogicalType(); + if (type.equals(Schema.Type.BOOLEAN)) { + builder = Types.primitive(BOOLEAN, repetition); + } else if (type.equals(Schema.Type.INT)) { + builder = Types.primitive(INT32, repetition); + } else if 
(type.equals(Schema.Type.LONG)) { + builder = Types.primitive(INT64, repetition); + } else if (type.equals(Schema.Type.FLOAT)) { + builder = Types.primitive(FLOAT, repetition); + } else if (type.equals(Schema.Type.DOUBLE)) { + builder = Types.primitive(DOUBLE, repetition); + } else if (type.equals(Schema.Type.BYTES)) { + builder = Types.primitive(BINARY, repetition); + } else if (type.equals(Schema.Type.STRING)) { + if (logicalType != null && logicalType.getName().equals(LogicalTypes.uuid().getName()) && writeParquetUUID) { + builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) + .length(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES); + } else { + builder = Types.primitive(BINARY, repetition).as(stringType()); + } + } else if (type.equals(Schema.Type.RECORD)) { + return new GroupType(repetition, fieldName, convertFields(schema.getFields(), schemaPath)); + } else if (type.equals(Schema.Type.ENUM)) { + builder = Types.primitive(BINARY, repetition).as(enumType()); + } else if (type.equals(Schema.Type.ARRAY)) { + if (writeOldListStructure) { + return ConversionPatterns.listType(repetition, fieldName, + convertField("array", schema.getElementType(), REPEATED, schemaPath)); + } else { + return ConversionPatterns.listOfElements(repetition, fieldName, + convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType(), schemaPath)); + } + } else if (type.equals(Schema.Type.MAP)) { + Type valType = convertField("value", schema.getValueType(), schemaPath); + // avro map key type is always string + return ConversionPatterns.stringKeyMapType(repetition, fieldName, valType); + } else if (type.equals(Schema.Type.FIXED)) { + if (pathsToInt96.contains(schemaPath)) { + if (schema.getFixedSize() != 12) { + throw new IllegalArgumentException( + "The size of the fixed type field " + schemaPath + " must be 12 bytes for INT96 conversion"); + } + builder = Types.primitive(PrimitiveTypeName.INT96, repetition); + } else { + builder = 
Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize()); + } + } else if (type.equals(Schema.Type.UNION)) { + return convertUnion(fieldName, schema, repetition, schemaPath); + } else { + throw new UnsupportedOperationException("Cannot convert Avro type " + type); + } + + // schema translation can only be done for known logical types because this + // creates an equivalence + if (logicalType != null) { + if (logicalType instanceof LogicalTypes.Decimal) { + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + builder = builder.as(decimalType(decimal.getScale(), decimal.getPrecision())); + } else { + LogicalTypeAnnotation annotation = convertLogicalType(logicalType); + if (annotation != null) { + builder.as(annotation); + } + } + } + + return builder.named(fieldName); + } + + private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) { + List nonNullSchemas = new ArrayList(schema.getTypes().size()); + // Found any schemas in the union? Required for the edge case, where the union contains only a single type. + boolean foundNullSchema = false; + for (Schema childSchema : schema.getTypes()) { + if (childSchema.getType().equals(Schema.Type.NULL)) { + foundNullSchema = true; + if (Type.Repetition.REQUIRED == repetition) { + repetition = Type.Repetition.OPTIONAL; + } + } else { + nonNullSchemas.add(childSchema); + } + } + // If we only get a null and one other type then it's a simple optional field + // otherwise construct a union container + switch (nonNullSchemas.size()) { + case 0: + throw new UnsupportedOperationException("Cannot convert Avro union of only nulls"); + + case 1: + return foundNullSchema ? 
convertField(fieldName, nonNullSchemas.get(0), repetition, schemaPath) : + convertUnionToGroupType(fieldName, repetition, nonNullSchemas, schemaPath); + + default: // complex union type + return convertUnionToGroupType(fieldName, repetition, nonNullSchemas, schemaPath); + } + } + + private Type convertUnionToGroupType(String fieldName, Type.Repetition repetition, List nonNullSchemas, + String schemaPath) { + List unionTypes = new ArrayList(nonNullSchemas.size()); + int index = 0; + for (Schema childSchema : nonNullSchemas) { + unionTypes.add( convertField("member" + index++, childSchema, Type.Repetition.OPTIONAL, schemaPath)); + } + return new GroupType(repetition, fieldName, unionTypes); + } + + private Type convertField(Schema.Field field, String schemaPath) { + return convertField(field.name(), field.schema(), schemaPath); + } + + @Override + public Schema convert(MessageType parquetSchema) { + return convertFields(parquetSchema.getName(), parquetSchema.getFields(), new HashMap<>()); + } + + Schema convert(GroupType parquetSchema) { + return convertFields(parquetSchema.getName(), parquetSchema.getFields(), new HashMap<>()); + } + + private Schema convertFields(String name, List parquetFields, Map names) { + List fields = new ArrayList(); + Integer nameCount = names.merge(name, 1, (oldValue, value) -> oldValue + 1); + for (Type parquetType : parquetFields) { + Schema fieldSchema = convertField(parquetType, names); + if (parquetType.isRepetition(REPEATED)) { + throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType); + } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) { + fields.add(new Schema.Field( + parquetType.getName(), optional(fieldSchema), null, NULL_VALUE)); + } else { // REQUIRED + fields.add(new Schema.Field( + parquetType.getName(), fieldSchema, null, (Object) null)); + } + } + Schema schema = Schema.createRecord(name, null, nameCount > 1 ? 
name + nameCount : null, false); + schema.setFields(fields); + return schema; + } + + private Schema convertField(final Type parquetType, Map names) { + if (parquetType.isPrimitive()) { + final PrimitiveType asPrimitive = parquetType.asPrimitiveType(); + final PrimitiveTypeName parquetPrimitiveTypeName = + asPrimitive.getPrimitiveTypeName(); + final LogicalTypeAnnotation annotation = parquetType.getLogicalTypeAnnotation(); + Schema schema = parquetPrimitiveTypeName.convert( + new PrimitiveType.PrimitiveTypeNameConverter() { + @Override + public Schema convertBOOLEAN(PrimitiveTypeName primitiveTypeName) { + return Schema.create(Schema.Type.BOOLEAN); + } + @Override + public Schema convertINT32(PrimitiveTypeName primitiveTypeName) { + return Schema.create(Schema.Type.INT); + } + @Override + public Schema convertINT64(PrimitiveTypeName primitiveTypeName) { + return Schema.create(Schema.Type.LONG); + } + @Override + public Schema convertINT96(PrimitiveTypeName primitiveTypeName) { + if (readInt96AsFixed) { + return Schema.createFixed("INT96", "INT96 represented as byte[12]", null, 12); + } + throw new IllegalArgumentException( + "INT96 is deprecated. 
As interim enable READ_INT96_AS_FIXED flag to read as byte array."); + } + @Override + public Schema convertFLOAT(PrimitiveTypeName primitiveTypeName) { + return Schema.create(Schema.Type.FLOAT); + } + @Override + public Schema convertDOUBLE(PrimitiveTypeName primitiveTypeName) { + return Schema.create(Schema.Type.DOUBLE); + } + @Override + public Schema convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) { + if (annotation instanceof LogicalTypeAnnotation.UUIDLogicalTypeAnnotation) { + return Schema.create(Schema.Type.STRING); + } else { + int size = parquetType.asPrimitiveType().getTypeLength(); + return Schema.createFixed(parquetType.getName(), null, null, size); + } + } + @Override + public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) { + if (annotation instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation || + annotation instanceof LogicalTypeAnnotation.EnumLogicalTypeAnnotation) { + return Schema.create(Schema.Type.STRING); + } else { + return Schema.create(Schema.Type.BYTES); + } + } + }); + + LogicalType logicalType = convertLogicalType(annotation); + if (logicalType != null && (!(annotation instanceof LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) || + parquetPrimitiveTypeName == BINARY || + parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) { + schema = logicalType.addToSchema(schema); + } + + return schema; + + } else { + GroupType parquetGroupType = parquetType.asGroupType(); + LogicalTypeAnnotation logicalTypeAnnotation = parquetGroupType.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation != null) { + return logicalTypeAnnotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public java.util.Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + if (parquetGroupType.getFieldCount()!= 1) { + throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); + } + Type repeatedType = parquetGroupType.getType(0); + if 
(!repeatedType.isRepetition(REPEATED)) { + throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); + } + if (isElementType(repeatedType, parquetGroupType.getName())) { + // repeated element types are always required + return java.util.Optional.of(Schema.createArray(convertField(repeatedType, names))); + } else { + Type elementType = repeatedType.asGroupType().getType(0); + if (elementType.isRepetition(Type.Repetition.OPTIONAL)) { + return java.util.Optional.of(Schema.createArray(optional(convertField(elementType, names)))); + } else { + return java.util.Optional.of(Schema.createArray(convertField(elementType, names))); + } + } + } + + @Override + // for backward-compatibility + public java.util.Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return visitMapOrMapKeyValue(); + } + + @Override + public java.util.Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return visitMapOrMapKeyValue(); + } + + private java.util.Optional visitMapOrMapKeyValue() { + if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { + throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); + } + GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); + if (!mapKeyValType.isRepetition(REPEATED) || + mapKeyValType.getFieldCount()!=2) { + throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); + } + Type keyType = mapKeyValType.getType(0); + if (!keyType.isPrimitive() || + !keyType.asPrimitiveType().getPrimitiveTypeName().equals(BINARY) || + !keyType.getLogicalTypeAnnotation().equals(stringType())) { + throw new IllegalArgumentException("Map key type must be binary (UTF8): " + + keyType); + } + Type valueType = mapKeyValType.getType(1); + if (valueType.isRepetition(Type.Repetition.OPTIONAL)) { + return java.util.Optional.of(Schema.createMap(optional(convertField(valueType, names)))); + } else { + 
return java.util.Optional.of(Schema.createMap(convertField(valueType, names))); + } + } + + @Override + public java.util.Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return java.util.Optional.of(Schema.create(Schema.Type.STRING)); + } + }).orElseThrow(() -> new UnsupportedOperationException("Cannot convert Parquet type " + parquetType)); + } else { + // if no original type then it's a record + return convertFields(parquetGroupType.getName(), parquetGroupType.getFields(), names); + } + } + } + + private LogicalTypeAnnotation convertLogicalType(LogicalType logicalType) { + if (logicalType == null) { + return null; + } else if (logicalType instanceof LogicalTypes.Decimal) { + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + return decimalType(decimal.getScale(), decimal.getPrecision()); + } else if (logicalType instanceof LogicalTypes.Date) { + return dateType(); + } else if (logicalType instanceof LogicalTypes.TimeMillis) { + return timeType(true, MILLIS); + } else if (logicalType instanceof LogicalTypes.TimeMicros) { + return timeType(true, MICROS); + } else if (logicalType instanceof LogicalTypes.TimestampMillis) { + return timestampType(true, MILLIS); + } else if (logicalType instanceof LogicalTypes.TimestampMicros) { + return timestampType(true, MICROS); + } else if (logicalType.getName().equals(LogicalTypes.uuid().getName()) && writeParquetUUID) { + return uuidType(); + } + + if (avroVersionSupportsLocalTimestampTypes()) { + if (logicalType instanceof LogicalTypes.LocalTimestampMillis) { + return timestampType(false, MILLIS); + } else if (logicalType instanceof LogicalTypes.LocalTimestampMicros) { + return timestampType(false, MICROS); + } + } + + return null; + } + + private LogicalType convertLogicalType(LogicalTypeAnnotation annotation) { + if (annotation == null) { + return null; + } + return annotation + .accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public 
java.util.Optional visit( + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return java.util.Optional.of( + LogicalTypes.decimal(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + @Override + public java.util.Optional visit( + LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return java.util.Optional.of(LogicalTypes.date()); + } + + @Override + public java.util.Optional visit( + LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + LogicalTypeAnnotation.TimeUnit unit = timeLogicalType.getUnit(); + switch (unit) { + case MILLIS: + return java.util.Optional.of(LogicalTypes.timeMillis()); + case MICROS: + return java.util.Optional.of(LogicalTypes.timeMicros()); + } + return java.util.Optional.empty(); + } + + @Override + public java.util.Optional visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + LogicalTypeAnnotation.TimeUnit unit = timestampLogicalType.getUnit(); + boolean isAdjustedToUTC = timestampLogicalType.isAdjustedToUTC(); + + if (isAdjustedToUTC || !avroVersionSupportsLocalTimestampTypes()) { + switch (unit) { + case MILLIS: + return java.util.Optional.of(LogicalTypes.timestampMillis()); + case MICROS: + return java.util.Optional.of(LogicalTypes.timestampMicros()); + } + return java.util.Optional.empty(); + } else { + switch (unit) { + case MILLIS: + return java.util.Optional.of(LogicalTypes.localTimestampMillis()); + case MICROS: + return java.util.Optional.of(LogicalTypes.localTimestampMicros()); + } + return java.util.Optional.empty(); + } + } + + @Override + public java.util.Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { + return java.util.Optional.of(LogicalTypes.uuid()); + } + }) + .orElse(null); + } + + /** + * Implements the rules for interpreting existing data from the logical type + * spec for the LIST annotation. This is used to produce the expected schema. + *

+ * The AvroArrayConverter will decide whether the repeated type is the array + * element type by testing whether the element schema and repeated type are + * the same. This ensures that the LIST rules are followed when there is no + * schema and that a schema can be provided to override the default behavior. + */ + private boolean isElementType(Type repeatedType, String parentName) { + return ( + // can't be a synthetic layer because it would be invalid + repeatedType.isPrimitive() || + repeatedType.asGroupType().getFieldCount() > 1 || + repeatedType.asGroupType().getType(0).isRepetition(REPEATED) || + // known patterns without the synthetic layer + repeatedType.getName().equals("array") || + repeatedType.getName().equals(parentName + "_tuple") || + // default assumption + assumeRepeatedIsListElement + ); + } + + private static Schema optional(Schema original) { + // null is first in the union because Parquet's default is always null + return Schema.createUnion(Arrays.asList( + Schema.create(Schema.Type.NULL), + original)); + } + + private static String appendPath(String path, String fieldName) { + if (path == null || path.isEmpty()) { + return fieldName; + } + return path + '.' 
+ fieldName; + } + + /* Avro <= 1.9 does not support conversions to LocalTimestamp{Micros, Millis} classes */ + private static boolean avroVersionSupportsLocalTimestampTypes() { + final String avroVersion = getRuntimeAvroVersion(); + + return avroVersion == null + || !(avroVersion.startsWith("1.7.") + || avroVersion.startsWith("1.8.") + || avroVersion.startsWith("1.9.")); + } + + private static String getRuntimeAvroVersion() { + return Schema.Parser.class.getPackage().getImplementationVersion(); + } +} diff --git a/hudi-common/src/avro/java/org/apache/parquet/schema/AvroSchemaRepair.java b/hudi-common/src/avro/java/org/apache/parquet/schema/AvroSchemaRepair.java new file mode 100644 index 0000000000000..75f880ae3f955 --- /dev/null +++ b/hudi-common/src/avro/java/org/apache/parquet/schema/AvroSchemaRepair.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.schema; + +import org.apache.hudi.avro.AvroSchemaCache; +import org.apache.hudi.avro.AvroSchemaUtils; +import org.apache.hudi.avro.HoodieAvroUtils; + +import org.apache.avro.LogicalType; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class AvroSchemaRepair { + public static boolean isLocalTimestampSupported = isLocalTimestampMillisSupported(); + + public static Schema repairLogicalTypes(Schema fileSchema, Schema tableSchema) { + Schema repairedSchema = repairAvroSchema(fileSchema, tableSchema); + if (repairedSchema != fileSchema) { + return AvroSchemaCache.intern(repairedSchema); + } + return fileSchema; + } + + /** + * Performs schema repair on a schema, handling nullable unions. + */ + private static Schema repairAvroSchema(Schema fileSchema, Schema tableSchema) { + // Always resolve nullable schemas first (returns unchanged if not a union) + Schema nonNullFileSchema = AvroSchemaUtils.getNonNullTypeFromUnion(fileSchema); + Schema nonNullTableSchema = AvroSchemaUtils.getNonNullTypeFromUnion(tableSchema); + + // Perform repair on the non-null types + Schema nonNullRepairedSchema = repairAvroSchemaNonNull(nonNullFileSchema, nonNullTableSchema); + + // If nothing changed, return the original schema + if (nonNullRepairedSchema == nonNullFileSchema) { + return fileSchema; + } + + // If the original was a union, wrap the repaired schema back in a nullable union + if (fileSchema.getType() == Schema.Type.UNION) { + return AvroSchemaUtils.createNullableSchema(nonNullRepairedSchema); + } + + return nonNullRepairedSchema; + } + + /** + * Repairs non-nullable schemas (after unions have been resolved). 
+ */ + private static Schema repairAvroSchemaNonNull(Schema fileSchema, Schema tableSchema) { + // If schemas are already equal, nothing to repair + if (fileSchema.equals(tableSchema)) { + return fileSchema; + } + + // If types are different, no repair can be done + if (fileSchema.getType() != tableSchema.getType()) { + return fileSchema; + } + + // Handle record types (nested structs) + if (fileSchema.getType() == Schema.Type.RECORD) { + return repairRecord(fileSchema, tableSchema); + } + + // Handle array types + if (fileSchema.getType() == Schema.Type.ARRAY) { + Schema repairedElementSchema = repairAvroSchema(fileSchema.getElementType(), tableSchema.getElementType()); + // If element didn't change, return original array schema + if (repairedElementSchema == fileSchema.getElementType()) { + return fileSchema; + } + return Schema.createArray(repairedElementSchema); + } + + // Handle map types + if (fileSchema.getType() == Schema.Type.MAP) { + Schema repairedValueSchema = repairAvroSchema(fileSchema.getValueType(), tableSchema.getValueType()); + // If value didn't change, return original map schema + if (repairedValueSchema == fileSchema.getValueType()) { + return fileSchema; + } + return Schema.createMap(repairedValueSchema); + } + + // Check primitive if we need to repair + if (needsLogicalTypeRepair(fileSchema, tableSchema)) { + // If we need to repair, return the table schema + return tableSchema; + } + + // Default: return file schema + return fileSchema; + } + + /** + * Quick check if a logical type repair is needed (no allocations). 
+ */ + private static boolean needsLogicalTypeRepair(Schema fileSchema, Schema tableSchema) { + if (fileSchema.getType() != Schema.Type.LONG || tableSchema.getType() != Schema.Type.LONG) { + return false; + } + + LogicalType fileSchemaLogicalType = fileSchema.getLogicalType(); + LogicalType tableSchemaLogicalType = tableSchema.getLogicalType(); + + // if file schema has no logical type, and the table has a local timestamp, then we need to repair + if (fileSchemaLogicalType == null) { + try { + return tableSchemaLogicalType instanceof LogicalTypes.LocalTimestampMillis + || tableSchemaLogicalType instanceof LogicalTypes.LocalTimestampMicros; + } catch (Exception e) { + return false; + } + } + + // if file schema is timestamp-micros, and the table is timestamp-millis, then we need to repair + return fileSchemaLogicalType instanceof LogicalTypes.TimestampMicros + && tableSchemaLogicalType instanceof LogicalTypes.TimestampMillis; + } + + /** + * Performs record repair, returning the original schema if nothing changed. 
+ */ + private static Schema repairRecord(Schema fileSchema, Schema tableSchema) { + List fields = fileSchema.getFields(); + + // First pass: find the first field that changes + int firstChangedIndex = -1; + Schema firstRepairedSchema = null; + + for (int i = 0; i < fields.size(); i++) { + Schema.Field requestedField = fields.get(i); + Schema.Field tableField = tableSchema.getField(requestedField.name()); + if (tableField != null) { + Schema repairedSchema = repairAvroSchema(requestedField.schema(), tableField.schema()); + if (repairedSchema != requestedField.schema()) { + firstChangedIndex = i; + firstRepairedSchema = repairedSchema; + break; + } + } + } + + // If nothing changed, return the original schema + if (firstChangedIndex == -1) { + return fileSchema; + } + + // Second pass: build the new schema with repaired fields + List repairedFields = new ArrayList<>(fields.size()); + + // Copy all fields before the first changed field + for (int i = 0; i < firstChangedIndex; i++) { + Schema.Field field = fields.get(i); + // Must create new Field since they cannot be reused + repairedFields.add(HoodieAvroUtils.createNewSchemaField(field)); + } + + // Add the first changed field (using cached repaired schema) + Schema.Field firstChangedField = fields.get(firstChangedIndex); + repairedFields.add(HoodieAvroUtils.createNewSchemaField( + firstChangedField.name(), + firstRepairedSchema, + firstChangedField.doc(), + firstChangedField.defaultVal() + )); + + // Process remaining fields + for (int i = firstChangedIndex + 1; i < fields.size(); i++) { + Schema.Field requestedField = fields.get(i); + Schema.Field tableField = tableSchema.getField(requestedField.name()); + Schema repairedSchema; + + if (tableField != null) { + repairedSchema = repairAvroSchema(requestedField.schema(), tableField.schema()); + } else { + repairedSchema = requestedField.schema(); + } + + // Must create new Field since they cannot be reused + repairedFields.add(HoodieAvroUtils.createNewSchemaField( + 
requestedField.name(), + repairedSchema, + requestedField.doc(), + requestedField.defaultVal() + )); + } + + return Schema.createRecord( + fileSchema.getName(), + fileSchema.getDoc(), + fileSchema.getNamespace(), + fileSchema.isError(), + repairedFields + ); + } + + public static boolean hasTimestampMillisField(Schema tableSchema) { + switch (tableSchema.getType()) { + case RECORD: + for (Schema.Field field : tableSchema.getFields()) { + if (hasTimestampMillisField(field.schema())) { + return true; + } + } + return false; + + case ARRAY: + return hasTimestampMillisField(tableSchema.getElementType()); + + case MAP: + return hasTimestampMillisField(tableSchema.getValueType()); + + case UNION: + return hasTimestampMillisField(AvroSchemaUtils.getNonNullTypeFromUnion(tableSchema)); + + default: + return tableSchema.getType() == Schema.Type.LONG + && (tableSchema.getLogicalType() instanceof LogicalTypes.TimestampMillis || tableSchema.getLogicalType() instanceof LogicalTypes.LocalTimestampMillis); + } + } + + /** + * Check if LogicalTypes.LocalTimestampMillis is supported in the current Avro version + * + * @return true if LocalTimestampMillis is available, false otherwise + */ + public static boolean isLocalTimestampMillisSupported() { + try { + return Arrays.stream(LogicalTypes.class.getDeclaredClasses()) + .anyMatch(c -> c.getSimpleName().equals("LocalTimestampMillis")); + } catch (Exception e) { + return false; + } + } +} diff --git a/hudi-common/src/avro/test/java/org/apache/parquet/schema/TestAvroSchemaRepair.java b/hudi-common/src/avro/test/java/org/apache/parquet/schema/TestAvroSchemaRepair.java new file mode 100644 index 0000000000000..fb3d7e375b2c4 --- /dev/null +++ b/hudi-common/src/avro/test/java/org/apache/parquet/schema/TestAvroSchemaRepair.java @@ -0,0 +1,984 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.schema; + +import org.apache.hudi.avro.AvroSchemaUtils; + +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests {@link AvroSchemaRepair}. 
+ */ +public class TestAvroSchemaRepair { + + @Test + public void testNoRepairNeededIdenticalSchemas() { + Schema requestedSchema = Schema.create(Schema.Type.LONG); + Schema tableSchema = Schema.create(Schema.Type.LONG); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "When schemas are identical, should return same instance"); + + } + + @Test + public void testNoRepairNeededDifferentPrimitiveTypes() { + Schema requestedSchema = Schema.create(Schema.Type.STRING); + Schema tableSchema = Schema.create(Schema.Type.INT); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "When types differ, should return original schema"); + } + + @Test + public void testRepairLongWithoutLogicalTypeToLocalTimestampMillis() { + Schema requestedSchema = Schema.create(Schema.Type.LONG); + Schema tableSchema = LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with logical type"); + assertEquals(Schema.Type.LONG, result.getType()); + assertEquals(LogicalTypes.localTimestampMillis(), result.getLogicalType()); + } + + @Test + public void testRepairLongWithoutLogicalTypeToLocalTimestampMicros() { + Schema requestedSchema = Schema.create(Schema.Type.LONG); + Schema tableSchema = LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with logical type"); + assertEquals(Schema.Type.LONG, result.getType()); + assertEquals(LogicalTypes.localTimestampMicros(), result.getLogicalType()); + } + + @Test + public void testRepairTimestampMicrosToTimestampMillis() { + Schema 
requestedSchema = LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + Schema tableSchema = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with timestamp-millis"); + assertEquals(Schema.Type.LONG, result.getType()); + assertEquals(LogicalTypes.timestampMillis(), result.getLogicalType()); + } + + @Test + public void testNoRepairNeededTimestampMillisToTimestampMicros() { + // This direction should NOT trigger repair + Schema requestedSchema = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + Schema tableSchema = LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should not repair timestamp-millis to timestamp-micros"); + } + + @Test + public void testNoRepairNeededNonLongTypes() { + Schema requestedSchema = Schema.create(Schema.Type.INT); + Schema tableSchema = LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should not repair non-LONG types"); + } + + @Test + public void testRepairNullableSchemaLongToLocalTimestampMillis() { + Schema requestedSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.LONG) + ); + Schema tableSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new nullable schema with repaired type"); + assertEquals(Schema.Type.UNION, 
result.getType()); + assertEquals(2, result.getTypes().size()); + + Schema nonNullType = AvroSchemaUtils.getNonNullTypeFromUnion(result); + assertEquals(LogicalTypes.localTimestampMillis(), nonNullType.getLogicalType()); + } + + @Test + public void testRepairNullableSchemaTimestampMicrosToMillis() { + Schema requestedSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)) + ); + Schema tableSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new nullable schema"); + assertEquals(Schema.Type.UNION, result.getType()); + + Schema nonNullType = AvroSchemaUtils.getNonNullTypeFromUnion(result); + assertEquals(LogicalTypes.timestampMillis(), nonNullType.getLogicalType()); + } + + @Test + public void testRepairRecordSingleField() { + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new record schema"); + assertEquals(Schema.Type.RECORD, result.getType()); + assertEquals("TestRecord", result.getName()); + assertEquals(1, result.getFields().size()); + + Schema.Field field = result.getField("timestamp"); + assertEquals(LogicalTypes.localTimestampMillis(), field.schema().getLogicalType()); + } + + @Test + public void testRepairRecordMultipleFieldsOnlyOneNeedsRepair() { + Schema requestedSchema = 
SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().longType().noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new record schema"); + assertEquals(3, result.getFields().size()); + + // Verify id field unchanged - should be same schema instance + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + + // Verify timestamp field repaired + assertEquals(LogicalTypes.localTimestampMicros(), result.getField("timestamp").schema().getLogicalType()); + + // Verify name field unchanged - should be same schema instance + assertSame(requestedSchema.getField("name").schema(), result.getField("name").schema()); + } + + @Test + public void testRepairRecordNestedRecord() { + Schema nestedRequestedSchema = SchemaBuilder.record("NestedRecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema nestedTableSchema = SchemaBuilder.record("NestedRecord") + .fields() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("nested").type(nestedRequestedSchema).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("nested").type(nestedTableSchema).noDefault() + .endRecord(); + + Schema result 
= AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new schema for nested record"); + + // Verify id field unchanged - should be same schema instance + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + + // Verify nested record was repaired + Schema nestedResult = result.getField("nested").schema(); + assertEquals(Schema.Type.RECORD, nestedResult.getType()); + assertEquals(LogicalTypes.localTimestampMillis(), + nestedResult.getField("timestamp").schema().getLogicalType()); + } + + @Test + public void testRepairRecordNullableNestedField() { + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type().optional().longType() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type().optional().type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new schema"); + + Schema fieldSchema = result.getField("timestamp").schema(); + assertEquals(Schema.Type.UNION, fieldSchema.getType()); + + Schema nonNullType = AvroSchemaUtils.getNonNullTypeFromUnion(fieldSchema); + assertEquals(LogicalTypes.localTimestampMillis(), nonNullType.getLogicalType()); + } + + @Test + public void testRepairArrayElementNeedsRepair() { + Schema requestedSchema = Schema.createArray(Schema.create(Schema.Type.LONG)); + Schema tableSchema = Schema.createArray( + LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new array schema"); + assertEquals(Schema.Type.ARRAY, result.getType()); + 
assertEquals(LogicalTypes.localTimestampMillis(), result.getElementType().getLogicalType()); + } + + @Test + public void testRepairArrayNoRepairNeeded() { + Schema elementSchema = Schema.create(Schema.Type.STRING); + Schema requestedSchema = Schema.createArray(elementSchema); + Schema tableSchema = Schema.createArray(elementSchema); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should return same array when no repair needed"); + } + + @Test + public void testRepairArrayNullableElements() { + Schema requestedSchema = Schema.createArray( + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.LONG)) + ); + Schema tableSchema = Schema.createArray( + Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG)) + ) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new array schema"); + Schema elementSchema = result.getElementType(); + assertEquals(Schema.Type.UNION, elementSchema.getType()); + + Schema nonNullType = AvroSchemaUtils.getNonNullTypeFromUnion(elementSchema); + assertEquals(LogicalTypes.localTimestampMicros(), nonNullType.getLogicalType()); + } + + @Test + public void testRepairMapValueNeedsRepair() { + Schema requestedSchema = Schema.createMap(Schema.create(Schema.Type.LONG)); + Schema tableSchema = Schema.createMap( + LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new map schema"); + assertEquals(Schema.Type.MAP, result.getType()); + assertEquals(LogicalTypes.localTimestampMillis(), result.getValueType().getLogicalType()); + } + + @Test + public void testRepairMapNoRepairNeeded() { + Schema 
valueSchema = Schema.create(Schema.Type.STRING); + Schema requestedSchema = Schema.createMap(valueSchema); + Schema tableSchema = Schema.createMap(valueSchema); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should return same map when no repair needed"); + } + + @Test + public void testRepairMapNullableValues() { + Schema requestedSchema = Schema.createMap( + Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.LONG)) + ); + Schema tableSchema = Schema.createMap( + Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ) + ); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new map schema"); + Schema valueSchema = result.getValueType(); + assertEquals(Schema.Type.UNION, valueSchema.getType()); + + Schema nonNullType = AvroSchemaUtils.getNonNullTypeFromUnion(valueSchema); + assertEquals(LogicalTypes.localTimestampMillis(), nonNullType.getLogicalType()); + } + + @Test + public void testComplexSchemaMultiLevelNesting() { + // Create a complex schema with nested records, arrays, and maps + Schema innerRecordRequested = SchemaBuilder.record("Inner") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema innerRecordTable = SchemaBuilder.record("Inner") + .fields() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("Outer") + .fields() + .name("id").type().intType().noDefault() + .name("records").type().array().items(innerRecordRequested).noDefault() + .name("mapping").type().map().values(Schema.create(Schema.Type.LONG)).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("Outer") + .fields() + 
.name("id").type().intType().noDefault() + .name("records").type().array().items(innerRecordTable).noDefault() + .name("mapping").type().map().values( + LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG)) + ).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new complex schema"); + + // Verify id field unchanged - should be same schema instance + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + + // Verify array of records was repaired + Schema arrayElementSchema = result.getField("records").schema().getElementType(); + assertEquals(LogicalTypes.localTimestampMillis(), + arrayElementSchema.getField("timestamp").schema().getLogicalType()); + + // Verify map values were repaired + Schema mapValueSchema = result.getField("mapping").schema().getValueType(); + assertEquals(LogicalTypes.localTimestampMicros(), mapValueSchema.getLogicalType()); + } + + @Test + public void testRepairRecordMissingFieldInTableSchema() { + // Requested schema has a field not present in table schema + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("newField").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since newField doesn't exist in table schema + assertSame(requestedSchema, result, "Should return original when field missing in table schema"); + } + + @Test + public void testRepairRecordMultipleFieldsMissingInTableSchema() { + // Requested schema has multiple fields not present in table schema + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + 
.name("id").type().intType().noDefault() + .name("newField1").type().longType().noDefault() + .name("name").type().stringType().noDefault() + .name("newField2").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since new fields don't exist in table schema + assertSame(requestedSchema, result, "Should return original when multiple fields missing in table schema"); + } + + @Test + public void testRepairRecordMixedMissingAndRepairableFields() { + // Requested schema has some fields missing in table, some needing repair, some unchanged + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().longType().noDefault() + .name("newField").type().longType().noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should create new schema with timestamp repaired, but newField preserved from requested + assertNotSame(requestedSchema, result, "Should create new schema"); + assertEquals(4, result.getFields().size()); + + // Verify id field unchanged + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + + // Verify timestamp field repaired + assertEquals(LogicalTypes.localTimestampMillis(), result.getField("timestamp").schema().getLogicalType()); + + 
// Verify newField preserved from requested schema (not in table) + assertSame(requestedSchema.getField("newField").schema(), result.getField("newField").schema()); + + // Verify name field unchanged + assertSame(requestedSchema.getField("name").schema(), result.getField("name").schema()); + } + + @Test + public void testRepairNestedRecordFieldMissingInTableSchema() { + // Requested nested record has a field not present in table's nested record + Schema nestedRequestedSchema = SchemaBuilder.record("NestedRecord") + .fields() + .name("timestamp").type().longType().noDefault() + .name("extraField").type().stringType().noDefault() + .endRecord(); + + Schema nestedTableSchema = SchemaBuilder.record("NestedRecord") + .fields() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("nested").type(nestedRequestedSchema).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("nested").type(nestedTableSchema).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new schema"); + + // Verify id field unchanged + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + + // Verify nested record was repaired but still has extraField + Schema nestedResult = result.getField("nested").schema(); + assertEquals(Schema.Type.RECORD, nestedResult.getType()); + assertEquals(2, nestedResult.getFields().size()); + + // Timestamp should be repaired + assertEquals(LogicalTypes.localTimestampMillis(), + nestedResult.getField("timestamp").schema().getLogicalType()); + + // extraField should be preserved from requested schema + 
assertSame(nestedRequestedSchema.getField("extraField").schema(), + nestedResult.getField("extraField").schema()); + } + + @Test + public void testRepairRecordWholeNestedRecordMissingInTableSchema() { + // Requested schema has a nested record field that doesn't exist in table schema + Schema nestedRequestedSchema = SchemaBuilder.record("NestedRecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("newNested").type(nestedRequestedSchema).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since newNested field doesn't exist in table + assertSame(requestedSchema, result, "Should return original when nested field missing in table schema"); + } + + @Test + public void testRepairRecordPreservesFieldMetadata() { + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .doc("Test documentation") + .fields() + .name("timestamp").doc("Timestamp field").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + assertEquals("TestRecord", result.getName()); + assertEquals("Test documentation", result.getDoc()); + assertEquals("Timestamp field", result.getField("timestamp").doc()); + } + + @Test + public void testEdgeCaseEmptyRecord() { + Schema requestedSchema = SchemaBuilder.record("EmptyRecord").fields().endRecord(); + Schema tableSchema = 
SchemaBuilder.record("EmptyRecord").fields().endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Empty records should return same instance"); + } + + @Test + public void testRepairRecordFirstFieldChanged() { + // Test the optimization path where the first field needs repair + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp1").type().longType().noDefault() + .name("timestamp2").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp1").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("timestamp2").type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + assertEquals(LogicalTypes.localTimestampMillis(), result.getField("timestamp1").schema().getLogicalType()); + assertEquals(LogicalTypes.localTimestampMicros(), result.getField("timestamp2").schema().getLogicalType()); + } + + @Test + public void testRepairRecordLastFieldChanged() { + // Test the optimization path where only the last field needs repair + Schema requestedSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .name("timestamp").type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema result = AvroSchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + 
assertNotSame(requestedSchema, result); + // Verify id and name fields unchanged - should be same schema instances + assertSame(requestedSchema.getField("id").schema(), result.getField("id").schema()); + assertSame(requestedSchema.getField("name").schema(), result.getField("name").schema()); + // Verify timestamp field repaired + assertEquals(LogicalTypes.localTimestampMillis(), result.getField("timestamp").schema().getLogicalType()); + } + + @Test + public void testHasTimestampMillisFieldPrimitiveLongWithTimestampMillis() { + Schema schema = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for LONG with timestamp-millis logical type"); + } + + @Test + public void testHasTimestampMillisFieldPrimitiveLongWithoutLogicalType() { + Schema schema = Schema.create(Schema.Type.LONG); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for LONG without logical type"); + } + + @Test + public void testHasTimestampMillisFieldPrimitiveLongWithTimestampMicros() { + Schema schema = LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for LONG with timestamp-micros logical type"); + } + + @Test + public void testHasTimestampMillisFieldPrimitiveLongWithLocalTimestampMillis() { + Schema schema = LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for LONG with local-timestamp-millis logical type"); + } + + @Test + public void testHasTimestampMillisFieldPrimitiveLongWithLocalTimestampMicros() { + Schema schema = LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for LONG with local-timestamp-micros 
logical type"); + } + + @Test + public void testHasTimestampMillisFieldOtherPrimitiveTypes() { + assertFalse(AvroSchemaRepair.hasTimestampMillisField(Schema.create(Schema.Type.STRING)), + "Should return false for STRING type"); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(Schema.create(Schema.Type.INT)), + "Should return false for INT type"); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(Schema.create(Schema.Type.FLOAT)), + "Should return false for FLOAT type"); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(Schema.create(Schema.Type.DOUBLE)), + "Should return false for DOUBLE type"); + assertFalse(AvroSchemaRepair.hasTimestampMillisField(Schema.create(Schema.Type.BOOLEAN)), + "Should return false for BOOLEAN type"); + } + + @Test + public void testHasTimestampMillisFieldRecordWithTimestampMillis() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for record containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldRecordWithoutTimestampMillis() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for record without timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldRecordEmpty() { + Schema schema = SchemaBuilder.record("EmptyRecord").fields().endRecord(); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return 
false for empty record"); + } + + @Test + public void testHasTimestampMillisFieldNestedRecord() { + Schema innerSchema = SchemaBuilder.record("InnerRecord") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema outerSchema = SchemaBuilder.record("OuterRecord") + .fields() + .name("id").type().intType().noDefault() + .name("inner").type(innerSchema).noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(outerSchema), + "Should return true for nested record containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldDeeplyNestedRecord() { + Schema level3 = SchemaBuilder.record("Level3") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema level2 = SchemaBuilder.record("Level2") + .fields() + .name("data").type(level3).noDefault() + .endRecord(); + + Schema level1 = SchemaBuilder.record("Level1") + .fields() + .name("nested").type(level2).noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(level1), + "Should return true for deeply nested record containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldArrayWithTimestampMillis() { + Schema schema = Schema.createArray( + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for array with timestamp-millis elements"); + } + + @Test + public void testHasTimestampMillisFieldArrayWithoutTimestampMillis() { + Schema schema = Schema.createArray(Schema.create(Schema.Type.STRING)); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for array without timestamp-millis elements"); + } + + @Test + public void 
testHasTimestampMillisFieldArrayOfRecordsWithTimestampMillis() { + Schema elementSchema = SchemaBuilder.record("Element") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema schema = Schema.createArray(elementSchema); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for array of records containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldMapWithTimestampMillis() { + Schema schema = Schema.createMap( + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for map with timestamp-millis values"); + } + + @Test + public void testHasTimestampMillisFieldMapWithoutTimestampMillis() { + Schema schema = Schema.createMap(Schema.create(Schema.Type.STRING)); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for map without timestamp-millis values"); + } + + @Test + public void testHasTimestampMillisFieldMapOfRecordsWithTimestampMillis() { + Schema valueSchema = SchemaBuilder.record("Value") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema schema = Schema.createMap(valueSchema); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for map of records containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldUnionWithTimestampMillis() { + Schema schema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for nullable union with timestamp-millis"); + } + + @Test + public void 
testHasTimestampMillisFieldUnionWithoutTimestampMillis() { + Schema schema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + Schema.create(Schema.Type.LONG) + ); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return false for nullable union without timestamp-millis"); + } + + @Test + public void testHasTimestampMillisFieldUnionWithRecordContainingTimestampMillis() { + Schema recordSchema = SchemaBuilder.record("Record") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema schema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + recordSchema + ); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for nullable union with record containing timestamp-millis"); + } + + @Test + public void testHasTimestampMillisFieldComplexNestedStructure() { + // Create a complex schema with arrays, maps, and nested records + Schema innerRecordSchema = SchemaBuilder.record("InnerRecord") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + Schema complexSchema = SchemaBuilder.record("ComplexRecord") + .fields() + .name("id").type().intType().noDefault() + .name("arrayOfRecords").type().array().items(innerRecordSchema).noDefault() + .name("mapOfStrings").type().map().values().stringType().noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(complexSchema), + "Should return true for complex nested structure containing timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldComplexStructureWithoutTimestampMillis() { + Schema innerRecordSchema = SchemaBuilder.record("InnerRecord") + .fields() + .name("value").type().longType().noDefault() + .endRecord(); + + Schema complexSchema = SchemaBuilder.record("ComplexRecord") + .fields() + 
.name("id").type().intType().noDefault() + .name("arrayOfRecords").type().array().items(innerRecordSchema).noDefault() + .name("mapOfLongs").type().map().values( + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)) + ).noDefault() + .endRecord(); + + assertFalse(AvroSchemaRepair.hasTimestampMillisField(complexSchema), + "Should return false for complex structure without timestamp-millis field"); + } + + @Test + public void testHasTimestampMillisFieldFirstFieldMatches() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true when first field is timestamp-millis"); + } + + @Test + public void testHasTimestampMillisFieldLastFieldMatches() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .name("timestamp").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true when last field is timestamp-millis"); + } + + @Test + public void testHasTimestampMillisFieldMultipleTimestampMillisFields() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("createdAt").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("id").type().intType().noDefault() + .name("updatedAt").type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true when multiple timestamp-millis fields exist"); + } + + @Test + 
public void testHasTimestampMillisFieldNullableFieldWithTimestampMillis() { + Schema schema = SchemaBuilder.record("TestRecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().optional().type( + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ) + .endRecord(); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for nullable field with timestamp-millis"); + } + + @Test + public void testHasTimestampMillisFieldArrayOfNullableTimestampMillis() { + Schema elementSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema schema = Schema.createArray(elementSchema); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for array of nullable timestamp-millis elements"); + } + + @Test + public void testHasTimestampMillisFieldMapOfNullableTimestampMillis() { + Schema valueSchema = Schema.createUnion( + Schema.create(Schema.Type.NULL), + LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)) + ); + + Schema schema = Schema.createMap(valueSchema); + + assertTrue(AvroSchemaRepair.hasTimestampMillisField(schema), + "Should return true for map of nullable timestamp-millis values"); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java index 824a94abab4bd..6808ae1528279 100644 --- a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java +++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java @@ -479,6 +479,11 @@ protected boolean shouldReadAsPartitionedTable() { return (partitionColumns.length > 0 && canParsePartitionValues()) || HoodieTableMetadata.isMetadataTable(basePath.toString()); } + protected PartitionPath convertToPartitionPath(String partitionPath) { + Object[] 
partitionColumnValues = parsePartitionColumnValues(partitionColumns, partitionPath); + return new PartitionPath(partitionPath, partitionColumnValues); + } + private static long fileSliceSize(FileSlice fileSlice) { long logFileSize = fileSlice.getLogFiles().map(HoodieLogFile::getFileSize) .filter(s -> s > 0) diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCache.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCache.java new file mode 100644 index 0000000000000..b679ea3c8d508 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaCache.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import org.apache.avro.Schema; + +/** + * An Avro schema cache implementation for reusing Avro schema instances in JVM/process scope. + * This is a global cache which works for a JVM lifecycle. + * A collection of schema instances is maintained. + * + *

NOTE: Schemas which are used frequently should be cached through this cache. + */ +public class AvroSchemaCache { + + + // Ensure that there is only one variable instance of the same schema within an entire JVM lifetime + private static final LoadingCache SCHEMA_CACHE = Caffeine.newBuilder().weakValues().maximumSize(1024).build(k -> k); + + /** + * Get schema variable from global cache. If not found, put it into the cache and then return it. + * @param schema schema to get + * @return if found, return the existing schema instance, otherwise return the param itself. + */ + public static Schema intern(Schema schema) { + return SCHEMA_CACHE.get(schema); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java index 3c5486c47c742..c01af1b0644a2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java @@ -18,12 +18,17 @@ package org.apache.hudi.avro; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieAvroSchemaException; import org.apache.hudi.exception.SchemaCompatibilityException; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.SchemaCompatibility; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -218,6 +223,25 @@ private static boolean isProjectionOfInternal(Schema sourceSchema, return atomicTypeEqualityPredicate.apply(sourceSchema, targetSchema); } + public static Option findNestedFieldSchema(Schema schema, String fieldName) { + if (StringUtils.isNullOrEmpty(fieldName)) { + return Option.empty(); + } + String[] parts = fieldName.split("\\."); + for (String part : parts) { + Schema.Field foundField = 
getNonNullTypeFromUnion(schema).getField(part); + if (foundField == null) { + throw new HoodieAvroSchemaException(fieldName + " not a field in " + schema); + } + schema = foundField.schema(); + } + return Option.of(getNonNullTypeFromUnion(schema)); + } + + public static Option findNestedFieldType(Schema schema, String fieldName) { + return findNestedFieldSchema(schema, fieldName).map(Schema::getType); + } + /** * Appends provided new fields at the end of the given schema * @@ -251,7 +275,7 @@ public static Schema resolveUnionSchema(Schema schema, String fieldSchemaFullNam List innerTypes = schema.getTypes(); if (innerTypes.size() == 2 && isNullable(schema)) { // this is a basic nullable field so handle it more efficiently - return resolveNullableSchema(schema); + return getNonNullTypeFromUnion(schema); } Schema nonNullType = @@ -285,7 +309,7 @@ public static boolean isNullable(Schema schema) { * Resolves typical Avro's nullable schema definition: {@code Union(Schema.Type.NULL, )}, * decomposing union and returning the target non-null type */ - public static Schema resolveNullableSchema(Schema schema) { + public static Schema getNonNullTypeFromUnion(Schema schema) { if (schema.getType() != Schema.Type.UNION) { return schema; } @@ -373,4 +397,15 @@ public static void checkSchemaCompatible( throw new SchemaCompatibilityException(errorDetails); } } + + public static Schema getRepairedSchema(Schema writerSchema, Schema readerSchema) { + try { + Class avroSchemaRepairClass = Class.forName("org.apache.parquet.schema.AvroSchemaRepair"); + Method repairMethod = avroSchemaRepairClass.getMethod("repairLogicalTypes", Schema.class, Schema.class); + return (Schema) repairMethod.invoke(null, writerSchema, readerSchema); + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + // Fallback if class/method not available + return writerSchema; + } + } } diff --git 
a/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java b/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java index 9d36e214fb852..70a653726f4e8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/ConvertingGenericData.java @@ -25,6 +25,7 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericFixed; +import java.lang.reflect.Constructor; import java.util.Map; /** @@ -42,13 +43,12 @@ public class ConvertingGenericData extends GenericData { private static final TimeConversions.TimeMicrosConversion TIME_MICROS_CONVERSION = new TimeConversions.TimeMicrosConversion(); private static final TimeConversions.TimestampMicrosConversion TIMESTAMP_MICROS_CONVERSION = new TimeConversions.TimestampMicrosConversion(); - // NOTE: Those are not supported in Avro 1.8.2 - // TODO re-enable upon upgrading to 1.10 - // private static final TimeConversions.TimestampMillisConversion TIMESTAMP_MILLIS_CONVERSION = new TimeConversions.TimestampMillisConversion(); - // private static final TimeConversions.TimeMillisConversion TIME_MILLIS_CONVERSION = new TimeConversions.TimeMillisConversion(); - // private static final TimeConversions.LocalTimestampMillisConversion LOCAL_TIMESTAMP_MILLIS_CONVERSION = new TimeConversions.LocalTimestampMillisConversion(); - // private static final TimeConversions.LocalTimestampMicrosConversion LOCAL_TIMESTAMP_MICROS_CONVERSION = new TimeConversions.LocalTimestampMicrosConversion(); - + // NOTE: Those are not supported in Avro 1.8.2 (used by Spark 2) + // Use reflection to conditionally initialize them only if available + private static final Object TIMESTAMP_MILLIS_CONVERSION = createConversionIfAvailable("org.apache.avro.data.TimeConversions$TimestampMillisConversion"); + private static final Object TIME_MILLIS_CONVERSION = 
createConversionIfAvailable("org.apache.avro.data.TimeConversions$TimeMillisConversion"); + private static final Object LOCAL_TIMESTAMP_MILLIS_CONVERSION = createConversionIfAvailable("org.apache.avro.data.TimeConversions$LocalTimestampMillisConversion"); + private static final Object LOCAL_TIMESTAMP_MICROS_CONVERSION = createConversionIfAvailable("org.apache.avro.data.TimeConversions$LocalTimestampMicrosConversion"); public static final GenericData INSTANCE = new ConvertingGenericData(); private ConvertingGenericData() { @@ -57,12 +57,20 @@ private ConvertingGenericData() { addLogicalTypeConversion(DATE_CONVERSION); addLogicalTypeConversion(TIME_MICROS_CONVERSION); addLogicalTypeConversion(TIMESTAMP_MICROS_CONVERSION); - // NOTE: Those are not supported in Avro 1.8.2 - // TODO re-enable upon upgrading to 1.10 - // addLogicalTypeConversion(TIME_MILLIS_CONVERSION); - // addLogicalTypeConversion(TIMESTAMP_MILLIS_CONVERSION); - // addLogicalTypeConversion(LOCAL_TIMESTAMP_MILLIS_CONVERSION); - // addLogicalTypeConversion(LOCAL_TIMESTAMP_MICROS_CONVERSION); + // NOTE: Those are not supported in Avro 1.8.2 (used by Spark 2) + // Only add conversions if they're available + if (TIME_MILLIS_CONVERSION != null) { + addLogicalTypeConversionReflectively(TIME_MILLIS_CONVERSION); + } + if (TIMESTAMP_MILLIS_CONVERSION != null) { + addLogicalTypeConversionReflectively(TIMESTAMP_MILLIS_CONVERSION); + } + if (LOCAL_TIMESTAMP_MILLIS_CONVERSION != null) { + addLogicalTypeConversionReflectively(LOCAL_TIMESTAMP_MILLIS_CONVERSION); + } + if (LOCAL_TIMESTAMP_MICROS_CONVERSION != null) { + addLogicalTypeConversionReflectively(LOCAL_TIMESTAMP_MICROS_CONVERSION); + } } @Override @@ -125,9 +133,31 @@ public boolean validate(Schema schema, Object datum) { return isInteger(datum) || DATE_CONVERSION.getConvertedType().isInstance(datum); case LONG: - return isLong(datum) - || TIME_MICROS_CONVERSION.getConvertedType().isInstance(datum) - || 
TIMESTAMP_MICROS_CONVERSION.getConvertedType().isInstance(datum); + if (isLong(datum)) { + return true; + } + if (TIME_MICROS_CONVERSION.getConvertedType().isInstance(datum) + || TIMESTAMP_MICROS_CONVERSION.getConvertedType().isInstance(datum)) { + return true; + } + // Check optional conversions that may not be available in Avro 1.8.2 + Class convertedType; + if (TIMESTAMP_MILLIS_CONVERSION != null + && (convertedType = getConvertedType(TIMESTAMP_MILLIS_CONVERSION)) != null + && convertedType.isInstance(datum)) { + return true; + } + if (LOCAL_TIMESTAMP_MICROS_CONVERSION != null + && (convertedType = getConvertedType(LOCAL_TIMESTAMP_MICROS_CONVERSION)) != null + && convertedType.isInstance(datum)) { + return true; + } + if (LOCAL_TIMESTAMP_MILLIS_CONVERSION != null + && (convertedType = getConvertedType(LOCAL_TIMESTAMP_MILLIS_CONVERSION)) != null + && convertedType.isInstance(datum)) { + return true; + } + return false; case FLOAT: return isFloat(datum); case DOUBLE: @@ -140,5 +170,43 @@ public boolean validate(Schema schema, Object datum) { return false; } } + + /** + * Creates a conversion instance using reflection if the class is available. + * Returns null if the class doesn't exist (e.g., in Avro 1.8.2). + */ + private static Object createConversionIfAvailable(String className) { + try { + Class clazz = Class.forName(className); + Constructor constructor = clazz.getConstructor(); + return constructor.newInstance(); + } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException + | IllegalAccessException | java.lang.reflect.InvocationTargetException e) { + // Class doesn't exist or can't be instantiated (e.g., Avro 1.8.2) + return null; + } + } + + /** + * Gets the converted type from a conversion object using reflection. 
+ */ + private static Class getConvertedType(Object conversion) { + try { + return (Class) conversion.getClass().getMethod("getConvertedType").invoke(conversion); + } catch (Exception e) { + // Should not happen if conversion is valid, but handle gracefully + return null; + } + } + + /** + * Adds a logical type conversion using unchecked cast to avoid compile-time dependency + * on classes that may not exist in older Avro versions. + */ + private void addLogicalTypeConversionReflectively(Object conversion) { + // Cast to Conversion since we know it's a Conversion if not null + // This avoids compile-time dependency on specific Conversion subclasses + addLogicalTypeConversion((org.apache.avro.Conversion) conversion); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index d04e986487b5e..3e51d826a9ca6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -33,11 +33,13 @@ import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.DateTimeUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieAvroSchemaException; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.SchemaCompatibilityException; @@ -47,10 +49,12 @@ import org.apache.avro.Conversions; import org.apache.avro.Conversions.DecimalConversion; import org.apache.avro.JsonProperties; +import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; 
import org.apache.avro.LogicalTypes.Decimal; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; +import org.apache.avro.Schema.Field.Order; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericDatumReader; @@ -104,10 +108,10 @@ import static org.apache.avro.Schema.Type.UNION; import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; import static org.apache.hudi.avro.AvroSchemaUtils.isNullable; -import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; import static org.apache.hudi.avro.AvroSchemaUtils.resolveUnionSchema; import static org.apache.hudi.common.util.DateTimeUtils.instantToMicros; import static org.apache.hudi.common.util.DateTimeUtils.microsToInstant; +import static org.apache.hudi.avro.AvroSchemaUtils.getNonNullTypeFromUnion; import static org.apache.hudi.common.util.ValidationUtils.checkState; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.tryUpcastDecimal; @@ -653,7 +657,7 @@ public static Schema getNestedFieldSchemaFromRecord(GenericRecord record, String Object val = valueNode.get(part); if (i == parts.length - 1) { - return resolveNullableSchema(valueNode.getSchema().getField(part).schema()); + return getNonNullTypeFromUnion(valueNode.getSchema().getField(part).schema()); } else { if (!(val instanceof GenericRecord)) { throw new HoodieException("Cannot find a record at part value :" + part); @@ -674,13 +678,20 @@ public static Schema getNestedFieldSchemaFromRecord(GenericRecord record, String */ public static Schema getNestedFieldSchemaFromWriteSchema(Schema writeSchema, String fieldName) { String[] parts = fieldName.split("\\."); + Schema currentSchema = writeSchema; int i = 0; for (; i < parts.length; i++) { String part = parts[i]; - Schema schema = writeSchema.getField(part).schema(); + try { + // Resolve nullable/union schema to the actual schema + currentSchema = 
getNonNullTypeFromUnion(currentSchema.getField(part).schema()); - if (i == parts.length - 1) { - return resolveNullableSchema(schema); + if (i == parts.length - 1) { + // Return the schema for the final part + return getNonNullTypeFromUnion(currentSchema); + } + } catch (Exception e) { + throw new HoodieException("Failed to get schema. Not a valid field name: " + fieldName); } } throw new HoodieException("Failed to get schema. Not a valid field name: " + fieldName); @@ -718,7 +729,7 @@ public static Object convertValueForSpecificDataTypes(Schema fieldSchema, return null; } - return convertValueForAvroLogicalTypes(resolveNullableSchema(fieldSchema), fieldValue, consistentLogicalTimestampEnabled); + return convertValueForAvroLogicalTypes(getNonNullTypeFromUnion(fieldSchema), fieldValue, consistentLogicalTimestampEnabled); } /** @@ -968,12 +979,35 @@ private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Sche case NULL: case BOOLEAN: case INT: - case LONG: case FLOAT: case DOUBLE: case BYTES: case STRING: return oldValue; + case LONG: + if (oldSchema.getLogicalType() != newSchema.getLogicalType()) { + if (oldSchema.getLogicalType() == null || newSchema.getLogicalType() == null) { + return oldValue; + } else if (oldSchema.getLogicalType() instanceof LogicalTypes.TimestampMillis) { + if (newSchema.getLogicalType() instanceof LogicalTypes.TimestampMicros) { + return DateTimeUtils.millisToMicros((Long) oldValue); + } + } else if (oldSchema.getLogicalType() instanceof LogicalTypes.TimestampMicros) { + if (newSchema.getLogicalType() instanceof LogicalTypes.TimestampMillis) { + return DateTimeUtils.microsToMillis((Long) oldValue); + } + } else if (isLocalTimestampMillis(oldSchema.getLogicalType())) { + if (isLocalTimestampMicros(newSchema.getLogicalType())) { + return DateTimeUtils.millisToMicros((Long) oldValue); + } + } else if (isLocalTimestampMicros(oldSchema.getLogicalType())) { + if (isLocalTimestampMillis(newSchema.getLogicalType())) { + return 
DateTimeUtils.microsToMillis((Long) oldValue); + } + } + throw new HoodieAvroSchemaException("Long type logical change from " + oldSchema.getLogicalType() + " to " + newSchema.getLogicalType() + " is not supported"); + } + return oldValue; case FIXED: if (oldSchema.getFixedSize() != newSchema.getFixedSize()) { // Check whether this is a [[Decimal]]'s precision change @@ -1271,6 +1305,10 @@ public static boolean gteqAvro1_10() { return VersionUtil.compareVersions(AVRO_VERSION, "1.10") >= 0; } + public static boolean gteqAvro1_12() { + return VersionUtil.compareVersions(AVRO_VERSION, "1.12") >= 0; + } + /** * Wraps a value into Avro type wrapper. * @@ -1364,4 +1402,107 @@ public static Comparable unwrapAvroValueWrapper(Object avroValueWrapper) { } } + /** + * Checks if a logical type is an instance of LocalTimestampMillis using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). + */ + private static boolean isLocalTimestampMillis(LogicalType logicalType) { + if (logicalType == null) { + return false; + } + try { + Class localTimestampMillisClass = Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMillis"); + return localTimestampMillisClass.isInstance(logicalType); + } catch (ClassNotFoundException e) { + // Class doesn't exist (e.g., Avro 1.8.2) + return false; + } + } + + /** + * Checks if a logical type is an instance of LocalTimestampMicros using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). 
+ */ + private static boolean isLocalTimestampMicros(LogicalType logicalType) { + if (logicalType == null) { + return false; + } + try { + Class localTimestampMicrosClass = Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMicros"); + return localTimestampMicrosClass.isInstance(logicalType); + } catch (ClassNotFoundException e) { + // Class doesn't exist (e.g., Avro 1.8.2) + return false; + } + } + + private static Object convertDefaultValueForAvroCompatibility(Object defaultValue) { + if (gteqAvro1_12() && defaultValue instanceof byte[]) { + // For Avro 1.12.0 compatibility, we need to convert the default value in byte array + // to String so that correct JsonNode is used for the default value for validation, + // instead of directly relying on Avro's JacksonUtils.toJsonNode which is called + // by `Schema.Field` constructor + // The logic of getting the String value is copied from JacksonUtils.toJsonNode in Avro 1.11.4 + return new String((byte[]) defaultValue, StandardCharsets.ISO_8859_1); + } + return defaultValue; + } + + /** + * Creates a new Avro Schema.Field from an existing field, with special handling for + * default values to ensure compatibility with Avro 1.12.0 and later versions. + * + * @param field the original Schema.Field to create a new field from + * @return a new Schema.Field with the same properties but properly formatted default value + */ + public static Schema.Field createNewSchemaField(Schema.Field field) { + return createNewSchemaField(field.name(), field.schema(), field.doc(), field.defaultVal()); + } + + /** + * Creates a new Avro Schema.Field with special handling for default values to ensure + * compatibility with Avro 1.12.0 and later versions. + * + *

In Avro 1.12.0+, the validation of default values for bytes fields is stricter. + * When the default value is a byte array, it needs to be converted to a String using + * ISO-8859-1 encoding so that the correct JsonNode type (TextNode) is used for validation, + * rather than BinaryNode which would fail validation. Changes in Avro 1.12.0 that + * lead to this behavior: [AVRO-3876] https://github.com/apache/avro/pull/2529 + * + *

This conversion ensures that schemas with bytes fields having default values + * can be properly constructed without AvroTypeException in Avro 1.12.0+. + * + * @param name the name of the field + * @param schema the schema of the field + * @param doc the documentation for the field (can be null) + * @param defaultValue the default value for the field (can be null) + * @return a new Schema.Field with properly formatted default value for Avro 1.12.0+ compatibility + */ + public static Schema.Field createNewSchemaField(String name, Schema schema, String doc, Object defaultValue) { + return new Schema.Field(name, schema, doc, convertDefaultValueForAvroCompatibility(defaultValue)); + } + + /** + * Creates a new Avro Schema.Field with special handling for default values to ensure + * compatibility with Avro 1.12.0 and later versions. + * + *

In Avro 1.12.0+, the validation of default values for bytes fields is stricter. + * When the default value is a byte array, it needs to be converted to a String using + * ISO-8859-1 encoding so that the correct JsonNode type (TextNode) is used for validation, + * rather than BinaryNode which would fail validation. Changes in Avro 1.12.0 that + * lead to this behavior: [AVRO-3876] https://github.com/apache/avro/pull/2529 + * + *

This conversion ensures that schemas with bytes fields having default values + * can be properly constructed without AvroTypeException in Avro 1.12.0+. + * + * @param name the name of the field + * @param schema the schema of the field + * @param doc the documentation for the field (can be null) + * @param defaultValue the default value for the field (can be null) + * @param order the sort order for this field (can be null, defaults to ascending) + * @return a new Schema.Field with properly formatted default value for Avro 1.12.0+ compatibility + */ + public static Schema.Field createNewSchemaField(String name, Schema schema, String doc, Object defaultValue, Order order) { + return new Schema.Field(name, schema, doc, convertDefaultValueForAvroCompatibility(defaultValue), order); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java index cdf0f15d80deb..cce3016aed859 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java @@ -299,12 +299,27 @@ private static JsonToAvroFieldProcessor generateFixedTypeHandler() { return new JsonToAvroFieldProcessor() { @Override public Pair convert(Object value, String name, Schema schema, boolean shouldSanitize, String invalidCharMask) { + byte[] src; // The ObjectMapper use List to represent FixedType // eg: "decimal_val": [0, 0, 14, -63, -52] will convert to ArrayList - List converval = (List) value; - byte[] src = new byte[converval.size()]; - for (int i = 0; i < converval.size(); i++) { - src[i] = converval.get(i).byteValue(); + if (value instanceof List) { + List converval = (List) value; + src = new byte[converval.size()]; + for (int i = 0; i < converval.size(); i++) { + src[i] = converval.get(i).byteValue(); + } + } else if (value instanceof ByteBuffer) { + // Handle ByteBuffer when reading from existing 
records + ByteBuffer buffer = (ByteBuffer) value; + int start = buffer.position(); + int length = buffer.limit() - start; + src = new byte[length]; + buffer.get(src, 0, length); + buffer.position(start); + } else if (value instanceof byte[]) { + src = (byte[]) value; + } else { + return Pair.of(false, null); } byte[] dst = new byte[schema.getFixedSize()]; System.arraycopy(src, 0, dst, 0, Math.min(schema.getFixedSize(), src.length)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/TimestampKeyGeneratorConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/TimestampKeyGeneratorConfig.java index 7098c076279b0..aa97b10c2c83c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/TimestampKeyGeneratorConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/TimestampKeyGeneratorConfig.java @@ -42,7 +42,8 @@ public class TimestampKeyGeneratorConfig { .withAlternatives(OLD_TIMESTAMP_KEYGEN_CONFIG_PREFIX + "timestamp.type") .markAdvanced() .withDocumentation("Timestamp type of the field, which should be one of the timestamp types " - + "supported: `UNIX_TIMESTAMP`, `DATE_STRING`, `MIXED`, `EPOCHMILLISECONDS`, `SCALAR`."); + + "supported: `UNIX_TIMESTAMP`, `DATE_STRING`, `MIXED`, `EPOCHMILLISECONDS`," + + " `EPOCHMICROSECONDS`, `SCALAR`."); public static final ConfigProperty INPUT_TIME_UNIT = ConfigProperty .key(TIMESTAMP_KEYGEN_CONFIG_PREFIX + "timestamp.scalar.time.unit") diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/ParquetTableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/ParquetTableSchemaResolver.java new file mode 100644 index 0000000000000..ea2bf2fe043a9 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/ParquetTableSchemaResolver.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.table; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.schema.MessageType; + +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; + +public class ParquetTableSchemaResolver extends TableSchemaResolver { + + public ParquetTableSchemaResolver(HoodieTableMetaClient metaClient) { + super(metaClient); + } + + public static MessageType convertAvroSchemaToParquet(Schema schema, Configuration hadoopConf) { + return getAvroSchemaConverter(hadoopConf).convert(schema); + } + + private Schema convertParquetSchemaToAvro(MessageType parquetSchema) { + return getAvroSchemaConverter(metaClient.getHadoopConf()).convert(parquetSchema); + } + + private MessageType convertAvroSchemaToParquet(Schema schema) { + return getAvroSchemaConverter(metaClient.getHadoopConf()).convert(schema); + } + + /** + * Gets full schema (user + metadata) for a hoodie table in Parquet format. + * + * @return Parquet schema for the table + */ + public MessageType getTableParquetSchema() throws Exception { + return convertAvroSchemaToParquet(getTableAvroSchema(true)); + } + + /** + * Gets users data schema for a hoodie table in Parquet format. 
+ * + * @return Parquet schema for the table + */ + public MessageType getTableParquetSchema(boolean includeMetadataField) throws Exception { + return convertAvroSchemaToParquet(getTableAvroSchema(includeMetadataField)); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 02b1ef352515b..e0295080be7f7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -82,7 +82,7 @@ public class TableSchemaResolver { private static final Logger LOG = LoggerFactory.getLogger(TableSchemaResolver.class); - private final HoodieTableMetaClient metaClient; + protected final HoodieTableMetaClient metaClient; /** * Signals whether suite of the meta-fields should have additional field designating diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java index bdcd0ac690fd2..6f04583c0f191 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java @@ -62,6 +62,7 @@ import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; +import static org.apache.hudi.avro.AvroSchemaUtils.getRepairedSchema; import static org.apache.hudi.avro.HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion; import static org.apache.hudi.common.util.ValidationUtils.checkArgument; import static org.apache.hudi.common.util.ValidationUtils.checkState; @@ -142,7 +143,7 @@ protected ClosableIterator> deserializeRecords(byte[] conten checkState(this.readerSchema != null, "Reader's schema has to be non-null"); checkArgument(type != HoodieRecordType.SPARK, 
"Not support read avro to spark record"); // TODO AvroSparkReader need - RecordIterator iterator = RecordIterator.getInstance(this, content); + RecordIterator iterator = RecordIterator.getInstance(this, content, true); return new CloseableMappingIterator<>(iterator, data -> (HoodieRecord) new HoodieAvroIndexedRecord(data)); } @@ -155,7 +156,7 @@ private static class RecordIterator implements ClosableIterator { private int totalRecords = 0; private int readRecords = 0; - private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content) throws IOException { + private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content, boolean enableLogicalTimestampFieldRepair) throws IOException { this.content = content; this.dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(this.content))); @@ -166,16 +167,21 @@ private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content) this.totalRecords = this.dis.readInt(); } - if (recordNeedsRewriteForExtendedAvroTypePromotion(writerSchema, readerSchema)) { - this.reader = new GenericDatumReader<>(writerSchema, writerSchema); + // writer schema could refer to table schema. + // avoid this for MDT for sure. + // and for tables having no logical ts column. + Schema repairedWriterSchema = enableLogicalTimestampFieldRepair + ? 
getRepairedSchema(writerSchema, readerSchema) : writerSchema; + if (recordNeedsRewriteForExtendedAvroTypePromotion(repairedWriterSchema, readerSchema)) { + this.reader = new GenericDatumReader<>(repairedWriterSchema, repairedWriterSchema); this.promotedSchema = Option.of(readerSchema); } else { - this.reader = new GenericDatumReader<>(writerSchema, readerSchema); + this.reader = new GenericDatumReader<>(repairedWriterSchema, readerSchema); } } - public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content) throws IOException { - return new RecordIterator(dataBlock.readerSchema, dataBlock.getSchemaFromHeader(), content); + public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content, boolean enableLogicalTimestampFieldRepair) throws IOException { + return new RecordIterator(dataBlock.readerSchema, dataBlock.getSchemaFromHeader(), content, enableLogicalTimestampFieldRepair); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index c6e524e8dd78a..c5df6b4d8b063 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -638,6 +638,19 @@ public final List getPartitionPaths() { } } + public final List getPartitionNames() { + try { + readLock.lock(); + return fetchAllStoredFileGroups() + .filter(fg -> !isFileGroupReplaced(fg)) + .map(HoodieFileGroup::getPartitionPath) + .distinct() + .collect(Collectors.toList()); + } finally { + readLock.unlock(); + } + } + @Override public final Stream> getPendingLogCompactionOperations() { try { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java index 
9dde7727806c2..99efa89fa0542 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DateTimeUtils.java @@ -52,6 +52,12 @@ public static Instant microsToInstant(long microsFromEpoch) { return Instant.ofEpochSecond(epochSeconds, nanoAdjustment); } + public static Instant nanosToInstant(long nanosFromEpoch) { + long epochSeconds = nanosFromEpoch / (1_000_000_000L); + long nanoAdjustment = nanosFromEpoch % (1_000_000_000L); + return Instant.ofEpochSecond(epochSeconds, nanoAdjustment); + } + /** * Converts provided {@link Instant} to microseconds (from epoch) */ @@ -71,6 +77,45 @@ public static long instantToMicros(Instant instant) { } } + /** + * This is based off instantToMicros above. + * */ + public static long instantToNanos(Instant instant) { + long seconds = instant.getEpochSecond(); + int nanos = instant.getNano(); + + if (seconds < 0 && nanos > 0) { + // Shift seconds by +1, then subtract a full second in nanos + long totalNanos = Math.multiplyExact(seconds + 1, 1_000_000_000L); + long adjustment = nanos - 1_000_000_000L; + return Math.addExact(totalNanos, adjustment); + } else { + long totalNanos = Math.multiplyExact(seconds, 1_000_000_000L); + return Math.addExact(totalNanos, nanos); + } + } + + public static final long MICROS_PER_MILLIS = 1000L; + + /** + * Converts the timestamp to milliseconds since epoch. In Spark timestamp values have microseconds + * precision, so this conversion is lossy. + */ + public static Long microsToMillis(Long micros) { + // When the timestamp is negative i.e before 1970, we need to adjust the milliseconds portion. + // Example - 1965-01-01 10:11:12.123456 is represented as (-157700927876544) in micro precision. + // In millis precision the above needs to be represented as (-157700927877). + return Math.floorDiv(micros, MICROS_PER_MILLIS); + } + + /** + * Converts milliseconds since the epoch to microseconds. 
+ */ + public static Long millisToMicros(Long millis) { + return Math.multiplyExact(millis, MICROS_PER_MILLIS); + } + + /** * Parse input String to a {@link java.time.Instant}. * diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroParquetReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroParquetReaderIterator.java new file mode 100644 index 0000000000000..b4eedac7ce2a7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieAvroParquetReaderIterator.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.apache.hudi.avro.HoodieAvroUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.parquet.hadoop.ParquetReader; + +public class HoodieAvroParquetReaderIterator extends ParquetReaderIterator { + private final Schema promotedSchema; + public HoodieAvroParquetReaderIterator(ParquetReader parquetReader, Schema promotedSchema) { + super(parquetReader); + this.promotedSchema = promotedSchema; + } + + @Override + public IndexedRecord next() { + return HoodieAvroUtils.rewriteRecordWithNewSchema(super.next(), this.promotedSchema); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java index b35f8a1c18ccb..820f564f1cb1f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/JsonUtils.java @@ -44,6 +44,8 @@ public class JsonUtils { MAPPER.setVisibility(PropertyAccessor.IS_GETTER, JsonAutoDetect.Visibility.NONE); MAPPER.setVisibility(PropertyAccessor.SETTER, JsonAutoDetect.Visibility.NONE); MAPPER.setVisibility(PropertyAccessor.CREATOR, JsonAutoDetect.Visibility.NONE); + // NOTE: Registering [[JavaTimeModule]] is required for Jackson >= 2.11 (Spark >= 3.3) + MAPPER.registerModule(new JavaTimeModule()); } public static ObjectMapper getObjectMapper() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index de5572523c1eb..e848b166d0ecf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -35,7 +35,6 @@ import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; -import 
org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetReader; @@ -65,6 +64,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; + /** * Utility functions involving with parquet. */ @@ -228,7 +229,7 @@ public Map readFooter(Configuration configuration, boolean requi @Override public Schema readAvroSchema(Configuration conf, Path parquetFilePath) { MessageType parquetSchema = readSchema(conf, parquetFilePath); - return new AvroSchemaConverter(conf).convert(parquetSchema); + return getAvroSchemaConverter(conf).convert(parquetSchema); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java index 21d91a8a3344f..babe35c6e6028 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ReflectionUtils.java @@ -53,6 +53,8 @@ public static Class getClass(String clazzName) { return Class.forName(c); } catch (ClassNotFoundException e) { throw new HoodieException("Unable to load class", e); + } catch (NoClassDefFoundError e) { + throw new HoodieException("Unable to load class due to missing dependency", e); } }); } @@ -112,6 +114,15 @@ public static boolean hasConstructor(String clazz, Class[] constructorArgType LOG.warn(message, e); } return false; + } catch (HoodieException e) { + // Class cannot be loaded (e.g., ClassNotFoundException or NoClassDefFoundError) + String message = "Unable to load class " + clazz; + if (silenceWarning) { + LOG.debug(message, e); + } else { + LOG.warn(message, e); + } + return false; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java 
b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java new file mode 100644 index 0000000000000..7e9822ba31170 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/exception/HoodieAvroSchemaException.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.exception; + +/** + * Thrown when we detect in Hudi code that a record schema + * violates Avro rules. 
This can happen even when using Spark + * because we use Avro schema internally + */ +public class HoodieAvroSchemaException extends SchemaCompatibilityException { + public HoodieAvroSchemaException(String message) { + super(message); + } + + public HoodieAvroSchemaException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java index bc8b89004d695..95c6504446b95 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Type.java @@ -64,7 +64,12 @@ enum TypeID { TIME(Long.class), TIMESTAMP(Long.class), DECIMAL(BigDecimal.class), - UUID(UUID.class); + UUID(UUID.class), + TIME_MILLIS(Integer.class), + TIMESTAMP_MILLIS(Long.class), + LOCAL_TIMESTAMP_MILLIS(Long.class), + LOCAL_TIMESTAMP_MICROS(Long.class); + private final String name; private final Class classTag; diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java index ed03a7349cb72..86e39959c5178 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/Types.java @@ -383,6 +383,78 @@ public int hashCode() { } } + public static class TimeMillisType extends PrimitiveType { + private static final TimeMillisType INSTANCE = new TimeMillisType(); + + public static TimeMillisType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.TIME_MILLIS; + } + + @Override + public String toString() { + return "time-millis"; + } + } + + public static class TimestampMillisType extends PrimitiveType { + private static final TimestampMillisType INSTANCE = new TimestampMillisType(); + + public static TimestampMillisType get() { + return INSTANCE; + } + + @Override + 
public TypeID typeId() { + return TypeID.TIMESTAMP_MILLIS; + } + + @Override + public String toString() { + return "timestamp-millis"; + } + } + + public static class LocalTimestampMillisType extends PrimitiveType { + private static final LocalTimestampMillisType INSTANCE = new LocalTimestampMillisType(); + + public static LocalTimestampMillisType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.LOCAL_TIMESTAMP_MILLIS; + } + + @Override + public String toString() { + return "local-timestamp-millis"; + } + } + + public static class LocalTimestampMicrosType extends PrimitiveType { + private static final LocalTimestampMicrosType INSTANCE = new LocalTimestampMicrosType(); + + public static LocalTimestampMicrosType get() { + return INSTANCE; + } + + @Override + public TypeID typeId() { + return TypeID.LOCAL_TIMESTAMP_MICROS; + } + + @Override + public String toString() { + return "local-timestamp-micros"; + } + } + /** * UUID primitive type. */ diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java index 786ac538271a2..ac14ea9e5c6e8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/convert/AvroInternalSchemaConverter.java @@ -231,15 +231,18 @@ private static Type visitAvroPrimitiveToBuildInternalType(Schema primitive) { } else if (logical instanceof LogicalTypes.Date) { return Types.DateType.get(); - } else if ( - logical instanceof LogicalTypes.TimeMillis - || logical instanceof LogicalTypes.TimeMicros) { + } else if (logical instanceof LogicalTypes.TimeMillis) { + return Types.TimeMillisType.get(); + } else if (logical instanceof LogicalTypes.TimeMicros) { return Types.TimeType.get(); - - } else if ( - logical instanceof LogicalTypes.TimestampMillis - 
|| logical instanceof LogicalTypes.TimestampMicros) { + } else if (logical instanceof LogicalTypes.TimestampMillis) { + return Types.TimestampMillisType.get(); + } else if (logical instanceof LogicalTypes.TimestampMicros) { return Types.TimestampType.get(); + } else if (isLocalTimestampMillis(logical)) { + return Types.LocalTimestampMillisType.get(); + } else if (isLocalTimestampMicros(logical)) { + return Types.LocalTimestampMicrosType.get(); } else if (LogicalTypes.uuid().getName().equals(name)) { return Types.UUIDType.get(); } @@ -428,9 +431,21 @@ private static Schema visitInternalPrimitiveToBuildAvroPrimitiveType(Type.Primit case TIME: return LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); + case TIME_MILLIS: + return LogicalTypes.timeMillis().addToSchema(Schema.create(Schema.Type.INT)); + case TIMESTAMP: return LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + case TIMESTAMP_MILLIS: + return LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG)); + + case LOCAL_TIMESTAMP_MICROS: + return createLocalTimestampMicrosSchema(); + + case LOCAL_TIMESTAMP_MILLIS: + return createLocalTimestampMillisSchema(); + case STRING: return Schema.create(Schema.Type.STRING); @@ -481,4 +496,68 @@ private static int computeMinBytesForPrecision(int precision) { } return numBytes; } + + /** + * Checks if a logical type is an instance of LocalTimestampMillis using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). 
+ */ + private static boolean isLocalTimestampMillis(LogicalType logicalType) { + if (logicalType == null) { + return false; + } + try { + Class localTimestampMillisClass = Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMillis"); + return localTimestampMillisClass.isInstance(logicalType); + } catch (ClassNotFoundException e) { + // Class doesn't exist (e.g., Avro 1.8.2) + return false; + } + } + + /** + * Checks if a logical type is an instance of LocalTimestampMicros using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). + */ + private static boolean isLocalTimestampMicros(LogicalType logicalType) { + if (logicalType == null) { + return false; + } + try { + Class localTimestampMicrosClass = Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMicros"); + return localTimestampMicrosClass.isInstance(logicalType); + } catch (ClassNotFoundException e) { + // Class doesn't exist (e.g., Avro 1.8.2) + return false; + } + } + + /** + * Creates a LocalTimestampMicros schema using reflection. + * Returns null if the class doesn't exist (e.g., in Avro 1.8.2). + */ + private static Schema createLocalTimestampMicrosSchema() { + try { + java.lang.reflect.Method method = LogicalTypes.class.getMethod("localTimestampMicros"); + LogicalType logicalType = (LogicalType) method.invoke(null); + return logicalType.addToSchema(Schema.create(Schema.Type.LONG)); + } catch (Exception e) { + // Method doesn't exist (e.g., Avro 1.8.2) + throw new UnsupportedOperationException("LocalTimestampMicros is not supported in this Avro version", e); + } + } + + /** + * Creates a LocalTimestampMillis schema using reflection. + * Returns null if the class doesn't exist (e.g., in Avro 1.8.2). 
+ */ + private static Schema createLocalTimestampMillisSchema() { + try { + java.lang.reflect.Method method = LogicalTypes.class.getMethod("localTimestampMillis"); + LogicalType logicalType = (LogicalType) method.invoke(null); + return logicalType.addToSchema(Schema.create(Schema.Type.LONG)); + } catch (Exception e) { + // Method doesn't exist (e.g., Avro 1.8.2) + throw new UnsupportedOperationException("LocalTimestampMillis is not supported in this Avro version", e); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java index f47d7f8da517b..6d009fbe55aac 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/SerDeHelper.java @@ -217,8 +217,16 @@ private static Type parseTypeFromJson(JsonNode jsonNode) { return Types.DateType.get(); case TIME: return Types.TimeType.get(); + case TIME_MILLIS: + return Types.TimeMillisType.get(); case TIMESTAMP: return Types.TimestampType.get(); + case TIMESTAMP_MILLIS: + return Types.TimestampMillisType.get(); + case LOCAL_TIMESTAMP_MICROS: + return Types.LocalTimestampMicrosType.get(); + case LOCAL_TIMESTAMP_MILLIS: + return Types.LocalTimestampMillisType.get(); case STRING: return Types.StringType.get(); case UUID: diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java index 8ed597ed920df..380a7527a42f9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileWriterFactory.java @@ -33,7 +33,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.orc.CompressionKind; -import 
org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.MessageType; @@ -45,6 +44,8 @@ import static org.apache.hudi.io.storage.HoodieHFileConfig.HFILE_COMPARATOR; import static org.apache.hudi.io.storage.HoodieHFileConfig.PREFETCH_ON_OPEN; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; + public class HoodieAvroFileWriterFactory extends HoodieFileWriterFactory { protected HoodieFileWriter newParquetFileWriter( @@ -113,6 +114,6 @@ private HoodieAvroWriteSupport getHoodieAvroWriteSupport(Configuration conf, Sch return (HoodieAvroWriteSupport) ReflectionUtils.loadClass( config.getStringOrDefault(HoodieStorageConfig.HOODIE_AVRO_WRITE_SUPPORT_CLASS), new Class[] {MessageType.class, Schema.class, Option.class, Properties.class}, - new AvroSchemaConverter(conf).convert(schema), schema, filter, config.getProps()); + getAvroSchemaConverter(conf).convert(schema), schema, filter, config.getProps()); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java index ad4d1f16a60ce..275ca3ea738de 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetReader.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.HoodieAvroParquetReaderIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ParquetReaderIterator; import org.apache.hudi.common.util.collection.ClosableIterator; @@ -45,7 +46,9 @@ import java.util.List; import java.util.Set; +import static org.apache.hudi.avro.AvroSchemaUtils.getRepairedSchema; import static 
org.apache.hudi.common.util.TypeUtils.unsafeCast; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; /** * {@link HoodieFileReader} implementation for parquet format. @@ -95,8 +98,8 @@ protected ClosableIterator getIndexedRecordIterator(Schema schema } @Override - protected ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { - return getIndexedRecordIteratorInternal(readerSchema, Option.of(requestedSchema)); + public ClosableIterator getIndexedRecordIterator(Schema readerSchema, Schema requestedSchema) throws IOException { + return getIndexedRecordIteratorInternal(requestedSchema, Option.empty()); } @Override @@ -154,19 +157,30 @@ private static Configuration tryOverrideDefaultConfigs(Configuration conf) { return conf; } - private ClosableIterator getIndexedRecordIteratorInternal(Schema schema, Option requestedSchema) throws IOException { + private ClosableIterator getIndexedRecordIteratorInternal(Schema schema, Option renamedColumns) throws IOException { // NOTE: We have to set both Avro read-schema and projection schema to make // sure that in case the file-schema is not equal to read-schema we'd still // be able to read that file (in case projection is a proper one) - if (!requestedSchema.isPresent()) { + Schema repairedFileSchema = getRepairedSchema(getSchema(), schema); + Option promotedSchema = Option.empty(); + if (!renamedColumns.isPresent() || HoodieAvroUtils.recordNeedsRewriteForExtendedAvroTypePromotion(repairedFileSchema, schema)) { + AvroReadSupport.setAvroReadSchema(conf, repairedFileSchema); + AvroReadSupport.setRequestedProjection(conf, repairedFileSchema); + promotedSchema = Option.of(schema); + } else { AvroReadSupport.setAvroReadSchema(conf, schema); AvroReadSupport.setRequestedProjection(conf, schema); - } else { - AvroReadSupport.setAvroReadSchema(conf, requestedSchema.get()); - AvroReadSupport.setRequestedProjection(conf, 
requestedSchema.get()); } - ParquetReader reader = new HoodieAvroParquetReaderBuilder(path).withConf(conf).build(); - ParquetReaderIterator parquetReaderIterator = new ParquetReaderIterator<>(reader); + ParquetReader reader = + new HoodieAvroParquetReaderBuilder(path) + .withTableSchema(getAvroSchemaConverter(conf).convert(schema)) + .withConf(conf) + .set(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS, conf.get(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS)) + .set(ParquetInputFormat.STRICT_TYPE_CHECKING, conf.get(ParquetInputFormat.STRICT_TYPE_CHECKING)) + .build(); + ParquetReaderIterator parquetReaderIterator = promotedSchema.isPresent() + ? new HoodieAvroParquetReaderIterator(reader, promotedSchema.get()) + : new ParquetReaderIterator<>(reader); readerIterators.add(parquetReaderIterator); return parquetReaderIterator; } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 62b0232583293..c34f0309db5d7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -73,6 +73,7 @@ import org.apache.hudi.util.Lazy; import org.apache.avro.AvroTypeException; +import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -109,7 +110,7 @@ import java.util.stream.Stream; import static java.util.stream.Collectors.toList; -import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.getNonNullTypeFromUnion; import static org.apache.hudi.avro.HoodieAvroUtils.addMetadataFields; import static org.apache.hudi.avro.HoodieAvroUtils.convertValueForSpecificDataTypes; import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema; @@ -215,9 +216,8 @@ class 
ColumnStats { ColumnStats colStats = allColumnStats.computeIfAbsent(field.name(), (ignored) -> new ColumnStats()); GenericRecord genericRecord = (GenericRecord) record; - final Object fieldVal = convertValueForSpecificDataTypes(field.schema(), genericRecord.get(field.name()), false); - final Schema fieldSchema = getNestedFieldSchemaFromWriteSchema(genericRecord.getSchema(), field.name()); + final Schema fieldSchema = getNonNullTypeFromUnion(getNestedFieldSchemaFromWriteSchema(genericRecord.getSchema(), field.name())); colStats.valueCount++; @@ -1083,18 +1083,47 @@ private static List getColumnsToIndex(MetadataRecordsGenerationParams re List targetColumns = recordsGenParams.getTargetColumnsForColumnStatsIndex(); if (!targetColumns.isEmpty()) { - return targetColumns; + // Filter out timestamp-millis columns from the explicitly specified columns + Option writerSchemaOpt = lazyWriterSchemaOpt.get(); + return writerSchemaOpt + .map(writerSchema -> + targetColumns.stream() + .filter(colName -> { + Schema.Field field = writerSchema.getField(colName); + return field != null && !isTimestampMillisField(field.schema()); + }) + .collect(Collectors.toList())) + .orElse(targetColumns); } Option writerSchemaOpt = lazyWriterSchemaOpt.get(); return writerSchemaOpt .map(writerSchema -> writerSchema.getFields().stream() + .filter(field -> !isTimestampMillisField(field.schema())) .map(Schema.Field::name) .collect(Collectors.toList())) .orElse(Collections.emptyList()); } + /** + * Checks if a schema field is of type timestamp-millis (timestamp-millis or local-timestamp-millis). 
+ * + * @param fieldSchema The schema of the field to check + * @return true if the field is of type timestamp-millis, false otherwise + */ + private static boolean isTimestampMillisField(Schema fieldSchema) { + Schema nonNullableSchema = getNonNullTypeFromUnion(fieldSchema); + if (nonNullableSchema.getType() == Schema.Type.LONG) { + LogicalType logicalType = nonNullableSchema.getLogicalType(); + if (logicalType != null) { + String logicalTypeName = logicalType.getName(); + return logicalTypeName.equals("timestamp-millis") || logicalTypeName.equals("local-timestamp-millis"); + } + } + return false; + } + private static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, HoodieTableMetaClient datasetMetaClient, List columnsToIndex) { @@ -1214,7 +1243,7 @@ private static Comparable coerceToComparable(Schema schema, Object val) { switch (schema.getType()) { case UNION: // TODO we need to handle unions in general case as well - return coerceToComparable(resolveNullableSchema(schema), val); + return coerceToComparable(getNonNullTypeFromUnion(schema), val); case FIXED: case BYTES: diff --git a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java index d6179ea1aacd2..61f526fa710d1 100644 --- a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java +++ b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetReaderBuilder.java @@ -18,12 +18,15 @@ package org.apache.parquet.avro; +import org.apache.hudi.common.util.Option; + import org.apache.avro.generic.GenericData; import org.apache.avro.specific.SpecificData; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.api.ReadSupport; import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; /** * Copy from org.apache.parquet.avro.AvroParquetReader.Builder. 
@@ -35,6 +38,7 @@ public class HoodieAvroParquetReaderBuilder extends ParquetReader.Builder private GenericData model = null; private boolean enableCompatibility = true; private boolean isReflect = true; + private Option tableSchema = Option.empty(); @Deprecated public HoodieAvroParquetReaderBuilder(Path path) { @@ -67,6 +71,11 @@ public HoodieAvroParquetReaderBuilder withCompatibility(boolean enableCompati return this; } + public HoodieAvroParquetReaderBuilder withTableSchema(MessageType tableSchema) { + this.tableSchema = Option.of(tableSchema); + return this; + } + @Override protected ReadSupport getReadSupport() { if (isReflect) { @@ -74,6 +83,6 @@ protected ReadSupport getReadSupport() { } else { conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, enableCompatibility); } - return new HoodieAvroReadSupport<>(model); + return new HoodieAvroReadSupport<>(model, tableSchema); } } diff --git a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetSchemaConverter.java b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetSchemaConverter.java new file mode 100644 index 0000000000000..2a748eb247f77 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroParquetSchemaConverter.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.avro; + +import org.apache.hudi.common.util.ReflectionUtils; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Parquet-Java AvroSchemaConverter doesn't support local timestamp types until version 1.14 + * for this reason we use a modified version of the AvroSchemaConverter that adds support for local timestamp types + * Parquet-Java still supports local timestamp types from version 1.11.0, just that the AvroSchemaConverter + * doesn't work. + *

+ * However, for versions < 1.11.0, local timestamp is not supported at all. Therefore, we just use the + * library AvroSchemaConverter in this case. + * + */ +public abstract class HoodieAvroParquetSchemaConverter { + private static final Logger LOG = LoggerFactory.getLogger(HoodieAvroParquetSchemaConverter.class); + public static HoodieAvroParquetSchemaConverter getAvroSchemaConverter(Configuration configuration) { + try { + return (HoodieAvroParquetSchemaConverter) ReflectionUtils.loadClass("org.apache.parquet.avro.AvroSchemaConverterWithTimestampNTZ", + new Class[] {Configuration.class}, configuration); + } catch (Throwable t) { + LOG.debug("Failed to load AvroSchemaConverterWithTimestampNTZ, using NativeAvroSchemaConverter instead", t); + return (HoodieAvroParquetSchemaConverter) ReflectionUtils.loadClass("org.apache.parquet.avro.NativeAvroSchemaConverter", + new Class[] {Configuration.class}, configuration); + } + } + + public abstract MessageType convert(Schema schema); + + public abstract Schema convert(MessageType schema); +} diff --git a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroReadSupport.java b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroReadSupport.java index 326accb66b2c2..b2c90553464fb 100644 --- a/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroReadSupport.java +++ b/hudi-common/src/main/java/org/apache/parquet/avro/HoodieAvroReadSupport.java @@ -18,6 +18,8 @@ package org.apache.parquet.avro; +import org.apache.hudi.common.util.Option; + import org.apache.avro.generic.GenericData; import org.apache.hadoop.conf.Configuration; import org.apache.parquet.schema.GroupType; @@ -25,6 +27,8 @@ import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.Type; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -36,11 +40,15 @@ */ public class HoodieAvroReadSupport extends 
AvroReadSupport { - public HoodieAvroReadSupport(GenericData model) { + private Option tableSchema; + + public HoodieAvroReadSupport(GenericData model, Option tableSchema) { super(model); + this.tableSchema = tableSchema; } public HoodieAvroReadSupport() { + tableSchema = Option.empty(); } @Override @@ -52,7 +60,7 @@ public ReadContext init(Configuration configuration, Map keyValu "false", "support reading avro from non-legacy map/list in parquet file"); } ReadContext readContext = super.init(configuration, keyValueMetaData, fileSchema); - MessageType requestedSchema = readContext.getRequestedSchema(); + MessageType requestedSchema = repairLogicalTypes(readContext.getRequestedSchema(), tableSchema); // support non-legacy map. Convert non-legacy map to legacy map // Because there is no AvroWriteSupport.WRITE_OLD_MAP_STRUCTURE // according to AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE @@ -130,4 +138,16 @@ private List convertLegacyMap(List oldTypes) { } return newTypes; } + + private MessageType repairLogicalTypes(MessageType fileSchema, Option tableSchemaOpt) { + try { + Class repairClass = Class.forName("org.apache.parquet.schema.SchemaRepair"); + Method repairMethod = repairClass.getDeclaredMethod( + "repairLogicalTypes", MessageType.class, Option.class); + MessageType repaired = (MessageType) repairMethod.invoke(null, fileSchema, tableSchemaOpt); + return repaired != null ? 
repaired : fileSchema; + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + return fileSchema; + } + } } diff --git a/hudi-common/src/main/java/org/apache/parquet/avro/NativeAvroSchemaConverter.java b/hudi-common/src/main/java/org/apache/parquet/avro/NativeAvroSchemaConverter.java new file mode 100644 index 0000000000000..509a8afb3ea83 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/parquet/avro/NativeAvroSchemaConverter.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.avro; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.schema.MessageType; + +/** + * uses the native avro schema converter from parquet java + */ +public class NativeAvroSchemaConverter extends HoodieAvroParquetSchemaConverter { + + private final AvroSchemaConverter avroSchemaConverter; + + public NativeAvroSchemaConverter(Configuration configuration) { + this.avroSchemaConverter = new AvroSchemaConverter(configuration); + } + + @Override + public MessageType convert(Schema schema) { + return avroSchemaConverter.convert(schema); + } + + @Override + public Schema convert(MessageType schema) { + return avroSchemaConverter.convert(schema); + } +} diff --git a/hudi-common/src/parquet/java/org/apache/parquet/schema/SchemaRepair.java b/hudi-common/src/parquet/java/org/apache/parquet/schema/SchemaRepair.java new file mode 100644 index 0000000000000..578c42fa4b3a1 --- /dev/null +++ b/hudi-common/src/parquet/java/org/apache/parquet/schema/SchemaRepair.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.schema; + +import org.apache.hudi.common.util.Option; + +import java.util.ArrayList; +import java.util.List; + +public class SchemaRepair { + + public static MessageType repairLogicalTypes(MessageType fileSchema, Option tableSchema) { + if (!tableSchema.isPresent()) { + return fileSchema; + } + return repairLogicalTypes(fileSchema, tableSchema.get()); + } + + static MessageType repairLogicalTypes(MessageType fileSchema, MessageType tableSchema) { + List repairedFields = repairFields(fileSchema.getFields(), tableSchema); + + // If nothing changed, return the original schema + if (repairedFields == null) { + return fileSchema; + } + + return new MessageType(fileSchema.getName(), repairedFields); + } + + /** + * Repairs a list of fields against a table schema (MessageType or GroupType). + * Returns null if no changes were made, otherwise returns the repaired field list. + */ + private static List repairFields(List fileSchemaFields, GroupType tableSchema) { + // First pass: find the first field that changes + int firstChangedIndex = -1; + Type firstRepairedField = null; + + for (int i = 0; i < fileSchemaFields.size(); i++) { + Type requestedField = fileSchemaFields.get(i); + if (tableSchema.containsField(requestedField.getName())) { + Type tableField = tableSchema.getType(requestedField.getName()); + Type repaired = repairField(requestedField, tableField); + if (repaired != requestedField) { + firstChangedIndex = i; + firstRepairedField = repaired; + break; + } + } + } + + // If nothing changed, return null + if (firstChangedIndex == -1) { + return null; + } + + // Second pass: build the new field list with repaired fields + List repairedFields = new ArrayList<>(fileSchemaFields.size()); + + // Copy all fields before the first changed field + for (int i = 0; i < firstChangedIndex; i++) { + repairedFields.add(fileSchemaFields.get(i)); + } + + // Add the first changed field (using cached repaired field) + 
repairedFields.add(firstRepairedField); + + // Process remaining fields + for (int i = firstChangedIndex + 1; i < fileSchemaFields.size(); i++) { + Type fileSchemaField = fileSchemaFields.get(i); + Type repaired = fileSchemaField; + if (tableSchema.containsField(fileSchemaField.getName())) { + Type tableSchemaField = tableSchema.getType(fileSchemaField.getName()); + repaired = repairField(fileSchemaField, tableSchemaField); + } + repairedFields.add(repaired); + } + + return repairedFields; + } + + private static Type repairField(Type fileSchemaFieldType, Type tableSchemaFieldType) { + if (fileSchemaFieldType.isPrimitive() && tableSchemaFieldType.isPrimitive()) { + return repairPrimitiveType(fileSchemaFieldType.asPrimitiveType(), tableSchemaFieldType.asPrimitiveType()); + } else if (!fileSchemaFieldType.isPrimitive() && !tableSchemaFieldType.isPrimitive()) { + // recurse into nested structs + GroupType reqGroup = fileSchemaFieldType.asGroupType(); + GroupType tblGroup = tableSchemaFieldType.asGroupType(); + + // Repair fields directly without creating MessageType intermediaries + List repairedFields = repairFields(reqGroup.getFields(), tblGroup); + + // If nothing changed, return the original field + if (repairedFields == null) { + return fileSchemaFieldType; + } + + return new GroupType( + reqGroup.getRepetition(), + reqGroup.getName(), + reqGroup.getLogicalTypeAnnotation(), + repairedFields + ); + } else { + // fallback: keep requested + return fileSchemaFieldType; + } + } + + private static PrimitiveType repairPrimitiveType(PrimitiveType fileSchemaPrimitiveType, PrimitiveType tableSchemaPrimitiveType) { + // Quick check if repair is needed (no allocations) + if (needsLogicalTypeRepair(fileSchemaPrimitiveType, tableSchemaPrimitiveType)) { + return Types.primitive(tableSchemaPrimitiveType.getPrimitiveTypeName(), fileSchemaPrimitiveType.getRepetition()) + .as(tableSchemaPrimitiveType.getLogicalTypeAnnotation()) + .named(fileSchemaPrimitiveType.getName()); + } + 
return fileSchemaPrimitiveType; + } + + /** + * Quick check if a logical type repair is needed (no allocations). + */ + private static boolean needsLogicalTypeRepair(PrimitiveType fileSchemaPrimitiveType, PrimitiveType tableSchemaPrimitiveType) { + if (fileSchemaPrimitiveType.getPrimitiveTypeName() != PrimitiveType.PrimitiveTypeName.INT64 + || tableSchemaPrimitiveType.getPrimitiveTypeName() != PrimitiveType.PrimitiveTypeName.INT64) { + return false; + } + LogicalTypeAnnotation fileLogicalTypeAnnotation = fileSchemaPrimitiveType.getLogicalTypeAnnotation(); + LogicalTypeAnnotation tableLogicalTypeAnnotation = tableSchemaPrimitiveType.getLogicalTypeAnnotation(); + + // if requested has no logical type, and the table has a local timestamp, then we need to repair + if (fileLogicalTypeAnnotation == null) { + return tableLogicalTypeAnnotation instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation + && !((LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) tableLogicalTypeAnnotation).isAdjustedToUTC(); + } + + // if requested is timestamp-micros and table is timestamp-millis then we need to repair + return fileLogicalTypeAnnotation instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation + && tableLogicalTypeAnnotation instanceof LogicalTypeAnnotation.TimestampLogicalTypeAnnotation + && ((LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) fileLogicalTypeAnnotation).getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS + && ((LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) tableLogicalTypeAnnotation).getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS + && ((LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) fileLogicalTypeAnnotation).isAdjustedToUTC() + && ((LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) tableLogicalTypeAnnotation).isAdjustedToUTC(); + } +} diff --git a/hudi-common/src/parquet/test/org/apache/parquet/avro/TestAvroSchemaConverter.java b/hudi-common/src/parquet/test/org/apache/parquet/avro/TestAvroSchemaConverter.java new 
file mode 100644 index 0000000000000..1787a0b83628c --- /dev/null +++ b/hudi-common/src/parquet/test/org/apache/parquet/avro/TestAvroSchemaConverter.java @@ -0,0 +1,954 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.avro; + +import org.apache.avro.JsonProperties; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.MessageTypeParser; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; + +import static org.apache.avro.Schema.Type.INT; +import static org.apache.avro.Schema.Type.LONG; +import static org.apache.avro.Schema.Type.STRING; +import static org.apache.avro.SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE; +import static org.apache.avro.SchemaCompatibility.checkReaderWriterCompatibility; 
+import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; +import static org.apache.parquet.schema.OriginalType.DATE; +import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MICROS; +import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS; +import static org.apache.parquet.schema.OriginalType.TIME_MICROS; +import static org.apache.parquet.schema.OriginalType.TIME_MILLIS; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +public class TestAvroSchemaConverter { + + private static final Configuration NEW_BEHAVIOR = new Configuration(false); + + @BeforeAll + public static void setupConf() { + NEW_BEHAVIOR.setBoolean("parquet.avro.add-list-element-records", false); + NEW_BEHAVIOR.setBoolean("parquet.avro.write-old-list-structure", false); + } + + public static final String ALL_PARQUET_SCHEMA = "message org.apache.parquet.avro.myrecord {\n" + + " required boolean myboolean;\n" + + " required int32 myint;\n" + + " required int64 mylong;\n" + + " required float myfloat;\n" + + " required double mydouble;\n" + + " required binary mybytes;\n" + + " 
required binary mystring (UTF8);\n" + + " required group mynestedrecord {\n" + + " required int32 mynestedint;\n" + + " }\n" + + " required binary myenum (ENUM);\n" + + " required group myarray (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " optional group myoptionalarray (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " required group myarrayofoptional (LIST) {\n" + + " repeated group list {\n" + + " optional int32 element;\n" + + " }\n" + + " }\n" + + " required group myrecordarray (LIST) {\n" + + " repeated group array {\n" + + " required int32 a;\n" + + " required int32 b;\n" + + " }\n" + + " }\n" + + " required group mymap (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + " required fixed_len_byte_array(1) myfixed;\n" + + "}\n"; + + private void testAvroToParquetConversion(Schema avroSchema, String schemaString) throws Exception { + testAvroToParquetConversion(new Configuration(false), avroSchema, schemaString); + } + + private void testAvroToParquetConversion(Configuration conf, Schema avroSchema, String schemaString) + throws Exception { + HoodieAvroParquetSchemaConverter avroSchemaConverter = getAvroSchemaConverter(conf); + MessageType schema = avroSchemaConverter.convert(avroSchema); + MessageType expectedMT = MessageTypeParser.parseMessageType(schemaString); + assertEquals(expectedMT.toString(), schema.toString()); + } + + private void testParquetToAvroConversion(Schema avroSchema, String schemaString) throws Exception { + testParquetToAvroConversion(new Configuration(false), avroSchema, schemaString); + } + + private void testParquetToAvroConversion(Configuration conf, Schema avroSchema, String schemaString) + throws Exception { + HoodieAvroParquetSchemaConverter avroSchemaConverter = getAvroSchemaConverter(conf); + Schema schema = avroSchemaConverter.convert(MessageTypeParser.parseMessageType(schemaString)); + 
assertEquals(avroSchema.toString(), schema.toString()); + } + + private void testRoundTripConversion(Schema avroSchema, String schemaString) throws Exception { + testRoundTripConversion(new Configuration(), avroSchema, schemaString); + } + + private void testRoundTripConversion(Configuration conf, Schema avroSchema, String schemaString) throws Exception { + HoodieAvroParquetSchemaConverter avroSchemaConverter = getAvroSchemaConverter(conf); + MessageType schema = avroSchemaConverter.convert(avroSchema); + MessageType expectedMT = MessageTypeParser.parseMessageType(schemaString); + assertEquals(expectedMT.toString(), schema.toString()); + Schema convertedAvroSchema = avroSchemaConverter.convert(expectedMT); + assertEquals(avroSchema.toString(), convertedAvroSchema.toString()); + } + + @Test() + public void testTopLevelMustBeARecord() { + assertThrows("expected to throw", IllegalArgumentException.class, () -> getAvroSchemaConverter(new Configuration()).convert(Schema.create(INT))); + } + + @Test + public void testAllTypes() throws Exception { + Schema schema = getSchemaFromResource(TestAvroSchemaConverter.class, "/parquet-java/all.avsc"); + testAvroToParquetConversion( + NEW_BEHAVIOR, + schema, + "message org.apache.parquet.avro.myrecord {\n" + // Avro nulls are not encoded, unless they are null unions + + " required boolean myboolean;\n" + + " required int32 myint;\n" + + " required int64 mylong;\n" + + " required float myfloat;\n" + + " required double mydouble;\n" + + " required binary mybytes;\n" + + " required binary mystring (UTF8);\n" + + " required group mynestedrecord {\n" + + " required int32 mynestedint;\n" + + " }\n" + + " required binary myenum (ENUM);\n" + + " required group myarray (LIST) {\n" + + " repeated group list {\n" + + " required int32 element;\n" + + " }\n" + + " }\n" + + " required group myemptyarray (LIST) {\n" + + " repeated group list {\n" + + " required int32 element;\n" + + " }\n" + + " }\n" + + " optional group myoptionalarray (LIST) 
{\n" + + " repeated group list {\n" + + " required int32 element;\n" + + " }\n" + + " }\n" + + " required group myarrayofoptional (LIST) {\n" + + " repeated group list {\n" + + " optional int32 element;\n" + + " }\n" + + " }\n" + + " required group mymap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + " required group myemptymap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + " required fixed_len_byte_array(1) myfixed;\n" + + "}\n"); + } + + @Test + public void testAllTypesOldListBehavior() throws Exception { + Schema schema = getSchemaFromResource(TestAvroSchemaConverter.class, "/parquet-java/all.avsc"); + testAvroToParquetConversion( + schema, + "message org.apache.parquet.avro.myrecord {\n" + // Avro nulls are not encoded, unless they are null unions + + " required boolean myboolean;\n" + + " required int32 myint;\n" + + " required int64 mylong;\n" + + " required float myfloat;\n" + + " required double mydouble;\n" + + " required binary mybytes;\n" + + " required binary mystring (UTF8);\n" + + " required group mynestedrecord {\n" + + " required int32 mynestedint;\n" + + " }\n" + + " required binary myenum (ENUM);\n" + + " required group myarray (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " required group myemptyarray (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " optional group myoptionalarray (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " required group myarrayofoptional (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " required group mymap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + " required group myemptymap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required 
binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + " required fixed_len_byte_array(1) myfixed;\n" + + "}\n"); + } + + @Test + public void testAllTypesParquetToAvro() throws Exception { + Schema schema = getSchemaFromResource(TestAvroSchemaConverter.class, "/parquet-java/allFromParquetNewBehavior.avsc"); + // Cannot use round-trip assertion because enum is lost + testParquetToAvroConversion(NEW_BEHAVIOR, schema, ALL_PARQUET_SCHEMA); + } + + @Test + public void testAllTypesParquetToAvroOldBehavior() throws Exception { + Schema schema = getSchemaFromResource(TestAvroSchemaConverter.class, "/parquet-java/allFromParquetOldBehavior.avsc"); + // Cannot use round-trip assertion because enum is lost + testParquetToAvroConversion(schema, ALL_PARQUET_SCHEMA); + } + + @Test + public void testParquetMapWithNonStringKeyFails() throws Exception { + MessageType parquetSchema = + MessageTypeParser.parseMessageType("message myrecord {\n" + " required group mymap (MAP) {\n" + + " repeated group map (MAP_KEY_VALUE) {\n" + + " required int32 key;\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + "}\n"); + assertThrows("expected to throw", IllegalArgumentException.class, () -> getAvroSchemaConverter(new Configuration()).convert(parquetSchema)); + } + + @Test + public void testOptionalFields() throws Exception { + Schema schema = Schema.createRecord("record1", null, null, false); + Schema optionalInt = optional(Schema.create(INT)); + schema.setFields( + Collections.singletonList(new Schema.Field("myint", optionalInt, null, JsonProperties.NULL_VALUE))); + testRoundTripConversion(schema, "message record1 {\n" + " optional int32 myint;\n" + "}\n"); + } + + @Test + public void testOptionalMapValue() throws Exception { + Schema schema = Schema.createRecord("record1", null, null, false); + Schema optionalIntMap = Schema.createMap(optional(Schema.create(INT))); + schema.setFields(Arrays.asList(new Schema.Field("myintmap", optionalIntMap, null, null))); + 
testRoundTripConversion( + schema, + "message record1 {\n" + " required group myintmap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required binary key (UTF8);\n" + + " optional int32 value;\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testOptionalArrayElement() throws Exception { + Schema schema = Schema.createRecord("record1", null, null, false); + Schema optionalIntArray = Schema.createArray(optional(Schema.create(INT))); + schema.setFields(Arrays.asList(new Schema.Field("myintarray", optionalIntArray, null, null))); + testRoundTripConversion( + NEW_BEHAVIOR, + schema, + "message record1 {\n" + " required group myintarray (LIST) {\n" + + " repeated group list {\n" + + " optional int32 element;\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testUnionOfTwoTypes() throws Exception { + Schema schema = Schema.createRecord("record2", null, null, false); + Schema multipleTypes = Schema.createUnion( + Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(INT), Schema.create(Schema.Type.FLOAT))); + schema.setFields(Arrays.asList(new Schema.Field("myunion", multipleTypes, null, JsonProperties.NULL_VALUE))); + + // Avro union is modelled using optional data members of the different + // types. 
This does not translate back into an Avro union + testAvroToParquetConversion( + schema, + "message record2 {\n" + " optional group myunion {\n" + + " optional int32 member0;\n" + + " optional float member1;\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayOfOptionalRecords() throws Exception { + Schema innerRecord = Schema.createRecord("element", null, null, false); + Schema optionalString = optional(Schema.create(Schema.Type.STRING)); + innerRecord.setFields(Arrays.asList( + new Schema.Field("s1", optionalString, null, JsonProperties.NULL_VALUE), + new Schema.Field("s2", optionalString, null, JsonProperties.NULL_VALUE))); + Schema schema = Schema.createRecord("HasArray", null, null, false); + schema.setFields( + Arrays.asList(new Schema.Field("myarray", Schema.createArray(optional(innerRecord)), null, null))); + System.err.println("Avro schema: " + schema.toString(true)); + + testRoundTripConversion( + NEW_BEHAVIOR, + schema, + "message HasArray {\n" + " required group myarray (LIST) {\n" + + " repeated group list {\n" + + " optional group element {\n" + + " optional binary s1 (UTF8);\n" + + " optional binary s2 (UTF8);\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testArrayOfOptionalRecordsOldBehavior() throws Exception { + Schema innerRecord = Schema.createRecord("InnerRecord", null, null, false); + Schema optionalString = optional(Schema.create(Schema.Type.STRING)); + innerRecord.setFields(Arrays.asList( + new Schema.Field("s1", optionalString, null, JsonProperties.NULL_VALUE), + new Schema.Field("s2", optionalString, null, JsonProperties.NULL_VALUE))); + Schema schema = Schema.createRecord("HasArray", null, null, false); + schema.setFields( + Arrays.asList(new Schema.Field("myarray", Schema.createArray(optional(innerRecord)), null, null))); + System.err.println("Avro schema: " + schema.toString(true)); + + // Cannot use round-trip assertion because InnerRecord optional is removed + testAvroToParquetConversion( + 
schema, + "message HasArray {\n" + " required group myarray (LIST) {\n" + + " repeated group array {\n" + + " optional binary s1 (UTF8);\n" + + " optional binary s2 (UTF8);\n" + + " }\n" + + " }\n" + + "}\n"); + } + + @Test + public void testOldAvroListOfLists() throws Exception { + Schema listOfLists = optional(Schema.createArray(Schema.createArray(Schema.create(INT)))); + Schema schema = Schema.createRecord("AvroCompatListInList", null, null, false); + schema.setFields( + Arrays.asList(new Schema.Field("listOfLists", listOfLists, null, JsonProperties.NULL_VALUE))); + System.err.println("Avro schema: " + schema.toString(true)); + + testRoundTripConversion( + schema, + "message AvroCompatListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group array (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " }\n" + + "}"); + // Cannot use round-trip assertion because 3-level representation is used + testParquetToAvroConversion( + NEW_BEHAVIOR, + schema, + "message AvroCompatListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group array (LIST) {\n" + + " repeated int32 array;\n" + + " }\n" + + " }\n" + + "}"); + } + + @Test + public void testOldThriftListOfLists() throws Exception { + Schema listOfLists = optional(Schema.createArray(Schema.createArray(Schema.create(INT)))); + Schema schema = Schema.createRecord("ThriftCompatListInList", null, null, false); + schema.setFields( + Arrays.asList(new Schema.Field("listOfLists", listOfLists, null, JsonProperties.NULL_VALUE))); + System.err.println("Avro schema: " + schema.toString(true)); + + // Cannot use round-trip assertion because repeated group names differ + testParquetToAvroConversion( + schema, + "message ThriftCompatListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group listOfLists_tuple (LIST) {\n" + + " repeated int32 listOfLists_tuple_tuple;\n" + + " }\n" + + " }\n" + + "}"); + // Cannot use round-trip assertion because 3-level 
representation is used + testParquetToAvroConversion( + NEW_BEHAVIOR, + schema, + "message ThriftCompatListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group listOfLists_tuple (LIST) {\n" + + " repeated int32 listOfLists_tuple_tuple;\n" + + " }\n" + + " }\n" + + "}"); + } + + @Test + public void testUnknownTwoLevelListOfLists() throws Exception { + // This tests the case where we don't detect a 2-level list by the repeated + // group's name, but it must be 2-level because the repeated group doesn't + // contain an optional or repeated element as required for 3-level lists + Schema listOfLists = optional(Schema.createArray(Schema.createArray(Schema.create(INT)))); + Schema schema = Schema.createRecord("UnknownTwoLevelListInList", null, null, false); + schema.setFields( + Arrays.asList(new Schema.Field("listOfLists", listOfLists, null, JsonProperties.NULL_VALUE))); + System.err.println("Avro schema: " + schema.toString(true)); + + // Cannot use round-trip assertion because repeated group names differ + testParquetToAvroConversion( + schema, + "message UnknownTwoLevelListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group mylist (LIST) {\n" + + " repeated int32 innerlist;\n" + + " }\n" + + " }\n" + + "}"); + // Cannot use round-trip assertion because 3-level representation is used + testParquetToAvroConversion( + NEW_BEHAVIOR, + schema, + "message UnknownTwoLevelListInList {\n" + " optional group listOfLists (LIST) {\n" + + " repeated group mylist (LIST) {\n" + + " repeated int32 innerlist;\n" + + " }\n" + + " }\n" + + "}"); + } + + @Test + public void testParquetMapWithoutMapKeyValueAnnotation() throws Exception { + Schema schema = Schema.createRecord("myrecord", null, null, false); + Schema map = Schema.createMap(Schema.create(INT)); + schema.setFields(Collections.singletonList(new Schema.Field("mymap", map, null, null))); + String parquetSchema = "message myrecord {\n" + " required group mymap (MAP) {\n" + + " 
repeated group map {\n" + + " required binary key (UTF8);\n" + + " required int32 value;\n" + + " }\n" + + " }\n" + + "}\n"; + + testParquetToAvroConversion(schema, parquetSchema); + testParquetToAvroConversion(NEW_BEHAVIOR, schema, parquetSchema); + } + + @Test + public void testDecimalBytesType() throws Exception { + Schema schema = Schema.createRecord("myrecord", null, null, false); + Schema decimal = LogicalTypes.decimal(9, 2).addToSchema(Schema.create(Schema.Type.BYTES)); + schema.setFields(Collections.singletonList(new Schema.Field("dec", decimal, null, null))); + + testRoundTripConversion(schema, "message myrecord {\n" + " required binary dec (DECIMAL(9,2));\n" + "}\n"); + } + + @Test + public void testDecimalFixedType() throws Exception { + Schema schema = Schema.createRecord("myrecord", null, null, false); + Schema decimal = LogicalTypes.decimal(9, 2).addToSchema(Schema.createFixed("dec", null, null, 8)); + schema.setFields(Collections.singletonList(new Schema.Field("dec", decimal, null, null))); + + testRoundTripConversion( + schema, "message myrecord {\n" + " required fixed_len_byte_array(8) dec (DECIMAL(9,2));\n" + "}\n"); + } + + @Test + public void testDecimalIntegerType() throws Exception { + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("dec", Schema.create(INT), null, null))); + + // the decimal portion is lost because it isn't valid in Avro + testParquetToAvroConversion( + expected, "message myrecord {\n" + " required int32 dec (DECIMAL(9,2));\n" + "}\n"); + } + + @Test + public void testDecimalLongType() throws Exception { + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("dec", Schema.create(LONG), null, null))); + + // the decimal portion is lost because it isn't valid in Avro + testParquetToAvroConversion( + expected, "message myrecord {\n" + " required int64 dec (DECIMAL(9,2));\n" + "}\n"); + } + + @Test + public void 
testParquetInt96AsFixed12AvroType() throws Exception { + Configuration enableInt96ReadingConfig = new Configuration(); + enableInt96ReadingConfig.setBoolean(AvroReadSupport.READ_INT96_AS_FIXED, true); + + Schema schema = Schema.createRecord("myrecord", null, null, false); + Schema int96schema = Schema.createFixed("INT96", "INT96 represented as byte[12]", null, 12); + schema.setFields(Collections.singletonList( + new Schema.Field("int96_field", int96schema, null, null))); + + testParquetToAvroConversion(enableInt96ReadingConfig, schema, "message myrecord {\n" + + " required int96 int96_field;\n" + + "}\n"); + } + + @Test + public void testParquetInt96DefaultFail() throws Exception { + Schema schema = Schema.createRecord("myrecord", null, null, false); + + MessageType parquetSchemaWithInt96 = + MessageTypeParser.parseMessageType("message myrecord {\n required int96 int96_field;\n}\n"); + + assertThrows( + "INT96 is deprecated. As interim enable READ_INT96_AS_FIXED flag to read as byte array.", + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(parquetSchemaWithInt96)); + } + + @Test + public void testDateType() throws Exception { + Schema date = LogicalTypes.date().addToSchema(Schema.create(INT)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("date", date, null, null))); + + testRoundTripConversion(expected, "message myrecord {\n" + " required int32 date (DATE);\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT64, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", DATE); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", DATE); + } + + assertThrows( + "Should not allow TIME_MICROS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new 
Configuration()).convert(message(type))); + } + } + + @Test + public void testTimeMillisType() throws Exception { + Schema date = LogicalTypes.timeMillis().addToSchema(Schema.create(INT)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("time", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int32 time (TIME(MILLIS,true));\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT64, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIME_MILLIS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIME_MILLIS); + } + + assertThrows( + "Should not allow TIME_MICROS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + } + + @Test + public void testTimeMicrosType() throws Exception { + Schema date = LogicalTypes.timeMicros().addToSchema(Schema.create(LONG)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("time", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int64 time (TIME(MICROS,true));\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIME_MICROS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIME_MICROS); + } + + assertThrows( + "Should not allow TIME_MICROS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + } + + @Test + public void 
testTimestampMillisType() throws Exception { + Schema date = LogicalTypes.timestampMillis().addToSchema(Schema.create(LONG)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MILLIS,true));\n" + "}\n"); + + final Schema converted = getAvroSchemaConverter(new Configuration()) + .convert(Types.buildMessage() + .addField(Types.primitive(INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType( + false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .length(1) + .named("timestamp_type")) + .named("TestAvro")); + assertEquals( + "local-timestamp-millis", + converted + .getField("timestamp_type") + .schema() + .getLogicalType() + .getName()); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MILLIS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MILLIS); + } + + assertThrows( + "Should not allow TIMESTAMP_MILLIS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + } + + @Test + public void testLocalTimestampMillisType() throws Exception { + Schema date = LogicalTypes.localTimestampMillis().addToSchema(Schema.create(LONG)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MILLIS,false));\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final 
PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MILLIS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MILLIS); + } + + assertThrows( + "Should not allow TIMESTAMP_MILLIS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + } + + @Test + public void testTimestampMicrosType() throws Exception { + Schema date = LogicalTypes.timestampMicros().addToSchema(Schema.create(LONG)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MICROS,true));\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MICROS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MICROS); + } + + assertThrows( + "Should not allow TIMESTAMP_MICROS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + + final Schema converted = getAvroSchemaConverter(new Configuration()) + .convert(Types.buildMessage() + .addField(Types.primitive(INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType( + false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .length(1) + .named("timestamp_type")) + .named("TestAvro")); + + assertEquals( + "local-timestamp-micros", + converted + .getField("timestamp_type") + .schema() + .getLogicalType() + .getName()); + } + + @Test + public void testLocalTimestampMicrosType() throws Exception { + Schema date = 
LogicalTypes.localTimestampMicros().addToSchema(Schema.create(LONG)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null))); + + testRoundTripConversion( + expected, "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MICROS,false));\n" + "}\n"); + + for (PrimitiveTypeName primitive : + new PrimitiveTypeName[] {INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY}) { + final PrimitiveType type; + if (primitive == FIXED_LEN_BYTE_ARRAY) { + type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MICROS); + } else { + type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MICROS); + } + + assertThrows( + "Should not allow TIMESTAMP_MICROS with " + primitive, + IllegalArgumentException.class, + () -> getAvroSchemaConverter(new Configuration()).convert(message(type))); + } + } + + @Test + public void testReuseNameInNestedStructure() throws Exception { + Schema innerA1 = record("a1", "a12", field("a4", primitive(Schema.Type.FLOAT))); + + Schema outerA1 = record("a1", field("a2", primitive(Schema.Type.FLOAT)), optionalField("a1", innerA1)); + Schema schema = record("Message", optionalField("a1", outerA1)); + + String parquetSchema = "message Message {\n" + + " optional group a1 {\n" + + " required float a2;\n" + + " optional group a1 {\n" + + " required float a4;\n" + + " }\n" + + " }\n" + + "}\n"; + + testParquetToAvroConversion(schema, parquetSchema); + testParquetToAvroConversion(NEW_BEHAVIOR, schema, parquetSchema); + } + + @Test + public void testReuseNameInNestedStructureAtSameLevel() throws Exception { + Schema a2 = record("a2", field("a4", primitive(Schema.Type.FLOAT))); + Schema a22 = record( + "a2", "a22", field("a4", primitive(Schema.Type.FLOAT)), field("a5", primitive(Schema.Type.FLOAT))); + + Schema a1 = record("a1", optionalField("a2", a2)); + Schema a3 = record("a3", optionalField("a2", a22)); + + Schema schema = 
record("Message", optionalField("a1", a1), optionalField("a3", a3)); + + String parquetSchema = "message Message {\n" + + " optional group a1 {\n" + + " optional group a2 {\n" + + " required float a4;\n" + + " }\n" + + " }\n" + + " optional group a3 {\n" + + " optional group a2 {\n" + + " required float a4;\n" + + " required float a5;\n" + + " }\n" + + " }\n" + + "}\n"; + + testParquetToAvroConversion(schema, parquetSchema); + testParquetToAvroConversion(NEW_BEHAVIOR, schema, parquetSchema); + } + + @Test + public void testUUIDType() throws Exception { + Schema fromAvro = Schema.createRecord( + "myrecord", + null, + null, + false, + Arrays.asList( + new Schema.Field("uuid", LogicalTypes.uuid().addToSchema(Schema.create(STRING)), null, null))); + String parquet = "message myrecord {\n" + " required binary uuid (STRING);\n" + "}\n"; + Schema toAvro = Schema.createRecord( + "myrecord", + null, + null, + false, + Arrays.asList(new Schema.Field("uuid", Schema.create(STRING), null, null))); + + testAvroToParquetConversion(fromAvro, parquet); + testParquetToAvroConversion(toAvro, parquet); + + assertEquals( + COMPATIBLE, checkReaderWriterCompatibility(fromAvro, toAvro).getType()); + } + + @Test + public void testUUIDTypeWithParquetUUID() throws Exception { + Schema uuid = LogicalTypes.uuid().addToSchema(Schema.create(STRING)); + Schema expected = Schema.createRecord( + "myrecord", null, null, false, Arrays.asList(new Schema.Field("uuid", uuid, null, null))); + + testRoundTripConversion( + conf(AvroWriteSupport.WRITE_PARQUET_UUID, true), + expected, + "message myrecord {\n" + " required fixed_len_byte_array(16) uuid (UUID);\n" + "}\n"); + } + + @Test + public void testAvroFixed12AsParquetInt96Type() throws Exception { + Schema schema = getSchemaFromResource(TestAvroSchemaConverter.class, "/parquet-java/fixedToInt96.avsc"); + + Configuration conf = new Configuration(); + conf.setStrings( + "parquet.avro.writeFixedAsInt96", + "int96", + "mynestedrecord.int96inrecord", + 
"mynestedrecord.myarrayofoptional", + "mynestedrecord.mymap"); + testAvroToParquetConversion( + conf, + schema, + "message org.apache.parquet.avro.fixedToInt96 {\n" + + " required int96 int96;\n" + + " required fixed_len_byte_array(12) notanint96;\n" + + " required group mynestedrecord {\n" + + " required int96 int96inrecord;\n" + + " required group myarrayofoptional (LIST) {\n" + + " repeated int96 array;\n" + + " }\n" + + " required group mymap (MAP) {\n" + + " repeated group key_value (MAP_KEY_VALUE) {\n" + + " required binary key (STRING);\n" + + " required int96 value;\n" + + " }\n" + + " }\n" + + " }\n" + + " required fixed_len_byte_array(1) onebytefixed;\n" + + "}"); + + conf.setStrings("parquet.avro.writeFixedAsInt96", "onebytefixed"); + assertThrows( + "Exception should be thrown for fixed types to be converted to INT96 where the size is not 12 bytes", + IllegalArgumentException.class, + () -> getAvroSchemaConverter(conf).convert(schema)); + } + + public static Schema optional(Schema original) { + return Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), original)); + } + + public static MessageType message(PrimitiveType primitive) { + return Types.buildMessage().addField(primitive).named("myrecord"); + } + + /** + * A convenience method to avoid a large number of @Test(expected=...) tests + * + * @param message A String message to describe this assertion + * @param expected An Exception class that the Runnable should throw + * @param runnable A Runnable that is expected to throw the exception + */ + public static void assertThrows(String message, Class expected, Runnable runnable) { + try { + runnable.run(); + fail("No exception was thrown (" + message + "), expected: " + expected.getName()); + } catch (Exception actual) { + try { + assertEquals(expected, actual.getClass(), message); + } catch (AssertionError e) { + e.addSuppressed(actual); + throw e; + } + } + } + + public static Schema record(String name, String namespace, Schema.Field... 
fields) { + Schema record = Schema.createRecord(name, null, namespace, false); + record.setFields(Arrays.asList(fields)); + return record; + } + + public static Schema record(String name, Schema.Field... fields) { + return record(name, null, fields); + } + + public static Schema.Field field(String name, Schema schema) { + return new Schema.Field(name, schema, null, null); + } + + public static Schema.Field optionalField(String name, Schema schema) { + return new Schema.Field(name, optional(schema), null, JsonProperties.NULL_VALUE); + } + + public static Schema array(Schema element) { + return Schema.createArray(element); + } + + public static Schema primitive(Schema.Type type) { + return Schema.create(type); + } + + public static Configuration conf(String name, boolean value) { + Configuration conf = new Configuration(false); + conf.setBoolean(name, value); + return conf; + } + +} \ No newline at end of file diff --git a/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepair.java b/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepair.java new file mode 100644 index 0000000000000..b31d37c835dbd --- /dev/null +++ b/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepair.java @@ -0,0 +1,600 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.schema; + +import org.apache.hudi.common.util.Option; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; + +/** + * Tests {@link SchemaRepair}. + */ +public class TestSchemaRepair { + + @Test + public void testNoRepairNeededIdenticalSchemas() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "When schemas are identical, should return same instance"); + } + + @Test + public void testNoRepairNeededDifferentPrimitiveTypes() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "When field names differ, should return original schema"); + } + + @Test + public void testRepairLongWithoutLogicalTypeToLocalTimestampMillis() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + MessageType 
tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with logical type"); + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timestampField.getPrimitiveTypeName()); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairLongWithoutLogicalTypeToLocalTimestampMicros() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with logical type"); + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timestampField.getPrimitiveTypeName()); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairTimestampMicrosToTimestampMillis() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + 
.as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create a new schema with timestamp-millis"); + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timestampField.getPrimitiveTypeName()); + assertEquals(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testNoRepairNeededTimestampMillisToTimestampMicros() { + // This direction should NOT trigger repair + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should not repair timestamp-millis to timestamp-micros"); + } + + @Test + public void testNoRepairNeededNonLongTypes() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id") + ); + MessageType tableSchema = new MessageType("TestSchema", + 
Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.dateType()) + .named("id") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Should not repair non-LONG types"); + } + + @Test + public void testRepairRecordSingleField() { + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new record schema"); + assertEquals(1, result.getFields().size()); + + PrimitiveType field = result.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + field.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairRecordMultipleFieldsOnlyOneNeedsRepair() { + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + 
.as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("timestamp"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new record schema"); + assertEquals(3, result.getFields().size()); + + // Verify id field unchanged - should be same type instance + assertSame(requestedSchema.getType("id"), result.getType("id")); + + // Verify timestamp field repaired + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS), + timestampField.getLogicalTypeAnnotation()); + + // Verify name field unchanged - should be same type instance + assertSame(requestedSchema.getType("name"), result.getType("name")); + } + + @Test + public void testRepairRecordNestedRecord() { + GroupType nestedRequestedSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + GroupType nestedTableSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType requestedSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + nestedRequestedSchema + ); + + MessageType tableSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + nestedTableSchema + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, 
tableSchema); + + assertNotSame(requestedSchema, result, "Should create new schema for nested record"); + + // Verify id field unchanged - should be same type instance + assertSame(requestedSchema.getType("id"), result.getType("id")); + + // Verify nested record was repaired + GroupType nestedResult = result.getType("nested").asGroupType(); + PrimitiveType nestedTimestamp = nestedResult.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + nestedTimestamp.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairRecordMissingFieldInTableSchema() { + // Requested schema has a field not present in table schema + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("newField") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since newField doesn't exist in table schema + assertSame(requestedSchema, result, "Should return original when field missing in table schema"); + } + + @Test + public void testRepairRecordMultipleFieldsMissingInTableSchema() { + // Requested schema has multiple fields not present in table schema + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("newField1"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name"), + 
Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("newField2") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since new fields don't exist in table schema + assertSame(requestedSchema, result, "Should return original when multiple fields missing in table schema"); + } + + @Test + public void testRepairRecordMixedMissingAndRepairableFields() { + // Requested schema has some fields missing in table, some needing repair, some unchanged + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("newField"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // 
Should create new schema with timestamp repaired, but newField preserved from requested + assertNotSame(requestedSchema, result, "Should create new schema"); + assertEquals(4, result.getFields().size()); + + // Verify id field unchanged + assertSame(requestedSchema.getType("id"), result.getType("id")); + + // Verify timestamp field repaired + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + + // Verify newField preserved from requested schema (not in table) + assertSame(requestedSchema.getType("newField"), result.getType("newField")); + + // Verify name field unchanged + assertSame(requestedSchema.getType("name"), result.getType("name")); + } + + @Test + public void testRepairNestedRecordFieldMissingInTableSchema() { + // Requested nested record has a field not present in table's nested record + GroupType nestedRequestedSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("extraField") + ); + + GroupType nestedTableSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType requestedSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + nestedRequestedSchema + ); + + MessageType tableSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + nestedTableSchema + ); + + 
MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result, "Should create new schema"); + + // Verify id field unchanged + assertSame(requestedSchema.getType("id"), result.getType("id")); + + // Verify nested record was repaired but still has extraField + GroupType nestedResult = result.getType("nested").asGroupType(); + assertEquals(2, nestedResult.getFieldCount()); + + // Timestamp should be repaired + PrimitiveType timestampField = nestedResult.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + + // extraField should be preserved from requested schema + assertSame(nestedRequestedSchema.getType("extraField"), nestedResult.getType("extraField")); + } + + @Test + public void testRepairRecordWholeNestedRecordMissingInTableSchema() { + // Requested schema has a nested record field that doesn't exist in table schema + GroupType nestedRequestedSchema = new GroupType(Type.Repetition.REQUIRED, "newNested", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + MessageType requestedSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + nestedRequestedSchema + ); + + MessageType tableSchema = new MessageType("OuterRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + // Should return original schema unchanged since newNested field doesn't exist in table + assertSame(requestedSchema, result, "Should return original when nested field missing in table schema"); + } + + @Test + public void testEdgeCaseEmptyRecord() { + MessageType requestedSchema = new MessageType("EmptyRecord"); + 
MessageType tableSchema = new MessageType("EmptyRecord"); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertSame(requestedSchema, result, "Empty records should return same instance"); + } + + @Test + public void testRepairRecordFirstFieldChanged() { + // Test the optimization path where the first field needs repair + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp1"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp2") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp1"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named("timestamp2") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + PrimitiveType timestamp1 = result.getType("timestamp1").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestamp1.getLogicalTypeAnnotation()); + PrimitiveType timestamp2 = result.getType("timestamp2").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS), + timestamp2.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairRecordLastFieldChanged() { + // Test the optimization path where only the last field needs repair + MessageType requestedSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + 
Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + MessageType tableSchema = new MessageType("TestRecord", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, Type.Repetition.REQUIRED) + .named("id"), + Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("name"), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + // Verify id and name fields unchanged - should be same type instances + assertSame(requestedSchema.getType("id"), result.getType("id")); + assertSame(requestedSchema.getType("name"), result.getType("name")); + // Verify timestamp field repaired + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairLogicalTypesWithOptionEmpty() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, Option.empty()); + + assertSame(requestedSchema, result, "Should return original when Option is empty"); + } + + @Test + public void testRepairLogicalTypesWithOptionPresent() { + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + 
.named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, Option.of(tableSchema)); + + assertNotSame(requestedSchema, result, "Should repair when Option is present"); + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairOptionalFieldRepetition() { + // Test that repair preserves the requested field's repetition (OPTIONAL vs REQUIRED) + MessageType requestedSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.OPTIONAL) + .named("timestamp") + ); + MessageType tableSchema = new MessageType("TestSchema", + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + PrimitiveType timestampField = result.getType("timestamp").asPrimitiveType(); + assertEquals(Type.Repetition.OPTIONAL, timestampField.getRepetition(), + "Should preserve requested field's repetition"); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } + + @Test + public void testRepairNestedGroupPreservesLogicalType() { + // Test that repair preserves the group's logical type annotation + GroupType nestedRequestedSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + 
LogicalTypeAnnotation.listType(), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .named("timestamp") + ); + + GroupType nestedTableSchema = new GroupType(Type.Repetition.REQUIRED, "nested", + LogicalTypeAnnotation.listType(), + Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)) + .named("timestamp") + ); + + MessageType requestedSchema = new MessageType("OuterRecord", nestedRequestedSchema); + MessageType tableSchema = new MessageType("OuterRecord", nestedTableSchema); + + MessageType result = SchemaRepair.repairLogicalTypes(requestedSchema, tableSchema); + + assertNotSame(requestedSchema, result); + GroupType nestedResult = result.getType("nested").asGroupType(); + assertEquals(LogicalTypeAnnotation.listType(), nestedResult.getLogicalTypeAnnotation(), + "Should preserve group's logical type annotation"); + PrimitiveType timestampField = nestedResult.getType("timestamp").asPrimitiveType(); + assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS), + timestampField.getLogicalTypeAnnotation()); + } +} diff --git a/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepairEquivalence.java b/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepairEquivalence.java new file mode 100644 index 0000000000000..75fe9ffde7d61 --- /dev/null +++ b/hudi-common/src/parquet/test/org/apache/parquet/schema/TestSchemaRepairEquivalence.java @@ -0,0 +1,481 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.schema; + +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.avro.HoodieAvroParquetSchemaConverter; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Tests equivalence between {@link SchemaRepair} and {@link AvroSchemaRepair}. + * + * This test class verifies that both repair implementations produce logically + * equivalent results when converting between Avro and Parquet schemas. + */ +public class TestSchemaRepairEquivalence { + + private HoodieAvroParquetSchemaConverter converter; + + @BeforeEach + public void setUp() { + converter = HoodieAvroParquetSchemaConverter.getAvroSchemaConverter(new Configuration()); + } + + /** + * Helper method to verify that AvroSchemaRepair and SchemaRepair produce equivalent results. 
+ */ + private void assertRepairEquivalence(Schema requestedAvro, Schema tableAvro) { + // Apply Avro repair + Schema repairedAvro = AvroSchemaRepair.repairLogicalTypes(requestedAvro, tableAvro); + + // Convert to Parquet schemas + MessageType requestedParquet = converter.convert(requestedAvro); + MessageType tableParquet = converter.convert(tableAvro); + + // Apply Parquet repair + MessageType repairedParquet = SchemaRepair.repairLogicalTypes(requestedParquet, tableParquet); + + // Convert repaired Parquet back to Avro + Schema repairedParquetAsAvro = converter.convert(repairedParquet); + + // Verify equivalence + assertEquals(repairedAvro, repairedParquetAsAvro, + "SchemaRepair and AvroSchemaRepair should produce equivalent results"); + } + + @Test + public void testEquivalenceNoRepairNeeded() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("value").type().longType().noDefault() + .endRecord(); + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("value").type().longType().noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceLongToLocalTimestampMillis() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceLongToLocalTimestampMicros() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + 
.type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceTimestampMicrosToMillis() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceNoRepairTimestampMillisToMicros() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceSimpleRecord() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordMultipleFields() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().longType().noDefault() + 
.name("name").type().stringType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp") + .type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceNestedRecord() { + Schema nestedRequestedSchema = SchemaBuilder.record("nestedrecord") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema nestedTableSchema = SchemaBuilder.record("nestedrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("outerrecord") + .fields() + .name("id").type().intType().noDefault() + .name("nestedrecord").type(nestedRequestedSchema).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("outerrecord") + .fields() + .name("id").type().intType().noDefault() + .name("nestedrecord").type(nestedTableSchema).noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordWithExtraFieldInRequested() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().longType().noDefault() + .name("newfield").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordMixedFields() 
{ + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp").type().longType().noDefault() + .name("newfield").type().stringType().noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("name").type().stringType().noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceNestedRecordWithExtraField() { + Schema nestedRequestedSchema = SchemaBuilder.record("nestedrecord") + .fields() + .name("timestamp").type().longType().noDefault() + .name("extrafield").type().stringType().noDefault() + .endRecord(); + + Schema nestedTableSchema = SchemaBuilder.record("nestedrecord") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("outerrecord") + .fields() + .name("id").type().intType().noDefault() + .name("nestedrecord").type(nestedRequestedSchema).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("outerrecord") + .fields() + .name("id").type().intType().noDefault() + .name("nestedrecord").type(nestedTableSchema).noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordFirstFieldChanged() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp1").type().longType().noDefault() + .name("timestamp2").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("timestamp1") + 
.type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("timestamp2") + .type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordLastFieldChanged() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("id").type().intType().noDefault() + .name("name").type().stringType().noDefault() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceComplexNestedStructure() { + Schema innerRecordRequested = SchemaBuilder.record("inner") + .fields() + .name("timestamp").type().longType().noDefault() + .name("value").type().intType().noDefault() + .endRecord(); + + Schema innerRecordTable = SchemaBuilder.record("inner") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("value").type().intType().noDefault() + .endRecord(); + + Schema middleRecordRequested = SchemaBuilder.record("middle") + .fields() + .name("inner").type(innerRecordRequested).noDefault() + .name("middletimestamp").type().longType().noDefault() + .endRecord(); + + Schema middleRecordTable = SchemaBuilder.record("middle") + .fields() + .name("inner").type(innerRecordTable).noDefault() + .name("middletimestamp") + .type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + Schema requestedSchema = 
SchemaBuilder.record("outer") + .fields() + .name("id").type().intType().noDefault() + .name("middle").type(middleRecordRequested).noDefault() + .name("outertimestamp").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("outer") + .fields() + .name("id").type().intType().noDefault() + .name("middle").type(middleRecordTable).noDefault() + .name("outertimestamp") + .type(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceEmptyRecord() { + Schema requestedSchema = SchemaBuilder.record("emptyrecord").fields().endRecord(); + Schema tableSchema = SchemaBuilder.record("emptyrecord").fields().endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceRecordNoFieldsMatch() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("field1").type().longType().noDefault() + .name("field2").type().stringType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("field3").type().intType().noDefault() + .name("field4") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceMultipleTimestampRepairs() { + Schema requestedSchema = SchemaBuilder.record("testrecord") + .fields() + .name("ts1").type().longType().noDefault() + .name("ts2").type().longType().noDefault() + .name("ts3").type(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG))).noDefault() + .name("ts4").type().longType().noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("testrecord") + .fields() + .name("ts1") + 
.type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("ts2") + .type(LogicalTypes.localTimestampMicros().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("ts3") + .type(LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .name("ts4").type().longType().noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } + + @Test + public void testEquivalenceDeepNesting() { + Schema level3Requested = SchemaBuilder.record("level3") + .fields() + .name("timestamp").type().longType().noDefault() + .endRecord(); + + Schema level3Table = SchemaBuilder.record("level3") + .fields() + .name("timestamp") + .type(LogicalTypes.localTimestampMillis().addToSchema(Schema.create(Schema.Type.LONG))) + .noDefault() + .endRecord(); + + Schema level2Requested = SchemaBuilder.record("level2") + .fields() + .name("level3").type(level3Requested).noDefault() + .endRecord(); + + Schema level2Table = SchemaBuilder.record("level2") + .fields() + .name("level3").type(level3Table).noDefault() + .endRecord(); + + Schema level1Requested = SchemaBuilder.record("level1") + .fields() + .name("level2").type(level2Requested).noDefault() + .endRecord(); + + Schema level1Table = SchemaBuilder.record("level1") + .fields() + .name("level2").type(level2Table).noDefault() + .endRecord(); + + Schema requestedSchema = SchemaBuilder.record("level0") + .fields() + .name("level1").type(level1Requested).noDefault() + .endRecord(); + + Schema tableSchema = SchemaBuilder.record("level0") + .fields() + .name("level1").type(level1Table).noDefault() + .endRecord(); + + assertRepairEquivalence(requestedSchema, tableSchema); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java index cd3755d26c81f..32ca10777aaf2 100644 --- 
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java @@ -19,9 +19,11 @@ package org.apache.hudi.common.testutils; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -35,10 +37,13 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.avro.Conversions; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; @@ -62,8 +67,10 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.time.Instant; +import java.time.LocalDate; import java.time.LocalDateTime; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -72,15 +79,20 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Random; import java.util.Set; import java.util.UUID; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Predicate; import java.util.function.Supplier; import java.util.stream.Collectors; import 
java.util.stream.IntStream; import java.util.stream.Stream; +import static org.apache.hudi.avro.HoodieAvroUtils.createNewSchemaField; +import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes; import static org.apache.hudi.common.util.ValidationUtils.checkState; /** @@ -131,6 +143,21 @@ public class HoodieTestDataGenerator implements AutoCloseable { + "{\"name\":\"current_date\",\"type\": {\"type\": \"int\", \"logicalType\": \"date\"}}," + "{\"name\":\"current_ts\",\"type\": {\"type\": \"long\"}}," + "{\"name\":\"height\",\"type\":{\"type\":\"fixed\",\"name\":\"abc\",\"size\":5,\"logicalType\":\"decimal\",\"precision\":10,\"scale\":6}},"; + public static final String EXTENDED_LOGICAL_TYPES_SCHEMA_V6 = "{\"name\":\"ts_millis\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}}," + + "{\"name\":\"ts_micros\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}}," + + "{\"name\":\"local_ts_millis\",\"type\":{\"type\":\"long\",\"logicalType\":\"local-timestamp-millis\"}}," + + "{\"name\":\"local_ts_micros\",\"type\":{\"type\":\"long\",\"logicalType\":\"local-timestamp-micros\"}}," + + "{\"name\":\"event_date\",\"type\":{\"type\":\"int\",\"logicalType\":\"date\"}},"; + public static final String EXTENDED_LOGICAL_TYPES_SCHEMA = "{\"name\":\"ts_millis\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}}," + + "{\"name\":\"ts_micros\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}}," + + "{\"name\":\"local_ts_millis\",\"type\":{\"type\":\"long\",\"logicalType\":\"local-timestamp-millis\"}}," + + "{\"name\":\"local_ts_micros\",\"type\":{\"type\":\"long\",\"logicalType\":\"local-timestamp-micros\"}}," + + "{\"name\":\"event_date\",\"type\":{\"type\":\"int\",\"logicalType\":\"date\"}},"; + + // LTS = Local Timestamp + public static final String EXTENDED_LOGICAL_TYPES_SCHEMA_NO_LTS = "{\"name\":\"ts_millis\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}}," + + 
"{\"name\":\"ts_micros\",\"type\":{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}}," + + "{\"name\":\"event_date\",\"type\":{\"type\":\"int\",\"logicalType\":\"date\"}},"; public static final String TRIP_EXAMPLE_SCHEMA = TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA + FARE_NESTED_SCHEMA + TIP_NESTED_SCHEMA + TRIP_SCHEMA_SUFFIX; @@ -139,6 +166,13 @@ public class HoodieTestDataGenerator implements AutoCloseable { public static final String TRIP_NESTED_EXAMPLE_SCHEMA = TRIP_SCHEMA_PREFIX + FARE_NESTED_SCHEMA + TRIP_SCHEMA_SUFFIX; + public static final String TRIP_LOGICAL_TYPES_SCHEMA_V6 = + TRIP_SCHEMA_PREFIX + EXTENDED_LOGICAL_TYPES_SCHEMA_V6 + TRIP_SCHEMA_SUFFIX; + public static final String TRIP_LOGICAL_TYPES_SCHEMA = + TRIP_SCHEMA_PREFIX + EXTENDED_LOGICAL_TYPES_SCHEMA + TRIP_SCHEMA_SUFFIX; + // LTS = Local Timestamp + public static final String TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS = + TRIP_SCHEMA_PREFIX + EXTENDED_LOGICAL_TYPES_SCHEMA_NO_LTS + TRIP_SCHEMA_SUFFIX; public static final String TRIP_SCHEMA = "{\"type\":\"record\",\"name\":\"tripUberRec\",\"fields\":[" + "{\"name\":\"timestamp\",\"type\":\"long\"},{\"name\":\"_row_key\",\"type\":\"string\"},{\"name\":\"rider\",\"type\":\"string\"}," @@ -151,13 +185,15 @@ public class HoodieTestDataGenerator implements AutoCloseable { public static final String TRIP_HIVE_COLUMN_TYPES = "bigint,string,string,string,string,string,double,double,double,double,int,bigint,float,binary,int,bigint,decimal(10,6)," + "map,struct,array>,boolean"; - public static final Schema AVRO_SCHEMA = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); public static final Schema NESTED_AVRO_SCHEMA = new Schema.Parser().parse(TRIP_NESTED_EXAMPLE_SCHEMA); public static final TypeDescription ORC_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); public static final Schema AVRO_SCHEMA_WITH_METADATA_FIELDS = HoodieAvroUtils.addMetadataFields(AVRO_SCHEMA); public static final Schema 
AVRO_SHORT_TRIP_SCHEMA = new Schema.Parser().parse(SHORT_TRIP_SCHEMA); + public static final Schema AVRO_TRIP_LOGICAL_TYPES_SCHEMA = new Schema.Parser().parse(TRIP_LOGICAL_TYPES_SCHEMA); + public static final Schema AVRO_TRIP_LOGICAL_TYPES_SCHEMA_V6 = new Schema.Parser().parse(TRIP_LOGICAL_TYPES_SCHEMA_V6); + public static final Schema AVRO_TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS = new Schema.Parser().parse(TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS); public static final Schema AVRO_TRIP_SCHEMA = new Schema.Parser().parse(TRIP_SCHEMA); public static final TypeDescription ORC_TRIP_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_SCHEMA)); public static final Schema FLATTENED_AVRO_SCHEMA = new Schema.Parser().parse(TRIP_FLATTENED_SCHEMA); @@ -169,6 +205,7 @@ public class HoodieTestDataGenerator implements AutoCloseable { private final String[] partitionPaths; //maintains the count of existing keys schema wise private Map numKeysBySchema; + private Option extendedSchema = Option.empty(); public HoodieTestDataGenerator(long seed) { this(seed, DEFAULT_PARTITION_PATHS, new HashMap<>()); @@ -258,16 +295,40 @@ public int getEstimatedFileSizeInBytes(int numOfRecords) { } public RawTripTestPayload generateRandomValueAsPerSchema(String schemaStr, HoodieKey key, String commitTime, boolean isFlattened) throws IOException { - if (TRIP_EXAMPLE_SCHEMA.equals(schemaStr)) { - return generateRandomValue(key, commitTime, isFlattened); - } else if (TRIP_SCHEMA.equals(schemaStr)) { - return generatePayloadForTripSchema(key, commitTime); - } else if (SHORT_TRIP_SCHEMA.equals(schemaStr)) { - return generatePayloadForShortTripSchema(key, commitTime); - } else if (TRIP_NESTED_EXAMPLE_SCHEMA.equals(schemaStr)) { - return generateNestedExampleRandomValue(key, commitTime); + return generateRandomValueAsPerSchema(schemaStr, key, commitTime, isFlattened, false, 0L); + } + + public RawTripTestPayload generateRandomValueAsPerSchema(String schemaStr, HoodieKey key, String commitTime, + boolean 
isFlattened, boolean isDelete, long timestamp) throws IOException { + if (!isDelete) { + if (TRIP_FLATTENED_SCHEMA.equals(schemaStr)) { + return generateRandomValue(key, commitTime, true, timestamp); + } else if (TRIP_EXAMPLE_SCHEMA.equals(schemaStr)) { + return generateRandomValue(key, commitTime, isFlattened, timestamp); + } else if (TRIP_SCHEMA.equals(schemaStr)) { + return generatePayloadForTripSchema(key, commitTime); + } else if (SHORT_TRIP_SCHEMA.equals(schemaStr)) { + return generatePayloadForShortTripSchema(key, commitTime); + } else if (TRIP_NESTED_EXAMPLE_SCHEMA.equals(schemaStr)) { + return generateNestedExampleRandomValue(key, commitTime); + } else if (TRIP_LOGICAL_TYPES_SCHEMA.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchema(key, commitTime, false, timestamp); + } else if (TRIP_LOGICAL_TYPES_SCHEMA_V6.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchemaV6(key, commitTime, false, timestamp); + } else if (TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchemaNoLTS(key, commitTime, false, timestamp); + } + } else { + if (TRIP_EXAMPLE_SCHEMA.equals(schemaStr)) { + return generateRandomDeleteValue(key, commitTime); + } else if (TRIP_LOGICAL_TYPES_SCHEMA.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchema(key, commitTime, true, timestamp); + } else if (TRIP_LOGICAL_TYPES_SCHEMA_V6.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchemaV6(key, commitTime, true, timestamp); + } else if (TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS.equals(schemaStr)) { + return generatePayloadForLogicalTypesSchemaNoLTS(key, commitTime, true, timestamp); + } } - return null; } @@ -336,9 +397,17 @@ public RawTripTestPayload generatePayloadForShortTripSchema(HoodieKey key, Strin * Generates a new avro record of the above schema format for a delete. 
*/ private RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException { + return generateRandomDeleteValue(key, instantTime, TRIP_EXAMPLE_SCHEMA); + } + + private RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime, String schemaStr) throws IOException { GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0, true, false); - return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA, true, 0L); + return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(), schemaStr, true, 0L); + } + + private RawTripTestPayload generateRandomDeleteValuePerSchema(HoodieKey key, String instantTime, String schemaStr) throws IOException { + return generateRandomValueAsPerSchema(schemaStr, key, instantTime, false, true, 0L); } /** @@ -354,6 +423,20 @@ public GenericRecord generateGenericRecord(String rowKey, String partitionPath, return generateGenericRecord(rowKey, partitionPath, riderName, driverName, timestamp, false, false); } + /** + * LTS = Local Timestamp + */ + public RawTripTestPayload generatePayloadForLogicalTypesSchemaNoLTS(HoodieKey key, String commitTime, boolean isDelete, long timestamp) throws IOException { + return generateRecordForTripLogicalTypesSchema(key, "rider-" + commitTime, "driver-" + commitTime, timestamp, isDelete, false, false); + } + + public RawTripTestPayload generatePayloadForLogicalTypesSchema(HoodieKey key, String commitTime, boolean isDelete, long timestamp) throws IOException { + return generateRecordForTripLogicalTypesSchema(key, "rider-" + commitTime, "driver-" + commitTime, timestamp, isDelete, false, true); + } + + public RawTripTestPayload generatePayloadForLogicalTypesSchemaV6(HoodieKey key, String commitTime, boolean isDelete, long timestamp) throws IOException { + return 
generateRecordForTripLogicalTypesSchema(key, "rider-" + commitTime, "driver-" + commitTime, timestamp, isDelete, true, true); + } /** * Populate rec with values for TRIP_SCHEMA_PREFIX @@ -442,13 +525,11 @@ private void generateTripSuffixValues(GenericRecord rec, boolean isDeleteRecord) } } - /** * Generate record conforming to TRIP_EXAMPLE_SCHEMA or TRIP_FLATTENED_SCHEMA if isFlattened is true */ public GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, - long timestamp, boolean isDeleteRecord, - boolean isFlattened) { + long timestamp, boolean isDeleteRecord, boolean isFlattened) { GenericRecord rec = new GenericData.Record(isFlattened ? FLATTENED_AVRO_SCHEMA : AVRO_SCHEMA); generateTripPrefixValues(rec, rowKey, partitionPath, riderName, driverName, timestamp); if (isFlattened) { @@ -500,6 +581,65 @@ public GenericRecord generateRecordForShortTripSchema(String rowKey, String ride return rec; } + public RawTripTestPayload generateRecordForTripLogicalTypesSchema(HoodieKey key, String riderName, String driverName, + long timestamp, boolean isDeleteRecord, boolean v6, + boolean hasLTS) throws IOException { + GenericRecord rec; + if (!hasLTS) { + // LTS = Local Timestamp + rec = new GenericData.Record(AVRO_TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS); + } else if (v6) { + rec = new GenericData.Record(AVRO_TRIP_LOGICAL_TYPES_SCHEMA_V6); + } else { + rec = new GenericData.Record(AVRO_TRIP_LOGICAL_TYPES_SCHEMA); + } + generateTripPrefixValues(rec, key.getRecordKey(), key.getPartitionPath(), riderName, driverName, timestamp); + + int hash = key.getRecordKey().hashCode(); + boolean above = (hash & 1) == 0; // half above, half below threshold + + // ------------------- + // Threshold definitions + // ------------------- + Instant tsMillisThreshold = Instant.parse("2020-01-01T00:00:00Z"); + Instant tsMicrosThreshold = Instant.parse("2020-06-01T12:00:00Z"); + + Instant localTsMillisThreshold = ZonedDateTime.of( + 2015, 5, 20, 12, 
34, 56, 0, ZoneOffset.UTC).toInstant(); + Instant localTsMicrosThreshold = ZonedDateTime.of( + 2017, 7, 7, 7, 7, 7, 0, ZoneOffset.UTC).toInstant(); + + LocalDate dateThreshold = LocalDate.of(2000, 1, 1); + + // ------------------- + // Assign edge values + // ------------------- + + // ts_millis + long tsMillisBase = tsMillisThreshold.toEpochMilli(); + rec.put("ts_millis", above ? tsMillisBase + 1 : tsMillisBase - 1); + + // ts_micros + long tsMicrosBase = TimeUnit.SECONDS.toMicros(tsMicrosThreshold.getEpochSecond()) + tsMicrosThreshold.getNano() / 1_000L; + rec.put("ts_micros", above ? tsMicrosBase + 1 : tsMicrosBase - 1); + + if (hasLTS) { + // local_ts_millis + long localTsMillisBase = localTsMillisThreshold.toEpochMilli(); + rec.put("local_ts_millis", above ? localTsMillisBase + 1 : localTsMillisBase - 1); + + // local_ts_micros + long localTsMicrosBase = TimeUnit.SECONDS.toMicros(localTsMicrosThreshold.getEpochSecond()) + localTsMicrosThreshold.getNano() / 1_000L; + rec.put("local_ts_micros", above ? localTsMicrosBase + 1 : localTsMicrosBase - 1); + } + + // event_date + int eventDateBase = (int) dateThreshold.toEpochDay(); + rec.put("event_date", above ? 
eventDateBase + 1 : eventDateBase - 1); + generateTripSuffixValues(rec, isDeleteRecord); + return new RawTripTestPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), rec.getSchema().toString()); + } + public static void createRequestedCommitFile(String basePath, String instantTime, Configuration configuration) throws IOException { Path pendingRequestedFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeRequestedCommitFileName(instantTime)); @@ -939,9 +1079,13 @@ public Stream generateUniqueUpdatesStream(String instantTime, Inte * @return stream of hoodie record updates */ public Stream generateUniqueDeleteStream(Integer n) { + return generateUniqueDeleteStream(n, TRIP_EXAMPLE_SCHEMA); + } + + public Stream generateUniqueDeleteStream(Integer n, String streamStr) { final Set used = new HashSet<>(); - Map existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); - Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); + Map existingKeys = existingKeysBySchema.get(streamStr); + Integer numExistingKeys = numKeysBySchema.get(streamStr); if (n > numExistingKeys) { throw new IllegalArgumentException("Requested unique deletes is greater than number of available keys"); } @@ -959,7 +1103,7 @@ public Stream generateUniqueDeleteStream(Integer n) { used.add(kp); result.add(kp.key); } - numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, numExistingKeys); + numKeysBySchema.put(streamStr, numExistingKeys); return result.stream(); } @@ -971,9 +1115,13 @@ public Stream generateUniqueDeleteStream(Integer n) { * @return stream of hoodie records for delete */ public Stream generateUniqueDeleteRecordStream(String instantTime, Integer n) { + return generateUniqueDeleteRecordStream(instantTime, n, TRIP_EXAMPLE_SCHEMA); + } + + public Stream generateUniqueDeleteRecordStream(String instantTime, Integer n, String schemaStr) { final Set used = new HashSet<>(); - Map existingKeys = 
existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); - Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); + Map existingKeys = existingKeysBySchema.get(schemaStr); + Integer numExistingKeys = numKeysBySchema.get(schemaStr); if (n > numExistingKeys) { throw new IllegalArgumentException("Requested unique deletes is greater than number of available keys"); } @@ -991,12 +1139,12 @@ public Stream generateUniqueDeleteRecordStream(String instantTime, numExistingKeys--; used.add(kp); try { - result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime))); + result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValuePerSchema(kp.key, instantTime, schemaStr))); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } } - numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, numExistingKeys); + numKeysBySchema.put(schemaStr, numExistingKeys); return result.stream(); } @@ -1062,7 +1210,7 @@ public void close() { private static long genRandomTimeMillis(Random r) { // Fri Feb 13 15:31:30 PST 2009 - long anchorTs = 1234567890L; + long anchorTs = 1234567890000L; // NOTE: To provide for certainty and not generate overly random dates, we will limit // dispersion to be w/in +/- 3 days from the anchor date return anchorTs + r.nextLong() % 259200000L; @@ -1086,4 +1234,311 @@ public static UUID genPseudoRandomUUID(Random r) { throw new HoodieException(e); } } + + /** + * Used for equality checks between the expected and actual records for generated by the HoodieTestDataGenerator. + * The fields identify the record with the combination of the recordKey and partitionPath and assert that the proper + * value is present with the orderingVal and the riderValue, which is updated as part of the update utility methods. 
+ */ + public static class RecordIdentifier { + private final String recordKey; + private final String orderingVal; + private final String partitionPath; + private final String riderValue; + + @JsonCreator + public RecordIdentifier(@JsonProperty("recordKey") String recordKey, + @JsonProperty("partitionPath") String partitionPath, + @JsonProperty("orderingVal") String orderingVal, + @JsonProperty("riderValue") String riderValue) { + this.recordKey = recordKey; + this.orderingVal = orderingVal; + this.partitionPath = partitionPath; + this.riderValue = riderValue; + } + + public static RecordIdentifier clone(RecordIdentifier toClone, String orderingVal) { + return new RecordIdentifier(toClone.recordKey, toClone.partitionPath, orderingVal, toClone.riderValue); + } + + public static RecordIdentifier fromTripTestPayload(HoodieAvroIndexedRecord record, String[] orderingFields) { + String recordKey = record.getRecordKey(); + String partitionPath = record.getPartitionPath(); + Comparable orderingValue = record.getOrderingValue(record.getData().getSchema(), CollectionUtils.emptyProps()); + String orderingValStr = orderingValue.toString(); + GenericRecord data = (GenericRecord) record.getData(); + String riderValue = data.getSchema().getField("rider") != null ? 
data.get("rider").toString() : ""; + return new RecordIdentifier(recordKey, partitionPath, orderingValStr, riderValue); + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + RecordIdentifier that = (RecordIdentifier) o; + return Objects.equals(recordKey, that.recordKey) + && Objects.equals(orderingVal, that.orderingVal) + && Objects.equals(partitionPath, that.partitionPath) + && Objects.equals(riderValue, that.riderValue); + } + + @Override + public int hashCode() { + return Objects.hash(recordKey, orderingVal, partitionPath, riderValue); + } + + public String getRecordKey() { + return recordKey; + } + + public String getOrderingVal() { + return orderingVal; + } + + public String getPartitionPath() { + return partitionPath; + } + + public String getRiderValue() { + return riderValue; + } + + @Override + public String toString() { + return "RowKey: " + recordKey + ", PartitionPath: " + partitionPath + + ", OrderingVal: " + orderingVal + ", RiderValue: " + riderValue; + } + } + + public static class SchemaEvolutionConfigs { + public Schema schema = AVRO_SCHEMA; + public boolean nestedSupport = true; + public boolean mapSupport = true; + public boolean arraySupport = true; + public boolean addNewFieldSupport = true; + // TODO: [HUDI-9603] Flink 1.18 array values incorrect in fg reader test + public boolean anyArraySupport = true; + + // Int + public boolean intToLongSupport = true; + public boolean intToFloatSupport = true; + public boolean intToDoubleSupport = true; + public boolean intToStringSupport = true; + + // Long + public boolean longToFloatSupport = true; + public boolean longToDoubleSupport = true; + public boolean longToStringSupport = true; + + // Float + public boolean floatToDoubleSupport = true; + public boolean floatToStringSupport = true; + + // Double + public boolean doubleToStringSupport = true; + + // String + public boolean stringToBytesSupport = true; + + // Bytes + public 
boolean bytesToStringSupport = true; + } + + private enum SchemaEvolutionTypePromotionCase { + INT_TO_INT(Schema.Type.INT, Schema.Type.INT, config -> true), + INT_TO_LONG(Schema.Type.INT, Schema.Type.LONG, config -> config.intToLongSupport), + INT_TO_FLOAT(Schema.Type.INT, Schema.Type.FLOAT, config -> config.intToFloatSupport), + INT_TO_DOUBLE(Schema.Type.INT, Schema.Type.DOUBLE, config -> config.intToDoubleSupport), + INT_TO_STRING(Schema.Type.INT, Schema.Type.STRING, config -> config.intToStringSupport), + LONG_TO_LONG(Schema.Type.LONG, Schema.Type.LONG, config -> true), + LONG_TO_FLOAT(Schema.Type.LONG, Schema.Type.FLOAT, config -> config.longToFloatSupport), + LONG_TO_DOUBLE(Schema.Type.LONG, Schema.Type.DOUBLE, config -> config.longToDoubleSupport), + LONG_TO_STRING(Schema.Type.LONG, Schema.Type.STRING, config -> config.longToStringSupport), + FLOAT_TO_FLOAT(Schema.Type.FLOAT, Schema.Type.FLOAT, config -> true), + FLOAT_TO_DOUBLE(Schema.Type.FLOAT, Schema.Type.DOUBLE, config -> config.floatToDoubleSupport), + FLOAT_TO_STRING(Schema.Type.FLOAT, Schema.Type.STRING, config -> config.floatToStringSupport), + DOUBLE_TO_DOUBLE(Schema.Type.DOUBLE, Schema.Type.DOUBLE, config -> true), + DOUBLE_TO_STRING(Schema.Type.DOUBLE, Schema.Type.STRING, config -> config.doubleToStringSupport), + STRING_TO_STRING(Schema.Type.STRING, Schema.Type.STRING, config -> true), + STRING_TO_BYTES(Schema.Type.STRING, Schema.Type.BYTES, config -> config.stringToBytesSupport), + BYTES_TO_BYTES(Schema.Type.BYTES, Schema.Type.BYTES, config -> true), + BYTES_TO_STRING(Schema.Type.BYTES, Schema.Type.STRING, config -> config.bytesToStringSupport); + + public final Schema.Type before; + public final Schema.Type after; + public final Predicate isEnabled; + + SchemaEvolutionTypePromotionCase(Schema.Type before, Schema.Type after, Predicate isEnabled) { + this.before = before; + this.after = after; + this.isEnabled = isEnabled; + } + } + + public void extendSchema(SchemaEvolutionConfigs configs, 
boolean isBefore) { + List baseFields = new ArrayList<>(); + for (SchemaEvolutionTypePromotionCase evolution : SchemaEvolutionTypePromotionCase.values()) { + if (evolution.isEnabled.test(configs)) { + baseFields.add(isBefore ? evolution.before : evolution.after); + } + } + + // Add new field if we are testing adding new fields + if (!isBefore && configs.addNewFieldSupport) { + baseFields.add(Schema.Type.BOOLEAN); + } + + this.extendedSchema = Option.of(generateExtendedSchema(configs, new ArrayList<>(baseFields))); + } + + public void extendSchemaBeforeEvolution(SchemaEvolutionConfigs configs) { + extendSchema(configs, true); + } + + public void extendSchemaAfterEvolution(SchemaEvolutionConfigs configs) { + extendSchema(configs, false); + } + + public Schema getExtendedSchema() { + return extendedSchema.orElseThrow(IllegalArgumentException::new); + } + + private static Schema generateExtendedSchema(SchemaEvolutionConfigs configs, List baseFields) { + return generateExtendedSchema(configs.schema, configs, baseFields, "customField", true); + } + + private static Schema generateExtendedSchema(Schema baseSchema, SchemaEvolutionConfigs configs, List baseFields, String fieldPrefix, boolean toplevel) { + List fields = baseSchema.getFields(); + List finalFields = new ArrayList<>(fields.size() + baseFields.size()); + boolean addedFields = false; + for (Schema.Field field : fields) { + if (configs.nestedSupport && field.name().equals("fare") && field.schema().getType() == Schema.Type.RECORD) { + finalFields.add(createNewSchemaField(field.name(), generateExtendedSchema(field.schema(), configs, baseFields, "customFare", false), field.doc(), field.defaultVal())); + } else if (configs.anyArraySupport || !field.name().equals("tip_history")) { + //TODO: [HUDI-9603] remove the if condition when the issue is fixed + if (field.name().equals("_hoodie_is_deleted")) { + addedFields = true; + addFields(configs, finalFields, baseFields, fieldPrefix, baseSchema.getNamespace(), toplevel); + 
} + finalFields.add(createNewSchemaField(field)); + } + } + if (!addedFields) { + addFields(configs, finalFields, baseFields, fieldPrefix, baseSchema.getNamespace(), toplevel); + } + Schema finalSchema = Schema.createRecord(baseSchema.getName(), baseSchema.getDoc(), + baseSchema.getNamespace(), baseSchema.isError()); + finalSchema.setFields(finalFields); + return finalSchema; + } + + private static void addFields(SchemaEvolutionConfigs configs, List finalFields, List baseFields, String fieldPrefix, String namespace, boolean toplevel) { + if (toplevel) { + if (configs.mapSupport) { + List mapFields = new ArrayList<>(baseFields.size()); + addFieldsHelper(mapFields, baseFields, fieldPrefix + "Map"); + finalFields.add(new Schema.Field(fieldPrefix + "Map", Schema.createMap(Schema.createRecord("customMapRecord", "", namespace, false, mapFields)), "", null)); + } + + if (configs.arraySupport) { + List arrayFields = new ArrayList<>(baseFields.size()); + addFieldsHelper(arrayFields, baseFields, fieldPrefix + "Array"); + finalFields.add(new Schema.Field(fieldPrefix + "Array", Schema.createArray(Schema.createRecord("customArrayRecord", "", namespace, false, arrayFields)), "", null)); + } + } + addFieldsHelper(finalFields, baseFields, fieldPrefix); + } + + private static void addFieldsHelper(List finalFields, List baseFields, String fieldPrefix) { + for (int i = 0; i < baseFields.size(); i++) { + if (baseFields.get(i) == Schema.Type.BOOLEAN) { + // boolean fields are added fields + finalFields.add(new Schema.Field(fieldPrefix + i, AvroSchemaUtils.createNullableSchema(Schema.Type.BOOLEAN), "", null)); + } else { + finalFields.add(new Schema.Field(fieldPrefix + i, Schema.create(baseFields.get(i)), "", null)); + } + } + } + + private void generateCustomValues(GenericRecord rec, String customPrefix) { + for (Schema.Field field : rec.getSchema().getFields()) { + if (field.name().startsWith(customPrefix)) { + switch (field.schema().getType()) { + case INT: + rec.put(field.name(), 
rand.nextInt()); + break; + case LONG: + rec.put(field.name(), rand.nextLong()); + break; + case FLOAT: + rec.put(field.name(), rand.nextFloat()); + break; + case DOUBLE: + rec.put(field.name(), rand.nextDouble()); + break; + case STRING: + rec.put(field.name(), genPseudoRandomUUID(rand).toString()); + break; + case BYTES: + rec.put(field.name(), ByteBuffer.wrap(getUTF8Bytes(genPseudoRandomUUID(rand).toString()))); + break; + case UNION: + if (!AvroSchemaUtils.getNonNullTypeFromUnion(field.schema()).getType().equals(Schema.Type.BOOLEAN)) { + throw new IllegalStateException("Union should only be boolean"); + } + rec.put(field.name(), rand.nextBoolean()); + break; + case BOOLEAN: + rec.put(field.name(), rand.nextBoolean()); + break; + case MAP: + rec.put(field.name(), genMap(field.schema(), field.name())); + break; + case ARRAY: + rec.put(field.name(), genArray(field.schema(), field.name())); + break; + default: + throw new UnsupportedOperationException("Unsupported type: " + field.schema().getType()); + } + } + } + } + + private GenericArray genArray(Schema arraySchema, String customPrefix) { + GenericArray customArray = new GenericData.Array<>(1, arraySchema); + Schema arrayElementSchema = arraySchema.getElementType(); + GenericRecord customRecord = new GenericData.Record(arrayElementSchema); + generateCustomValues(customRecord, customPrefix); + customArray.add(customRecord); + return customArray; + } + + private Map genMap(Schema mapSchema, String customPrefix) { + Schema mapElementSchema = mapSchema.getValueType(); + GenericRecord customRecord = new GenericData.Record(mapElementSchema); + generateCustomValues(customRecord, customPrefix); + return Collections.singletonMap("customMapKey", customRecord); + } + + public static List recordsToStrings(List records) { + return records.stream().map(HoodieTestDataGenerator::recordToString).filter(Option::isPresent).map(Option::get) + .collect(Collectors.toList()); + } + + public static Option recordToString(HoodieRecord 
record) { + try { + String str = ((GenericRecord) record.getData()).toString(); + // Remove the last } bracket + str = str.substring(0, str.length() - 1); + return Option.of(str + ", \"partition\": \"" + record.getPartitionPath() + "\"}"); + } catch (Exception e) { + return Option.empty(); + } + } + + public static List deleteRecordsToStrings(List records) { + return records.stream().map(record -> "{\"_row_key\": \"" + record.getRecordKey() + "\",\"partition\": \"" + record.getPartitionPath() + "\"}") + .collect(Collectors.toList()); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index 7100ab9af3419..91b8e33548f46 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -35,16 +35,23 @@ import com.esotericsoftware.kryo.serializers.JavaSerializer; import org.apache.hadoop.conf.Configuration; +import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.Serializable; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Properties; import java.util.UUID; import org.junit.jupiter.api.Assumptions; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; /** * A utility class for testing. 
@@ -78,7 +85,7 @@ public static HoodieTableMetaClient init(String basePath, HoodieTableType tableT } public static HoodieTableMetaClient init(String basePath, HoodieTableType tableType, String bootstrapBasePath, boolean bootstrapIndexEnable, String keyGenerator, - String partitionFieldConfigValue) throws IOException { + String partitionFieldConfigValue) throws IOException { Properties props = new Properties(); props.setProperty(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key(), bootstrapBasePath); props.put(HoodieTableConfig.BOOTSTRAP_INDEX_ENABLE.key(), bootstrapIndexEnable); @@ -255,4 +262,47 @@ public static DistributedFileSystem useExternalHdfs() throws IOException { conf.set("dfs.replication", "3"); return (DistributedFileSystem) DistributedFileSystem.get(conf); } + + /** + * Extracts a ZIP file from resources to a target directory. + * + * @param resourcePath the path to the ZIP resource (relative to classpath) + * @param targetDirectory the target directory to extract files to + * @param resourceClass the class to use for resource loading + * @throws IOException if extraction fails + */ + public static void extractZipToDirectory(String resourcePath, Path targetDirectory, Class resourceClass) throws IOException { + InputStream resourceStream = resourceClass.getClassLoader().getResourceAsStream(resourcePath); + if (resourceStream == null) { + // Fallback to getResourceAsStream if getClassLoader().getResourceAsStream() fails + resourceStream = resourceClass.getResourceAsStream(resourcePath); + } + + if (resourceStream == null) { + throw new IOException("Resource not found at: " + resourcePath); + } + + try (ZipInputStream zip = new ZipInputStream(resourceStream)) { + ZipEntry entry; + while ((entry = zip.getNextEntry()) != null) { + File file = targetDirectory.resolve(entry.getName()).toFile(); + if (entry.isDirectory()) { + file.mkdirs(); + continue; + } + + // Create parent directories if they don't exist + file.getParentFile().mkdirs(); + + // Extract file content 
+ byte[] buffer = new byte[10000]; + try (BufferedOutputStream out = new BufferedOutputStream(Files.newOutputStream(file.toPath()))) { + int count; + while ((count = zip.read(buffer)) != -1) { + out.write(buffer, 0, count); + } + } + } + } + } } diff --git a/hudi-hadoop-common/pom.xml b/hudi-hadoop-common/pom.xml new file mode 100644 index 0000000000000..a1bf50f5dadbb --- /dev/null +++ b/hudi-hadoop-common/pom.xml @@ -0,0 +1,169 @@ + + + + + hudi + org.apache.hudi + 0.14.2-SNAPSHOT + + 4.0.0 + + hudi-hadoop-common + + + ${project.parent.basedir} + + + + + + src/main/resources + + + + + + org.apache.maven.plugins + maven-jar-plugin + ${maven-jar-plugin.version} + + + + test-jar + + test-compile + + + + false + + + + org.apache.rat + apache-rat-plugin + + + org.jacoco + jacoco-maven-plugin + + + org.codehaus.mojo + build-helper-maven-plugin + 3.5.0 + + + add-spark34plus-avro-sources + generate-sources + + add-source + + + ${spark33orEarlier} + + src/avro/java + src/parquet/java + + + + + add-spark34plus-parquet-test-sources + generate-test-sources + + add-test-source + + + ${spark33orEarlier} + + src/parquet/test/java + + + + + + + + + + + org.apache.hudi + hudi-common + ${project.version} + + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + provided + + + org.apache.hadoop + hadoop-hdfs + provided + + + + org.apache.parquet + parquet-avro + + + + + com.esotericsoftware + kryo-shaded + + + + org.apache.hudi + hudi-tests-common + ${project.version} + test + + + + org.apache.hudi + hudi-io + tests + ${project.version} + test + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + com.github.stefanbirkner + system-rules + 1.17.2 + test + + + diff --git a/hudi-hadoop-common/src/test/resources/parquet-java/all.avsc b/hudi-hadoop-common/src/test/resources/parquet-java/all.avsc new file mode 100644 index 0000000000000..116e98c519de4 --- /dev/null +++ b/hudi-hadoop-common/src/test/resources/parquet-java/all.avsc @@ -0,0 
+1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "name" : "myrecord", + "namespace": "org.apache.parquet.avro", + "type" : "record", + "fields" : [ { + "name" : "mynull", + "type" : "null" + }, { + "name" : "myboolean", + "type" : "boolean" + }, { + "name" : "myint", + "type" : "int" + }, { + "name" : "mylong", + "type" : "long" + }, { + "name" : "myfloat", + "type" : "float" + }, { + "name" : "mydouble", + "type" : "double" + }, { + "name" : "mybytes", + "type" : "bytes" + }, { + "name" : "mystring", + "type" : "string" + }, { + "name" : "mynestedrecord", + "type" : { + "type" : "record", + "name" : "ignored1", + "namespace" : "", + "fields" : [ { + "name" : "mynestedint", + "type" : "int" + } ] + } + }, { + "name" : "myenum", + "type" : { + "type" : "enum", + "name" : "ignored2", + "namespace" : "", + "symbols" : [ "a", "b" ] + } + }, { + "name" : "myarray", + "type" : { + "type" : "array", + "items" : "int" + } + }, { + "name" : "myemptyarray", + "type" : { + "type" : "array", + "items" : "int" + } + }, { + "name" : "myoptionalarray", + "type" : [ "null", { + "type" : "array", + "items" : "int" + }] + }, { + "name" : "myarrayofoptional", + "type" : { + "type" : "array", + "items" : [ "null", 
"int" ] + } + }, { + "name" : "mymap", + "type" : { + "type" : "map", + "values" : "int" + } + }, { + "name" : "myemptymap", + "type" : { + "type" : "map", + "values" : "int" + } + }, { + "name" : "myfixed", + "type" : { + "type" : "fixed", + "name" : "ignored3", + "namespace" : "", + "size" : 1 + } + } ] +} \ No newline at end of file diff --git a/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetNewBehavior.avsc b/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetNewBehavior.avsc new file mode 100644 index 0000000000000..606213cb16830 --- /dev/null +++ b/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetNewBehavior.avsc @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "name" : "myrecord", + "namespace": "org.apache.parquet.avro", + "type" : "record", + "fields" : [ { + "name" : "myboolean", + "type" : "boolean" + }, { + "name" : "myint", + "type" : "int" + }, { + "name" : "mylong", + "type" : "long" + }, { + "name" : "myfloat", + "type" : "float" + }, { + "name" : "mydouble", + "type" : "double" + }, { + "name" : "mybytes", + "type" : "bytes" + }, { + "name" : "mystring", + "type" : "string" + }, { + "name" : "mynestedrecord", + "type" : { + "type" : "record", + "name" : "mynestedrecord", + "namespace" : "", + "fields" : [ { + "name" : "mynestedint", + "type" : "int" + } ] + } + }, { + "name" : "myenum", + "type" : "string" + }, { + "name" : "myarray", + "type" : { + "type" : "array", + "items" : "int" + } + }, { + "name" : "myoptionalarray", + "type" : [ "null", { + "type" : "array", + "items" : "int" + }], + "default" : null + }, { + "name" : "myarrayofoptional", + "type" : { + "type" : "array", + "items" : ["null", "int"] + } + }, { + "name" : "myrecordarray", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "array", + "namespace" : "", + "fields" : [ { + "name" : "a", + "type" : "int" + }, { + "name" : "b", + "type" : "int" + } ] + } + } + }, { + "name" : "mymap", + "type" : { + "type" : "map", + "values" : "int" + } + }, { + "name" : "myfixed", + "type" : { + "type" : "fixed", + "name" : "myfixed", + "namespace" : "", + "size" : 1 + } + } ] +} \ No newline at end of file diff --git a/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetOldBehavior.avsc b/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetOldBehavior.avsc new file mode 100644 index 0000000000000..7a98a74633559 --- /dev/null +++ b/hudi-hadoop-common/src/test/resources/parquet-java/allFromParquetOldBehavior.avsc @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "name" : "myrecord", + "namespace": "org.apache.parquet.avro", + "type" : "record", + "fields" : [ { + "name" : "myboolean", + "type" : "boolean" + }, { + "name" : "myint", + "type" : "int" + }, { + "name" : "mylong", + "type" : "long" + }, { + "name" : "myfloat", + "type" : "float" + }, { + "name" : "mydouble", + "type" : "double" + }, { + "name" : "mybytes", + "type" : "bytes" + }, { + "name" : "mystring", + "type" : "string" + }, { + "name" : "mynestedrecord", + "type" : { + "type" : "record", + "name" : "mynestedrecord", + "namespace" : "", + "fields" : [ { + "name" : "mynestedint", + "type" : "int" + } ] + } + }, { + "name" : "myenum", + "type" : "string" + }, { + "name" : "myarray", + "type" : { + "type" : "array", + "items" : "int" + } + }, { + "name" : "myoptionalarray", + "type" : [ "null", { + "type" : "array", + "items" : "int" + }], + "default" : null + }, { + "name" : "myarrayofoptional", + "type" : { + "type" : "array", + "items" : { + "type": "record", + "name": "list", + "namespace": "", + "fields": [ { + "name": "element", + "type": ["null", "int"], + "default": null + } ] + } + } + }, { + "name" : "myrecordarray", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "array", + "namespace" : "", + "fields" : [ { + "name" : 
"a", + "type" : "int" + }, { + "name" : "b", + "type" : "int" + } ] + } + } + }, { + "name" : "mymap", + "type" : { + "type" : "map", + "values" : "int" + } + }, { + "name" : "myfixed", + "type" : { + "type" : "fixed", + "name" : "myfixed", + "namespace" : "", + "size" : 1 + } + } ] +} \ No newline at end of file diff --git a/hudi-hadoop-common/src/test/resources/parquet-java/fixedToInt96.avsc b/hudi-hadoop-common/src/test/resources/parquet-java/fixedToInt96.avsc new file mode 100644 index 0000000000000..ca1e505ec3380 --- /dev/null +++ b/hudi-hadoop-common/src/test/resources/parquet-java/fixedToInt96.avsc @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "name": "fixedToInt96", + "namespace": "org.apache.parquet.avro", + "type": "record", + "fields": [ + { + "name": "int96", + "type": { + "type": "fixed", + "name": "ignored1", + "namespace": "", + "size": 12 + } + }, + { + "name": "notanint96", + "type": { + "type": "fixed", + "name": "ignored2", + "namespace": "", + "size": 12 + } + }, + { + "name": "mynestedrecord", + "type": { + "type": "record", + "name": "ignored3", + "namespace": "", + "fields": [ + { + "name": "int96inrecord", + "type": { + "type": "fixed", + "name": "ignored4", + "namespace": "", + "size": 12 + } + }, + { + "name": "myarrayofoptional", + "type": { + "type": "array", + "items": [ + "null", + { + "type": "fixed", + "name": "ignored5", + "namespace": "", + "size": 12 + } + ] + } + }, + { + "name": "mymap", + "type": { + "type": "map", + "values": { + "type": "fixed", + "name": "ignored6", + "namespace": "", + "size": 12 + } + } + } + ] + } + }, + { + "name": "onebytefixed", + "type": { + "type": "fixed", + "name": "ignored7", + "namespace": "", + "size": 1 + } + } + ] +} \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 9e6565299040b..fc9290a5c3742 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -19,6 +19,7 @@ package org.apache.hudi.hadoop; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hudi.exception.HoodieException; @@ -126,7 +127,7 @@ private RecordReader getRecordReaderInternal(InputS Reporter reporter) throws IOException { try { if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) { - return new 
ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), split, job, reporter); + return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(Option.empty(), Option.empty()), split, job, reporter); } else { return super.getRecordReader(split, job, reporter); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java index 746066e1c1c74..b8d0241881916 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/SchemaEvolutionContext.java @@ -252,6 +252,9 @@ private TypeInfo constructHiveSchemaFromType(Type type, TypeInfo typeInfo) { case DOUBLE: case DATE: case TIMESTAMP: + case TIMESTAMP_MILLIS: + case LOCAL_TIMESTAMP_MICROS: + case LOCAL_TIMESTAMP_MILLIS: case STRING: case UUID: case FIXED: @@ -259,6 +262,7 @@ private TypeInfo constructHiveSchemaFromType(Type type, TypeInfo typeInfo) { case DECIMAL: return typeInfo; case TIME: + case TIME_MILLIS: throw new UnsupportedOperationException(String.format("cannot convert %s type to hive", type)); default: LOG.error(String.format("cannot convert unknown type: %s to Hive", type)); diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieAvroParquetReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieAvroParquetReader.java index c31041ddc76b0..9aa5887b9cbcc 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieAvroParquetReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieAvroParquetReader.java @@ -19,8 +19,12 @@ package org.apache.hudi.hadoop.avro; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.util.Option; import org.apache.hudi.hadoop.HoodieColumnProjectionUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import 
org.apache.hudi.internal.schema.InternalSchema; +import org.apache.hudi.internal.schema.action.InternalSchemaMerger; +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; @@ -31,7 +35,6 @@ import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.parquet.avro.AvroReadSupport; -import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetInputSplit; @@ -43,6 +46,7 @@ import java.util.Arrays; import java.util.List; +import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter; import static org.apache.parquet.hadoop.ParquetInputFormat.getFilter; public class HoodieAvroParquetReader extends RecordReader { @@ -55,7 +59,7 @@ public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf) throws ParquetMetadata fileFooter = ParquetFileReader.readFooter(conf, ((ParquetInputSplit) inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER); MessageType messageType = fileFooter.getFileMetaData().getSchema(); - baseSchema = new AvroSchemaConverter(conf).convert(messageType); + baseSchema = getAvroSchemaConverter(conf).convert(messageType); // if exists read columns, we need to filter columns. 
List readColNames = Arrays.asList(HoodieColumnProjectionUtils.getReadColumnNames(conf)); @@ -64,7 +68,37 @@ public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf) throws AvroReadSupport.setAvroReadSchema(conf, filterSchema); AvroReadSupport.setRequestedProjection(conf, filterSchema); } + parquetRecordReader = new ParquetRecordReader<>(new AvroReadSupport<>(), getFilter(conf)); + } + public HoodieAvroParquetReader(InputSplit inputSplit, Configuration conf, Option internalSchemaOption, Option dataSchema) throws IOException { + if (dataSchema.isPresent()) { + baseSchema = dataSchema.get(); + } else { + // get base schema + ParquetMetadata fileFooter = + ParquetFileReader.readFooter(conf, ((ParquetInputSplit) inputSplit).getPath(), ParquetMetadataConverter.NO_FILTER); + MessageType messageType = fileFooter.getFileMetaData().getSchema(); + baseSchema = getAvroSchemaConverter(conf).convert(messageType); + + if (internalSchemaOption.isPresent()) { + // do schema reconciliation in case there exists read column which is not in the file schema. + InternalSchema mergedInternalSchema = new InternalSchemaMerger( + AvroInternalSchemaConverter.convert(baseSchema), + internalSchemaOption.get(), + true, + true).mergeSchema(); + baseSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema, baseSchema.getFullName()); + } + + // if exists read columns, we need to filter columns. 
+ List readColNames = Arrays.asList(HoodieColumnProjectionUtils.getReadColumnNames(conf)); + if (!readColNames.isEmpty()) { + Schema filterSchema = HoodieAvroUtils.generateProjectionSchema(baseSchema, readColNames); + AvroReadSupport.setAvroReadSchema(conf, filterSchema); + AvroReadSupport.setRequestedProjection(conf, filterSchema); + } + } parquetRecordReader = new ParquetRecordReader<>(new AvroReadSupport<>(), getFilter(conf)); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieTimestampAwareParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieTimestampAwareParquetInputFormat.java index 8f9aae530e412..4c55fe26551f7 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieTimestampAwareParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/avro/HoodieTimestampAwareParquetInputFormat.java @@ -18,6 +18,10 @@ package org.apache.hudi.hadoop.avro; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.internal.schema.InternalSchema; + +import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.mapreduce.InputSplit; @@ -33,6 +37,14 @@ * we need to handle timestamp types separately based on the parquet-avro approach. 
*/ public class HoodieTimestampAwareParquetInputFormat extends ParquetInputFormat { + private final Option internalSchemaOption; + private final Option dataSchema; + + public HoodieTimestampAwareParquetInputFormat(Option internalSchemaOption, Option dataSchema) { + super(); + this.internalSchemaOption = internalSchemaOption; + this.dataSchema = dataSchema; + } @Override public RecordReader createRecordReader( diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java index a0d1b086e0357..7c11467fd30c3 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HiveAvroSerializer.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector; import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils; import org.apache.hadoop.hive.serde2.avro.InstanceCache; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -61,7 +62,7 @@ import java.util.Set; import java.util.stream.Collectors; -import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.getNonNullTypeFromUnion; import static org.apache.hudi.avro.AvroSchemaUtils.resolveUnionSchema; import static org.apache.hudi.avro.HoodieAvroUtils.isMetadataField; @@ -76,7 +77,7 @@ public class HiveAvroSerializer { private static final Logger LOG = LoggerFactory.getLogger(HiveAvroSerializer.class); - public HiveAvroSerializer(ObjectInspector objectInspector, List columnNames, List columnTypes) { + public HiveAvroSerializer(ArrayWritableObjectInspector objectInspector, List columnNames, List columnTypes) { this.columnNames = 
columnNames; this.columnTypes = columnTypes; this.objectInspector = objectInspector; @@ -197,9 +198,8 @@ private Object serialize(TypeInfo typeInfo, ObjectInspector fieldOI, Object stru return null; } - if (isNullableType(schema)) { - schema = getOtherTypeFromNullableType(schema); - } + schema = getNonNullTypeFromUnion(schema); + /* Because we use Hive's 'string' type when Avro calls for enum, we have to expressly check for enum-ness */ if (Schema.Type.ENUM.equals(schema.getType())) { assert fieldOI instanceof PrimitiveObjectInspector; @@ -336,7 +336,7 @@ private Object serializeList(ListTypeInfo typeInfo, ListObjectInspector fieldOI, ObjectInspector listElementObjectInspector = fieldOI.getListElementObjectInspector(); // NOTE: We have to resolve nullable schema, since Avro permits array elements // to be null - Schema arrayNestedType = resolveNullableSchema(schema.getElementType()); + Schema arrayNestedType = getNonNullTypeFromUnion(schema.getElementType()); Schema elementType; if (listElementObjectInspector.getCategory() == ObjectInspector.Category.PRIMITIVE) { elementType = arrayNestedType; diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala index dd76aee2f187b..5fc0d0d6a6945 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala @@ -35,13 +35,14 @@ import org.apache.hudi.common.util.hash.ColumnIndexID import org.apache.hudi.data.HoodieJavaRDD import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType} import org.apache.hudi.util.JFunction +import org.apache.hudi.common.function.SerializableFunction import org.apache.spark.api.java.JavaSparkContext import 
org.apache.spark.sql.HoodieUnsafeUtils.{createDataFrameFromInternalRows, createDataFrameFromRDD, createDataFrameFromRows} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, HoodieUnsafeUtils, Row, SparkSession} import org.apache.spark.storage.StorageLevel import java.nio.ByteBuffer @@ -145,6 +146,62 @@ class ColumnStatsIndexSupport(spark: SparkSession, } } + /** + * Loads view of the Column Stats Index in a transposed format where single row coalesces every columns' + * statistics for a single file, returning it as [[DataFrame]] + * + * Please check out scala-doc of the [[transpose]] method explaining this view in more details + */ + def loadTransposed[T](targetColumns: Seq[String], + shouldReadInMemory: Boolean, + prunedPartitions: Option[Set[String]] = None, + prunedFileNamesOpt: Option[Set[String]] = None)(block: DataFrame => T): T = { + cachedColumnStatsIndexViews.get(targetColumns) match { + case Some(cachedDF) => + block(cachedDF) + case None => + val colStatsRecords: HoodieData[HoodieMetadataColumnStats] = prunedFileNamesOpt match { + case Some(prunedFileNames) => + val filterFunction = new SerializableFunction[HoodieMetadataColumnStats, java.lang.Boolean] { + override def apply(r: HoodieMetadataColumnStats): java.lang.Boolean = { + prunedFileNames.contains(r.getFileName) + } + } + loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory).filter(filterFunction) + case None => + loadColumnStatsIndexRecords(targetColumns, shouldReadInMemory) + } + + withPersistedData(colStatsRecords, StorageLevel.MEMORY_ONLY) { + val (transposedRows, indexSchema) = transpose(colStatsRecords, targetColumns) + val df = if (shouldReadInMemory) { + // NOTE: This will instantiate a [[Dataset]] backed by [[LocalRelation]] holding all of the rows + 
// of the transposed table in memory, facilitating execution of the subsequently chained operations + // on it locally (on the driver; all such operations are actually going to be performed by Spark's + // Optimizer) + HoodieUnsafeUtils.createDataFrameFromRows(spark, transposedRows.collectAsList().asScala.toSeq, indexSchema) + } else { + val rdd = HoodieJavaRDD.getJavaRDD(transposedRows) + spark.createDataFrame(rdd, indexSchema) + } + + if (allowCaching) { + cachedColumnStatsIndexViews.put(targetColumns, df) + // NOTE: Instead of collecting the rows from the index and hold them in memory, we instead rely + // on Spark as (potentially distributed) cache managing data lifecycle, while we simply keep + // the referenced to persisted [[DataFrame]] instance + df.persist(StorageLevel.MEMORY_ONLY) + + block(df) + } else { + withPersistedDataset(df) { + block(df) + } + } + } + } + } + /** * Loads a view of the Column Stats Index in a raw format, returning it as [[DataFrame]] * @@ -334,6 +391,7 @@ class ColumnStatsIndexSupport(spark: SparkSession, columnStatsRecords } + private def loadFullColumnStatsIndexInternal(): DataFrame = { val metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(metaClient.getBasePathV2.toString) // Read Metadata Table's Column Stats Index into Spark's [[DataFrame]] diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala index 0098ee54c2bc9..0d0b595730a2b 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -243,7 +243,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, case HoodieFileFormat.PARQUET => // We're delegating to Spark to append partition values to every row only in cases // when these corresponding 
partition-values are not persisted w/in the data file itself - val parquetFileFormat = sparkAdapter.createLegacyHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get + val parquetFileFormat = sparkAdapter.createLegacyHoodieParquetFileFormat( + shouldExtractPartitionValuesFromPartitionPath, tableAvroSchema).get (parquetFileFormat, LegacyHoodieParquetFileFormat.FILE_FORMAT_ID) } @@ -548,7 +549,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, hadoopConf = hadoopConf, // We're delegating to Spark to append partition values to every row only in cases // when these corresponding partition-values are not persisted w/in the data file itself - appendPartitionValues = shouldAppendPartitionValuesOverride.getOrElse(shouldExtractPartitionValuesFromPartitionPath) + appendPartitionValues = shouldAppendPartitionValuesOverride.getOrElse(shouldExtractPartitionValuesFromPartitionPath), + tableAvroSchema ) // Since partition values by default are omitted, and not persisted w/in data-files by Spark, // data-file readers (such as [[ParquetFileFormat]]) have to inject partition values while reading diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala index 4add21b5b8da4..9c48f40b00ca1 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala @@ -51,8 +51,9 @@ object HoodieDataSourceHelper extends PredicateHelper with SparkAdapterSupport { filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration, - appendPartitionValues: Boolean = false): PartitionedFile => Iterator[InternalRow] = { - val parquetFileFormat: ParquetFileFormat = sparkAdapter.createLegacyHoodieParquetFileFormat(appendPartitionValues).get + 
appendPartitionValues: Boolean = false, + avroTableSchema: Schema): PartitionedFile => Iterator[InternalRow] = { + val parquetFileFormat: ParquetFileFormat = sparkAdapter.createLegacyHoodieParquetFileFormat(appendPartitionValues, avroTableSchema).get val readParquetFile: PartitionedFile => Iterator[Any] = parquetFileFormat.buildReaderWithPartitionValues( sparkSession = sparkSession, dataSchema = dataSchema, diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index f60263b3344e0..124291f1bef0d 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -20,6 +20,7 @@ package org.apache.hudi import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hudi.HoodieFileIndex.{DataSkippingFailureMode, collectReferencedColumns, convertFilterForTimestampKeyGenerator, getConfigProperties} import org.apache.hudi.HoodieSparkConfUtils.getConfigValue +import org.apache.hudi.avro.AvroSchemaUtils import org.apache.hudi.common.config.TimestampKeyGeneratorConfig.{TIMESTAMP_INPUT_DATE_FORMAT, TIMESTAMP_OUTPUT_DATE_FORMAT} import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties} import org.apache.hudi.common.model.{FileSlice, HoodieBaseFile, HoodieLogFile} @@ -30,6 +31,7 @@ import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKey import org.apache.hudi.metadata.HoodieMetadataPayload import org.apache.hudi.util.JFunction import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal} import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} @@ 
-37,15 +39,14 @@ import org.apache.spark.sql.hudi.DataSkippingUtils.translateIntoColumnStatsIndex import org.apache.spark.sql.hudi.HoodieSqlCommonUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.unsafe.types.UTF8String import java.text.SimpleDateFormat import java.util.stream.Collectors import javax.annotation.concurrent.NotThreadSafe import scala.collection.JavaConverters._ -import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} +import scala.util.control.NonFatal /** * A file index which support partition prune for hoodie snapshot and read-optimized query. @@ -351,10 +352,34 @@ case class HoodieFileIndex(spark: SparkSession, // threshold (of 100k records) val shouldReadInMemory = columnStatsIndex.shouldReadInMemory(this, queryReferencedColumns) + // Identify timestamp-millis columns from the Avro schema to skip from filter translation + // (even if they're in the index, they may have been indexed before the fix and should not be used for filtering) + val timestampMillisColumns = scala.collection.mutable.Set[String]() + try { + val avroSchema = rawAvroSchema + if (avroSchema.getType == org.apache.avro.Schema.Type.RECORD) { + avroSchema.getFields.asScala.foreach { field => + val fieldSchema = AvroSchemaUtils.getNonNullTypeFromUnion(field.schema()) + if (fieldSchema.getType == org.apache.avro.Schema.Type.LONG) { + val logicalType = fieldSchema.getLogicalType + if (logicalType != null) { + val logicalTypeName = logicalType.getName + if (logicalTypeName == "timestamp-millis" || logicalTypeName == "local-timestamp-millis") { + timestampMillisColumns.add(field.name()) + } + } + } + } + } + } catch { + case e: Exception => + logDebug(s"Failed to identify timestamp-millis columns from Avro schema: ${e.getMessage}") + } + columnStatsIndex.loadTransposed(queryReferencedColumns, shouldReadInMemory) { transposedColStatsDF => val indexSchema = 
transposedColStatsDF.schema val indexFilter = - queryFilters.map(translateIntoColumnStatsIndexFilterExpr(_, indexSchema)) + queryFilters.map(translateIntoColumnStatsIndexFilterExpr(_, indexSchema, timestampMillisColumns.toSet)) .reduce(And) val allIndexedFileNames = diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 41e8ba902a7e8..b3e0e2785050f 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -29,7 +29,7 @@ import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieConversionUtils.{toProperties, toScalaOption} import org.apache.hudi.HoodieSparkSqlWriter.StreamingWriteParams import org.apache.hudi.HoodieWriterUtils._ -import org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema +import org.apache.hudi.avro.AvroSchemaUtils.getNonNullTypeFromUnion import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.client.embedded.EmbeddedTimelineService @@ -892,7 +892,7 @@ class HoodieSparkSqlWriterInternal { def validateSchemaForHoodieIsDeleted(schema: Schema): Unit = { if (schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD) != null && - resolveNullableSchema(schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD).schema()).getType != Schema.Type.BOOLEAN) { + getNonNullTypeFromUnion(schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD).schema()).getType != Schema.Type.BOOLEAN) { throw new HoodieException(HoodieRecord.HOODIE_IS_DELETED_FIELD + " has to be BOOLEAN type. 
Passed in dataframe's schema has type " + schema.getField(HoodieRecord.HOODIE_IS_DELETED_FIELD).schema().getType) } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala index 53385bbe2b9ce..ee23fe85c7632 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala @@ -107,7 +107,7 @@ class IncrementalRelation(val sqlContext: SQLContext, // use schema from a file produced in the end/latest instant - val (usedSchema, internalSchema) = { + val (usedSchema, internalSchema, tableAvroSchema) = { log.info("Inferring schema..") val schemaResolver = new TableSchemaResolver(metaClient) val iSchema : InternalSchema = if (!isSchemaEvolutionEnabledOnRead(optParams, sqlContext.sparkSession)) { @@ -126,14 +126,14 @@ class IncrementalRelation(val sqlContext: SQLContext, } if (tableSchema.getType == Schema.Type.NULL) { // if there is only one commit in the table and is an empty commit without schema, return empty RDD here - (StructType(Nil), InternalSchema.getEmptyInternalSchema) + (StructType(Nil), InternalSchema.getEmptyInternalSchema, tableSchema) } else { val dataSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableSchema) if (iSchema != null && !iSchema.isEmptySchema) { // if internalSchema is ready, dataSchema will contains skeletonSchema - (dataSchema, iSchema) + (dataSchema, iSchema, tableSchema) } else { - (StructType(skeletonSchema.fields ++ dataSchema.fields), InternalSchema.getEmptyInternalSchema) + (StructType(skeletonSchema.fields ++ dataSchema.fields), InternalSchema.getEmptyInternalSchema, tableSchema) } } } @@ -203,6 +203,11 @@ class IncrementalRelation(val sqlContext: SQLContext, 
sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema)) sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, metaClient.getBasePath) sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) + // Pass table Avro schema to LegacyHoodieParquetFileFormat to preserve correct logical types + if (tableAvroSchema.getType != Schema.Type.NULL) { + LegacyHoodieParquetFileFormat.setTableAvroSchemaInConf( + sqlContext.sparkContext.hadoopConfiguration, tableAvroSchema) + } val formatClassName = metaClient.getTableConfig.getBaseFileFormat match { case HoodieFileFormat.PARQUET => LegacyHoodieParquetFileFormat.FILE_FORMAT_ID case HoodieFileFormat.ORC => "orc" diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala index 166579c867328..08559d0de6514 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala @@ -23,7 +23,7 @@ import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.HoodieConversionUtils.toJavaOption import org.apache.hudi.SparkHoodieTableFileIndex._ import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.common.config.TypedProperties +import org.apache.hudi.common.config.{TimestampKeyGeneratorConfig, TypedProperties} import org.apache.hudi.common.model.HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION import org.apache.hudi.common.model.{FileSlice, HoodieTableQueryType} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} @@ -33,6 +33,8 @@ import org.apache.hudi.internal.schema.Types.RecordType 
import org.apache.hudi.internal.schema.utils.Conversions import org.apache.hudi.keygen.{StringPartitionPathFormatter, TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} import org.apache.hudi.util.JFunction + +import org.apache.avro.Schema import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -41,10 +43,12 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.execution.datasources.{FileStatusCache, NoopCache} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.{ByteType, DateType, IntegerType, LongType, ShortType, StringType, StructField, StructType} +import org.apache.spark.unsafe.types.UTF8String import java.util.Collections import javax.annotation.concurrent.NotThreadSafe + import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.util.{Success, Try} @@ -90,15 +94,19 @@ class SparkHoodieTableFileIndex(spark: SparkSession, * Get the schema of the table. 
*/ lazy val schema: StructType = if (shouldFastBootstrap) { - StructType(rawSchema.fields.filterNot(f => HOODIE_META_COLUMNS_WITH_OPERATION.contains(f.name))) + StructType(rawStructSchema.fields.filterNot(f => HOODIE_META_COLUMNS_WITH_OPERATION.contains(f.name))) } else { - rawSchema + rawStructSchema } - private lazy val rawSchema: StructType = schemaSpec.getOrElse({ - val schemaUtil = new TableSchemaResolver(metaClient) - AvroConversionUtils.convertAvroSchemaToStructType(schemaUtil.getTableAvroSchema) - }) + lazy val rawAvroSchema: Schema = { + val schemaUtil = new TableSchemaResolver(metaClient) + schemaUtil.getTableAvroSchema + } + + private lazy val rawStructSchema: StructType = schemaSpec.getOrElse { + AvroConversionUtils.convertAvroSchemaToStructType(rawAvroSchema) + } protected lazy val shouldFastBootstrap = configProperties.getBoolean(DATA_QUERIES_ONLY.key, false) @@ -396,9 +404,21 @@ class SparkHoodieTableFileIndex(spark: SparkSession, } protected def doParsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String): Array[Object] = { - HoodieSparkUtils.parsePartitionColumnValues(partitionColumns, partitionPath, getBasePath, schema, - configProperties.getString(DateTimeUtils.TIMEZONE_OPTION, SQLConf.get.sessionLocalTimeZone), - sparkParsePartitionUtil, shouldValidatePartitionColumns(spark)) + val tableConfig = metaClient.getTableConfig + if (null != tableConfig.getKeyGeneratorClassName + && tableConfig.getKeyGeneratorClassName.equals(classOf[org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator].getName) + && tableConfig.propsMap.get(TimestampKeyGeneratorConfig.TIMESTAMP_TYPE_FIELD.key()) + .matches("SCALAR|UNIX_TIMESTAMP|EPOCHMILLISECONDS|EPOCHMICROSECONDS")) { + // For TIMESTAMP key generator when TYPE is SCALAR, UNIX_TIMESTAMP, + // EPOCHMILLISECONDS, or EPOCHMICROSECONDS, + // we couldn't reconstruct initial partition column values from partition paths due to lost data after formatting in most cases. 
+ // But the output for these cases is in a string format, so we can pass partitionPath as UTF8String + Array.fill(partitionColumns.length)(UTF8String.fromString(partitionPath)) + } else { + HoodieSparkUtils.parsePartitionColumnValues(partitionColumns, partitionPath, getBasePath, schema, + configProperties.getString(DateTimeUtils.TIMEZONE_OPTION, SQLConf.get.sessionLocalTimeZone), + sparkParsePartitionUtil, shouldValidatePartitionColumns(spark)) + } } private def arePartitionPathsUrlEncoded: Boolean = diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/cdc/CDCRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/cdc/CDCRelation.scala index afccd43ca3ea2..b27c03deb1e10 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/cdc/CDCRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/cdc/CDCRelation.scala @@ -99,7 +99,8 @@ class CDCRelation( requiredSchema = tableStructSchema, filters = Nil, options = options, - hadoopConf = spark.sessionState.newHadoopConf() + hadoopConf = spark.sessionState.newHadoopConf(), + avroTableSchema = tableAvroSchema ) val changes = cdcExtractor.extractCDCFileSplits().values().asScala.map { splits => diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala index b4e09f6d1f656..0edb6fda98526 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -19,12 +19,13 @@ package org.apache.spark.sql.avro import org.apache.avro.LogicalTypes.{Date, Decimal, TimestampMicros, TimestampMillis} import org.apache.avro.Schema.Type._ -import org.apache.avro.{LogicalTypes, Schema, 
SchemaBuilder} +import org.apache.avro.{LogicalType, LogicalTypes, Schema, SchemaBuilder} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.types.Decimal.minBytesForPrecision import org.apache.spark.sql.types._ import scala.collection.JavaConverters._ +import scala.util.Try /** * This object contains method that are used to convert sparkSQL schemas to avro schemas and vice @@ -40,6 +41,73 @@ import scala.collection.JavaConverters._ private[sql] object SchemaConverters { private lazy val nullSchema = Schema.create(Schema.Type.NULL) + // Reflection-based checks for types that may not be available in all Avro/Spark versions + private lazy val localTimestampMillisClass: Option[Class[_]] = Try { + Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMillis") + }.toOption + + private lazy val localTimestampMicrosClass: Option[Class[_]] = Try { + Class.forName("org.apache.avro.LogicalTypes$LocalTimestampMicros") + }.toOption + + private lazy val timestampNTZTypeClass: Option[Class[_]] = Try { + Class.forName("org.apache.spark.sql.types.TimestampNTZType$") + }.toOption + + private lazy val timestampNTZTypeInstance: Option[DataType] = timestampNTZTypeClass.flatMap { clazz => + Try { + val module = clazz.getField("MODULE$").get(null) + module.asInstanceOf[DataType] + }.toOption + } + + private lazy val localTimestampMicrosMethod: Option[java.lang.reflect.Method] = Try { + classOf[LogicalTypes].getMethod("localTimestampMicros") + }.toOption + + private lazy val localTimestampMillisMethod: Option[java.lang.reflect.Method] = Try { + classOf[LogicalTypes].getMethod("localTimestampMillis") + }.toOption + + /** + * Checks if a logical type is an instance of LocalTimestampMillis using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). 
+ */ + private def isLocalTimestampMillis(logicalType: LogicalType): Boolean = { + logicalType != null && localTimestampMillisClass.exists(_.isInstance(logicalType)) + } + + /** + * Checks if a logical type is an instance of LocalTimestampMicros using reflection. + * Returns false if the class doesn't exist (e.g., in Avro 1.8.2). + */ + private def isLocalTimestampMicros(logicalType: LogicalType): Boolean = { + logicalType != null && localTimestampMicrosClass.exists(_.isInstance(logicalType)) + } + + /** + * Checks if a DataType is TimestampNTZType using reflection. + * Returns false if the class doesn't exist (e.g., in Spark 2.x or Spark 3.2). + */ + private def isTimestampNTZType(dataType: DataType): Boolean = { + timestampNTZTypeInstance.contains(dataType) || + (timestampNTZTypeClass.isDefined && timestampNTZTypeClass.get.isInstance(dataType)) + } + + /** + * Creates a LocalTimestampMicros schema using reflection. + * Throws UnsupportedOperationException if not available. + */ + private def createLocalTimestampMicrosSchema(): Schema = { + localTimestampMicrosMethod match { + case Some(method) => + val logicalType = method.invoke(null).asInstanceOf[LogicalType] + logicalType.addToSchema(Schema.create(Schema.Type.LONG)) + case None => + throw new UnsupportedOperationException("LocalTimestampMicros is not supported in this Avro version") + } + } + /** * Internal wrapper for SQL data type and nullability. 
* @@ -77,6 +145,11 @@ private[sql] object SchemaConverters { case FLOAT => SchemaType(FloatType, nullable = false) case LONG => avroSchema.getLogicalType match { case _: TimestampMillis | _: TimestampMicros => SchemaType(TimestampType, nullable = false) + case logicalType if isLocalTimestampMillis(logicalType) || isLocalTimestampMicros(logicalType) => + timestampNTZTypeInstance match { + case Some(timestampNTZ) => SchemaType(timestampNTZ, nullable = false) + case None => SchemaType(LongType, nullable = false) // Fallback for older Spark versions + } case _ => SchemaType(LongType, nullable = false) } @@ -165,6 +238,8 @@ private[sql] object SchemaConverters { LogicalTypes.date().addToSchema(builder.intType()) case TimestampType => LogicalTypes.timestampMicros().addToSchema(builder.longType()) + case dataType if isTimestampNTZType(dataType) => + createLocalTimestampMicrosSchema() case FloatType => builder.floatType() case DoubleType => builder.doubleType() diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala index 046640c11c1ba..5ecdc8ae476e5 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/LegacyHoodieParquetFileFormat.scala @@ -18,32 +18,60 @@ package org.apache.spark.sql.execution.datasources.parquet +import org.apache.avro.Schema import org.apache.hadoop.conf.Configuration -import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, SparkAdapterSupport} +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, SparkAdapterSupport} +import org.apache.spark.internal.Logging import 
org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.parquet.LegacyHoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.types.{AtomicType, StructType} +import org.apache.spark.sql.types.StructType /** * This legacy parquet file format implementation to support Hudi will be replaced by * [[NewHoodieParquetFileFormat]] in the future. */ -class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { +class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport with Logging { override def shortName(): String = FILE_FORMAT_ID override def toString: String = "Hoodie-Parquet" - override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { - if (HoodieSparkUtils.gteqSpark3_4) { - val conf = sparkSession.sessionState.conf - conf.parquetVectorizedReaderEnabled && schema.forall(_.dataType.isInstanceOf[AtomicType]) + /** + * Try to get table Avro schema from hadoopConf. + * Callers (e.g., IncrementalRelation, HoodieBaseRelation) should set the schema + * using LegacyHoodieParquetFileFormat.setTableAvroSchemaInConf() before reading. 
+ * + * @return Some(schema) if found in hadoopConf, None otherwise (falls back to StructType conversion) + */ + private def getTableAvroSchemaFromConf(hadoopConf: Configuration): Option[Schema] = { + val schemaStr = hadoopConf.get(LegacyHoodieParquetFileFormat.HOODIE_TABLE_AVRO_SCHEMA) + if (schemaStr != null && schemaStr.nonEmpty) { + try { + val schema = new Schema.Parser().parse(schemaStr) + logDebug("Using table Avro schema from hadoopConf") + Some(schema) + } catch { + case e: Exception => + logWarning(s"Failed to parse table Avro schema from hadoopConf: ${e.getMessage}") + None + } } else { - super.supportBatch(sparkSession, schema) + None + } + } + + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { + // Try to get schema from hadoopConf (set by callers like IncrementalRelation) + val avroSchema = getTableAvroSchemaFromConf(sparkSession.sessionState.newHadoopConf()).getOrElse { + // Fallback to converting StructType to Avro schema + AvroConversionUtils.convertStructTypeToAvroSchema(schema, schema.typeName) } + + sparkAdapter + .createLegacyHoodieParquetFileFormat(true, avroSchema).get.supportBatch(sparkSession, schema) } override def buildReaderWithPartitionValues(sparkSession: SparkSession, @@ -57,12 +85,36 @@ class LegacyHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterS options.getOrElse(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.defaultValue.toString).toBoolean + // Try to get schema from hadoopConf (set by callers like IncrementalRelation) + val avroSchema = getTableAvroSchemaFromConf(hadoopConf).getOrElse { + // Fallback to converting StructType to Avro schema + val fullTableSchema = StructType(dataSchema.fields ++ partitionSchema.fields) + AvroConversionUtils.convertStructTypeToAvroSchema(fullTableSchema, dataSchema.typeName) + } + sparkAdapter - 
.createLegacyHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get + .createLegacyHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath, avroSchema).get .buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf) } } object LegacyHoodieParquetFileFormat { val FILE_FORMAT_ID = "hoodie-parquet" + + /** + * Configuration key for passing table Avro schema through hadoopConf. + * If set, this schema will be used instead of fetching from storage. + */ + val HOODIE_TABLE_AVRO_SCHEMA = "hoodie.table.avro.schema" + + /** + * Helper method to set table Avro schema in hadoopConf. + * This allows callers to pass the schema through hadoopConf to avoid fetching from storage. + * + * @param hadoopConf The Hadoop configuration to set the schema in + * @param avroSchema The Avro schema to set + */ + def setTableAvroSchemaInConf(hadoopConf: Configuration, avroSchema: Schema): Unit = { + hadoopConf.set(HOODIE_TABLE_AVRO_SCHEMA, avroSchema.toString) + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala index 7cb4a3c542843..d58b977e5e79e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/DataSkippingUtils.scala @@ -38,11 +38,12 @@ object DataSkippingUtils extends Logging { * * @param dataTableFilterExpr source table's query's filter expression * @param indexSchema index table schema + * @param columnsToSkip optional set of column names to skip from filter translation (e.g., timestamp-millis columns) * @return filter for column-stats index's table */ - def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, indexSchema: StructType): Expression = 
{ + def translateIntoColumnStatsIndexFilterExpr(dataTableFilterExpr: Expression, indexSchema: StructType, columnsToSkip: Set[String] = Set.empty): Expression = { try { - createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, indexSchema) + createColumnStatsIndexFilterExprInternal(dataTableFilterExpr, indexSchema, columnsToSkip) } catch { case e: AnalysisException => logDebug(s"Failed to translated provided data table filter expr into column stats one ($dataTableFilterExpr)", e) @@ -50,10 +51,10 @@ object DataSkippingUtils extends Logging { } } - private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: Expression, indexSchema: StructType): Expression = { + private def createColumnStatsIndexFilterExprInternal(dataTableFilterExpr: Expression, indexSchema: StructType, columnsToSkip: Set[String] = Set.empty): Expression = { // Try to transform original Source Table's filter expression into // Column-Stats Index filter expression - tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema) match { + tryComposeIndexFilterExpr(dataTableFilterExpr, indexSchema, columnsToSkip) match { case Some(e) => e // NOTE: In case we can't transform source filter expression, we fallback // to {@code TrueLiteral}, to essentially avoid pruning any indexed files from scanning @@ -61,7 +62,7 @@ object DataSkippingUtils extends Logging { } } - private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, indexSchema: StructType): Option[Expression] = { + private def tryComposeIndexFilterExpr(sourceFilterExpr: Expression, indexSchema: StructType, columnsToSkip: Set[String] = Set.empty): Option[Expression] = { // // For translation of the Filter Expression for the Data Table into Filter Expression for Column Stats Index, we're // assuming that @@ -98,7 +99,7 @@ object DataSkippingUtils extends Logging { // Filter "expr(colA) = B" and "B = expr(colA)" // Translates to "(expr(colA_minValue) <= B) AND (B <= expr(colA_maxValue))" condition for index lookup case 
EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), valueExpr: Expression) if isValueExpression(valueExpr) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => // NOTE: Since we're supporting (almost) arbitrary expressions of the form `f(colA) = B`, we have to // appropriately translate such original expression targeted at Data Table, to corresponding @@ -110,7 +111,7 @@ object DataSkippingUtils extends Logging { } case EqualTo(valueExpr: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(valueExpr) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) genColumnValuesEqualToExpression(colName, valueExpr, targetExprBuilder) @@ -121,14 +122,14 @@ object DataSkippingUtils extends Logging { // NOTE: This is NOT an inversion of `colA = b`, instead this filter ONLY excludes files for which `colA = B` // holds true case Not(EqualTo(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression)) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder)) } case Not(EqualTo(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef))) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) Not(genColumnOnlyValuesEqualToExpression(colName, value, targetExprBuilder)) 
@@ -137,20 +138,20 @@ object DataSkippingUtils extends Logging { // Filter "colA = null" // Translates to "colA_nullCount = null" for index lookup case EqualNullSafe(attrRef: AttributeReference, litNull @ Literal(null, _)) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map(colName => EqualTo(genColNumNullsExpr(colName), litNull)) // Filter "expr(colA) < B" and "B > expr(colA)" // Translates to "expr(colA_minValue) < B" for index lookup case LessThan(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) LessThan(targetExprBuilder.apply(genColMinValueExpr(colName)), value) } case GreaterThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) LessThan(targetExprBuilder.apply(genColMinValueExpr(colName)), value) @@ -159,14 +160,14 @@ object DataSkippingUtils extends Logging { // Filter "B < expr(colA)" and "expr(colA) > B" // Translates to "B < colA_maxValue" for index lookup case LessThan(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) GreaterThan(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) } case GreaterThan(sourceExpr @ 
AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) GreaterThan(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) @@ -175,14 +176,14 @@ object DataSkippingUtils extends Logging { // Filter "expr(colA) <= B" and "B >= expr(colA)" // Translates to "colA_minValue <= B" for index lookup case LessThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) LessThanOrEqual(targetExprBuilder.apply(genColMinValueExpr(colName)), value) } case GreaterThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) LessThanOrEqual(targetExprBuilder.apply(genColMinValueExpr(colName)), value) @@ -191,14 +192,14 @@ object DataSkippingUtils extends Logging { // Filter "B <= expr(colA)" and "expr(colA) >= B" // Translates to "B <= colA_maxValue" for index lookup case LessThanOrEqual(value: Expression, sourceExpr @ AllowedTransformationExpression(attrRef)) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) 
GreaterThanOrEqual(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) } case GreaterThanOrEqual(sourceExpr @ AllowedTransformationExpression(attrRef), value: Expression) if isValueExpression(value) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) GreaterThanOrEqual(targetExprBuilder.apply(genColMaxValueExpr(colName)), value) @@ -207,13 +208,13 @@ object DataSkippingUtils extends Logging { // Filter "colA is null" // Translates to "colA_nullCount > 0" for index lookup case IsNull(attribute: AttributeReference) => - getTargetIndexedColumnName(attribute, indexSchema) + getTargetIndexedColumnName(attribute, indexSchema, columnsToSkip) .map(colName => GreaterThan(genColNumNullsExpr(colName), Literal(0))) // Filter "colA is not null" // Translates to "colA_nullCount < colA_valueCount" for index lookup case IsNotNull(attribute: AttributeReference) => - getTargetIndexedColumnName(attribute, indexSchema) + getTargetIndexedColumnName(attribute, indexSchema, columnsToSkip) .map(colName => LessThan(genColNumNullsExpr(colName), genColValueCountExpr)) // Filter "expr(colA) in (B1, B2, ...)" @@ -221,7 +222,7 @@ object DataSkippingUtils extends Logging { // for index lookup // NOTE: This is equivalent to "colA = B1 OR colA = B2 OR ..." 
case In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression]) if list.forall(isValueExpression) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) list.map(lit => genColumnValuesEqualToExpression(colName, lit, targetExprBuilder)).reduce(Or) @@ -231,7 +232,7 @@ object DataSkippingUtils extends Logging { // NOTE: [[InSet]] is an optimized version of the [[In]] expression, where every sub-expression w/in the // set is a static literal case InSet(sourceExpr @ AllowedTransformationExpression(attrRef), hset: Set[Any]) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) hset.map { value => @@ -249,7 +250,7 @@ object DataSkippingUtils extends Logging { // Translates to "NOT((colA_minValue = B1 AND colA_maxValue = B1) OR (colA_minValue = B2 AND colA_maxValue = B2))" for index lookup // NOTE: This is NOT an inversion of `in (B1, B2, ...)` expr, this is equivalent to "colA != B1 AND colA != B2 AND ..." 
case Not(In(sourceExpr @ AllowedTransformationExpression(attrRef), list: Seq[Expression])) if list.forall(_.foldable) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) Not(list.map(lit => genColumnOnlyValuesEqualToExpression(colName, lit, targetExprBuilder)).reduce(Or)) @@ -262,7 +263,7 @@ object DataSkippingUtils extends Logging { // lexicographically, we essentially need to check that provided literal falls w/in min/max bounds of the // given column case StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), v @ Literal(_: UTF8String, _)) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) genColumnValuesEqualToExpression(colName, v, targetExprBuilder) @@ -272,7 +273,7 @@ object DataSkippingUtils extends Logging { // Translates to "NOT(expr(colA_minValue) like 'xxx%' AND expr(colA_maxValue) like 'xxx%')" for index lookup // NOTE: This is NOT an inversion of "colA like xxx" case Not(StartsWith(sourceExpr @ AllowedTransformationExpression(attrRef), value @ Literal(_: UTF8String, _))) => - getTargetIndexedColumnName(attrRef, indexSchema) + getTargetIndexedColumnName(attrRef, indexSchema, columnsToSkip) .map { colName => val targetExprBuilder: Expression => Expression = swapAttributeRefInExpr(sourceExpr, attrRef, _) val minValueExpr = targetExprBuilder.apply(genColMinValueExpr(colName)) @@ -281,14 +282,14 @@ object DataSkippingUtils extends Logging { } case or: Or => - val resLeft = createColumnStatsIndexFilterExprInternal(or.left, indexSchema) - val resRight = createColumnStatsIndexFilterExprInternal(or.right, indexSchema) + val resLeft = createColumnStatsIndexFilterExprInternal(or.left, 
indexSchema, columnsToSkip) + val resRight = createColumnStatsIndexFilterExprInternal(or.right, indexSchema, columnsToSkip) Option(Or(resLeft, resRight)) case and: And => - val resLeft = createColumnStatsIndexFilterExprInternal(and.left, indexSchema) - val resRight = createColumnStatsIndexFilterExprInternal(and.right, indexSchema) + val resLeft = createColumnStatsIndexFilterExprInternal(and.left, indexSchema, columnsToSkip) + val resRight = createColumnStatsIndexFilterExprInternal(and.right, indexSchema, columnsToSkip) Option(And(resLeft, resRight)) @@ -299,10 +300,10 @@ object DataSkippingUtils extends Logging { // case Not(And(left: Expression, right: Expression)) => - Option(createColumnStatsIndexFilterExprInternal(Or(Not(left), Not(right)), indexSchema)) + Option(createColumnStatsIndexFilterExprInternal(Or(Not(left), Not(right)), indexSchema, columnsToSkip)) case Not(Or(left: Expression, right: Expression)) => - Option(createColumnStatsIndexFilterExprInternal(And(Not(left), Not(right)), indexSchema)) + Option(createColumnStatsIndexFilterExprInternal(And(Not(left), Not(right)), indexSchema, columnsToSkip)) case _: Expression => None } @@ -317,13 +318,22 @@ object DataSkippingUtils extends Logging { .forall(stat => indexSchema.exists(_.name == stat)) } - private def getTargetIndexedColumnName(resolvedExpr: AttributeReference, indexSchema: StructType): Option[String] = { + private def getTargetIndexedColumnName(resolvedExpr: AttributeReference, indexSchema: StructType, columnsToSkip: Set[String] = Set.empty): Option[String] = { val colName = UnresolvedAttribute(getTargetColNameParts(resolvedExpr)).name - // Verify that the column is indexed - if (checkColIsIndexed(colName, indexSchema)) { + // Skip columns that should not be used for filtering (e.g., timestamp-millis columns that were + // incorrectly indexed before the fix, or columns explicitly marked to skip) + if (columnsToSkip.contains(colName)) { + logDebug(s"Skipping filter translation for column '$colName' 
as it is in the skip list (likely timestamp-millis)") + None + } else if (checkColIsIndexed(colName, indexSchema)) { + // Verify that the column is indexed + // NOTE: This check ensures that columns not in the column stats index (e.g., timestamp-millis columns + // which are excluded due to schema mismatch) will not have their filters translated, preventing + // incorrect file filtering during data skipping. Option.apply(colName) } else { + logDebug(s"Skipping filter translation for column '$colName' as it is not indexed in column stats index") None } } diff --git a/hudi-spark-datasource/hudi-spark/pom.xml b/hudi-spark-datasource/hudi-spark/pom.xml index 9a2a9232619a6..f2bc4b816331c 100644 --- a/hudi-spark-datasource/hudi-spark/pom.xml +++ b/hudi-spark-datasource/hudi-spark/pom.xml @@ -335,6 +335,18 @@ org.pentaho * + + org.apache.parquet + * + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + @@ -350,6 +362,18 @@ javax.servlet.jsp * + + org.apache.parquet + * + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + @@ -365,6 +389,18 @@ javax.servlet.jsp * + + org.apache.parquet + * + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + @@ -376,6 +412,18 @@ org.eclipse.jetty.orbit javax.servlet + + org.apache.parquet + * + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + @@ -403,6 +451,16 @@ hudi-tests-common ${project.version} test + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + org.apache.hudi @@ -411,6 +469,16 @@ tests test-jar test + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + org.apache.hudi @@ -419,6 +487,16 @@ tests test-jar test + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + org.apache.hudi diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala index 6917a4360bf95..6d3378396b3b9 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/SparkHelpers.scala @@ -29,6 +29,7 @@ import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} import org.apache.hudi.common.util.BaseFileUtils import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig} import org.apache.parquet.avro.AvroSchemaConverter + import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.{DataFrame, SQLContext} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala index 24820c1c03204..6718c441f1d05 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala @@ -227,10 +227,14 @@ object HoodieAnalysis extends SparkAdapterSupport { } if (updatedTargetTable.isDefined || updatedQuery.isDefined) { - mit.asInstanceOf[MergeIntoTable].copy( + val mergeIntoTable = mit.asInstanceOf[MergeIntoTable] + // Use all parameters to avoid NoSuchMethodError when method signature changes between Spark versions + mergeIntoTable.copy( targetTable = updatedTargetTable.getOrElse(targetTable), - sourceTable = updatedQuery.getOrElse(query) - ) + sourceTable = updatedQuery.getOrElse(query), + mergeCondition = mergeIntoTable.mergeCondition, + matchedActions = mergeIntoTable.matchedActions, + notMatchedActions = mergeIntoTable.notMatchedActions) } else { mit } diff --git 
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala index 3f3d4e10ea9e4..d905939144f7d 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala @@ -95,7 +95,7 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig wi } val config = buildHoodieInsertConfig(catalogTable, sparkSession, isOverWritePartition, isOverWriteTable, partitionSpec, extraOptions, staticOverwritePartitionPathOpt) - val alignedQuery = alignQueryOutput(query, catalogTable, partitionSpec, sparkSession.sessionState.conf) + val alignedQuery = alignQueryOutput(query, catalogTable, partitionSpec, sparkSession.sessionState.conf, sparkSession) val (success, _, _, _, _, _) = HoodieSparkSqlWriter.write(sparkSession.sqlContext, mode, config, Dataset.ofRows(sparkSession, alignedQuery)) @@ -122,11 +122,13 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig wi * @param catalogTable catalog table * @param partitionsSpec partition spec specifying static/dynamic partition values * @param conf Spark's [[SQLConf]] + * @param sparkSession Spark session (required for Spark 3.5) */ private def alignQueryOutput(query: LogicalPlan, catalogTable: HoodieCatalogTable, partitionsSpec: Map[String, Option[String]], - conf: SQLConf): LogicalPlan = { + conf: SQLConf, + sparkSession: SparkSession): LogicalPlan = { val targetPartitionSchema = catalogTable.partitionSchema val staticPartitionValues = filterStaticPartitionValues(partitionsSpec) @@ -141,7 +143,7 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig wi // since such columns wouldn't be otherwise specified w/in 
the query itself and therefore couldn't be matched // positionally for example val expectedQueryColumns = catalogTable.tableSchemaWithoutMetaFields.filterNot(f => staticPartitionValues.contains(f.name)) - val coercedQueryOutput = coerceQueryOutputColumns(StructType(expectedQueryColumns), cleanedQuery, catalogTable, conf) + val coercedQueryOutput = coerceQueryOutputColumns(StructType(expectedQueryColumns), cleanedQuery, catalogTable, conf, sparkSession) // After potential reshaping validate that the output of the query conforms to the table's schema validate(removeMetaFields(coercedQueryOutput.schema), partitionsSpec, catalogTable) @@ -153,14 +155,28 @@ object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig wi private def coerceQueryOutputColumns(expectedSchema: StructType, query: LogicalPlan, catalogTable: HoodieCatalogTable, - conf: SQLConf): LogicalPlan = { - val planUtils = sparkAdapter.getCatalystPlanUtils + conf: SQLConf, + sparkSession: SparkSession): LogicalPlan = { try { - planUtils.resolveOutputColumns(catalogTable.catalogTableName, expectedSchema.toAttributes, query, byName = true, conf) + sparkAdapter.resolveOutputColumns( + sparkSession, + catalogTable.catalogTableName, + expectedSchema.toAttributes, + query, + byName = true, + conf) } catch { // NOTE: In case matching by name didn't match the query output, we will attempt positional matching - case ae: AnalysisException if ae.getMessage().startsWith("Cannot write incompatible data to table") => - planUtils.resolveOutputColumns(catalogTable.catalogTableName, expectedSchema.toAttributes, query, byName = false, conf) + // SPARK-42309 Error message changed in Spark 3.5.0 so we need to match two strings here + case ae: AnalysisException if (ae.getMessage().startsWith("[INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_FIND_DATA] Cannot write incompatible data for the table") + || ae.getMessage().startsWith("Cannot write incompatible data to table")) => + sparkAdapter.resolveOutputColumns( + 
sparkSession, + catalogTable.catalogTableName, + expectedSchema.toAttributes, + query, + byName = false, + conf) } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala index 0989b8b09aee4..954b731dd4c33 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala @@ -23,12 +23,12 @@ import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord} import org.apache.hudi.AvroConversionUtils.{convertAvroSchemaToStructType, convertStructTypeToAvroSchema} import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.SparkAdapterSupport.sparkAdapter -import org.apache.hudi.avro.AvroSchemaUtils.{isNullable, resolveNullableSchema} +import org.apache.hudi.avro.AvroSchemaUtils.{getNonNullTypeFromUnion, isNullable} import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodiePayloadProps, HoodieRecord} +import org.apache.hudi.common.util.{BinaryUtil, Option => HOption, ValidationUtils} import org.apache.hudi.common.util.ValidationUtils.checkState -import org.apache.hudi.common.util.{BinaryUtil, ValidationUtils, Option => HOption} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.exception.HoodieException import org.apache.spark.internal.Logging @@ -487,8 +487,8 @@ object ExpressionPayload { .zipWithIndex .foreach { case ((expectedField, targetField), idx) => - val expectedFieldSchema = resolveNullableSchema(expectedField.schema()) - val targetFieldSchema = resolveNullableSchema(targetField.schema()) + val expectedFieldSchema = 
getNonNullTypeFromUnion(expectedField.schema()) + val targetFieldSchema = getNonNullTypeFromUnion(targetField.schema()) val equal = Objects.equals(expectedFieldSchema, targetFieldSchema) ValidationUtils.checkState(equal, diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala index 4005ef97e4561..c5211d8852e11 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala @@ -36,7 +36,7 @@ import scala.collection.JavaConverters._ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface) extends HoodieSqlCommonBaseVisitor[AnyRef] with Logging with SparkAdapterSupport { - import ParserUtils._ + import ParserUtils.{checkDuplicateKeys, operationNotAllowed, string} /** * Override the default behavior for all visit methods. 
This will only return a non-null result @@ -51,25 +51,25 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface } } - override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) { + override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = ParserUtils.withOrigin(ctx) { ctx.statement().accept(this).asInstanceOf[LogicalPlan] } - override def visitCompactionOnTable(ctx: CompactionOnTableContext): LogicalPlan = withOrigin(ctx) { + override def visitCompactionOnTable(ctx: CompactionOnTableContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val table = ctx.tableIdentifier().accept(this).asInstanceOf[LogicalPlan] val operation = CompactionOperation.withName(ctx.operation.getText.toUpperCase) val timestamp = if (ctx.instantTimestamp != null) Some(ctx.instantTimestamp.getText.toLong) else None CompactionTable(table, operation, timestamp) } - override def visitCompactionOnPath(ctx: CompactionOnPathContext): LogicalPlan = withOrigin(ctx) { + override def visitCompactionOnPath(ctx: CompactionOnPathContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val path = string(ctx.path) val operation = CompactionOperation.withName(ctx.operation.getText.toUpperCase) val timestamp = if (ctx.instantTimestamp != null) Some(ctx.instantTimestamp.getText.toLong) else None CompactionPath(path, operation, timestamp) } - override def visitShowCompactionOnTable(ctx: ShowCompactionOnTableContext): LogicalPlan = withOrigin(ctx) { + override def visitShowCompactionOnTable(ctx: ShowCompactionOnTableContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val table = ctx.tableIdentifier().accept(this).asInstanceOf[LogicalPlan] if (ctx.limit != null) { CompactionShowOnTable(table, ctx.limit.getText.toInt) @@ -78,7 +78,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface } } - override def visitShowCompactionOnPath(ctx: ShowCompactionOnPathContext): LogicalPlan = withOrigin(ctx) { + 
override def visitShowCompactionOnPath(ctx: ShowCompactionOnPathContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val path = string(ctx.path) if (ctx.limit != null) { CompactionShowOnPath(path, ctx.limit.getText.toInt) @@ -87,11 +87,11 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface } } - override def visitTableIdentifier(ctx: TableIdentifierContext): LogicalPlan = withOrigin(ctx) { + override def visitTableIdentifier(ctx: TableIdentifierContext): LogicalPlan = ParserUtils.withOrigin(ctx) { UnresolvedRelation(TableIdentifier(ctx.table.getText, Option(ctx.db).map(_.getText))) } - override def visitCall(ctx: CallContext): LogicalPlan = withOrigin(ctx) { + override def visitCall(ctx: CallContext): LogicalPlan = ParserUtils.withOrigin(ctx) { if (ctx.callArgumentList() == null || ctx.callArgumentList().callArgument() == null || ctx.callArgumentList().callArgument().size() == 0) { val name: Seq[String] = ctx.multipartIdentifier().parts.asScala.map(_.getText) CallCommand(name, Seq()) @@ -105,14 +105,14 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface /** * Return a multi-part identifier as Seq[String]. */ - override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = withOrigin(ctx) { - ctx.parts.asScala.map(_.getText) + override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = ParserUtils.withOrigin(ctx) { + ctx.parts.asScala.map(_.getText).toSeq } /** * Create a positional argument in a stored procedure call. 
*/ - override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = withOrigin(ctx) { + override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = ParserUtils.withOrigin(ctx) { val expr = typedVisit[Expression](ctx.expression) PositionalArgument(expr) } @@ -120,7 +120,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface /** * Create a named argument in a stored procedure call. */ - override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = withOrigin(ctx) { + override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = ParserUtils.withOrigin(ctx) { val name = ctx.identifier.getText val expr = typedVisit[Expression](ctx.expression) NamedArgument(name, expr) @@ -160,7 +160,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface * indexPropertyList: index_property_name [= index_property_value] [ , . . . ] * }}} */ - override def visitCreateIndex(ctx: CreateIndexContext): LogicalPlan = withOrigin(ctx) { + override def visitCreateIndex(ctx: CreateIndexContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val (indexName, indexType) = if (ctx.identifier.size() == 1) { (ctx.identifier(0).getText, "") } else { @@ -189,7 +189,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface * DROP INDEX [IF EXISTS] index_name ON [TABLE] table_name * }}} */ - override def visitDropIndex(ctx: DropIndexContext): LogicalPlan = withOrigin(ctx) { + override def visitDropIndex(ctx: DropIndexContext): LogicalPlan = ParserUtils.withOrigin(ctx) { val indexName = ctx.identifier.getText DropIndex( visitTableIdentifier(ctx.tableIdentifier()), @@ -204,7 +204,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface * SHOW INDEXES (FROM | IN) [TABLE] table_name * }}} */ - override def visitShowIndexes(ctx: ShowIndexesContext): LogicalPlan = withOrigin(ctx) { + override def visitShowIndexes(ctx: 
ShowIndexesContext): LogicalPlan = ParserUtils.withOrigin(ctx) { ShowIndexes(visitTableIdentifier(ctx.tableIdentifier())) } @@ -215,7 +215,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface * REFRESH INDEX index_name ON [TABLE] table_name * }}} */ - override def visitRefreshIndex(ctx: RefreshIndexContext): LogicalPlan = withOrigin(ctx) { + override def visitRefreshIndex(ctx: RefreshIndexContext): LogicalPlan = ParserUtils.withOrigin(ctx) { RefreshIndex(visitTableIdentifier(ctx.tableIdentifier()), ctx.identifier.getText) } @@ -224,7 +224,7 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface * This should be called through [[visitPropertyKeyValues]] or [[visitPropertyKeys]]. */ override def visitPropertyList( - ctx: PropertyListContext): Map[String, String] = withOrigin(ctx) { + ctx: PropertyListContext): Map[String, String] = ParserUtils.withOrigin(ctx) { val properties = ctx.property.asScala.map { property => val key = visitPropertyKey(property.key) val value = visitPropertyValue(property.value) diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkSortAndSizeClustering.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkSortAndSizeClustering.java index 1898a276a9f6e..b26805a5cc4ec 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkSortAndSizeClustering.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestSparkSortAndSizeClustering.java @@ -18,10 +18,13 @@ package org.apache.hudi.functional; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.avro.model.HoodieClusteringGroup; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.HoodieStorageConfig; +import org.apache.hudi.common.model.HoodieAvroIndexedRecord; +import 
org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; @@ -33,6 +36,7 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieClusteringConfig; @@ -42,6 +46,10 @@ import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; import org.apache.hudi.testutils.HoodieSparkClientTestHarness; import org.apache.hudi.testutils.MetadataMergeWriteStatus; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -50,15 +58,20 @@ import org.junit.jupiter.api.Test; import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Random; import java.util.stream.Collectors; +import java.util.stream.IntStream; public class TestSparkSortAndSizeClustering extends HoodieSparkClientTestHarness { - private HoodieWriteConfig config; private HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0); @@ -165,4 +178,57 @@ public HoodieWriteConfig.Builder getConfigBuilder() { .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()); } + + private List generateInserts(String instant, long ts, int count) { + Schema 
schema = getSchema(); + Schema decimalSchema = schema.getField("decimal_field").schema(); + Schema nestedSchema = AvroSchemaUtils.getNonNullTypeFromUnion(schema.getField("nested_record").schema()); + Schema enumSchema = AvroSchemaUtils.getNonNullTypeFromUnion(schema.getField("enum_field").schema()); + Random random = new Random(0); + return IntStream.range(0, count) + .mapToObj(i -> { + GenericRecord record = new GenericData.Record(schema); + String key = "key_" + i; + String partition = "partition_" + (i % 3); + record.put("_row_key", key); + record.put("ts", ts); + record.put("partition_path", partition); + record.put("_hoodie_is_deleted", false); + record.put("double_field", random.nextDouble()); + record.put("float_field", random.nextFloat()); + record.put("int_field", random.nextInt()); + record.put("long_field", random.nextLong()); + record.put("string_field", instant); + record.put("bytes_field", ByteBuffer.wrap(instant.getBytes(StandardCharsets.UTF_8))); + GenericRecord nestedRecord = new GenericData.Record(nestedSchema); + nestedRecord.put("nested_int", random.nextInt()); + nestedRecord.put("nested_string", "nested_" + instant); + nestedRecord.put("nested_timestamp_millis_field", ts); + record.put("nested_record", nestedRecord); + record.put("array_field", Collections.singletonList(nestedRecord)); + record.put("nullable_map_field", Collections.singletonMap("key_" + instant, nestedRecord)); + // logical types + record.put("date_nullable_field", random.nextBoolean() ? null : LocalDate.now().minusDays(random.nextInt(3))); + record.put("timestamp_millis_field", ts); + record.put("timestamp_micros_nullable_field", random.nextBoolean() ? null : ts * 1000); + record.put("timestamp_local_millis_nullable_field", random.nextBoolean() ? 
null : ts); + record.put("timestamp_local_micros_field", ts * 1000); + record.put("enum_field", new GenericData.EnumSymbol( + enumSchema, + enumSchema + .getEnumSymbols() + .get(random.nextInt(enumSchema.getEnumSymbols().size())))); + return new HoodieAvroIndexedRecord(new HoodieKey(key, partition), record); + }) + .collect(Collectors.toList()); + } + + private Schema getSchema() { + try { + String schema = FileIOUtils.readAsUTFString(this.getClass().getClassLoader().getResourceAsStream("schema_with_logical_types.avsc")); + return new Schema.Parser().parse(schema); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java index ec93ea229accb..fa4482e537fbd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java @@ -153,7 +153,7 @@ public void testTimestampBasedKeyGenerator() throws IOException { // timezone is GMT+8:00, createTime is BigDecimal BigDecimal decimal = new BigDecimal("1578283932000.0001"); Conversions.DecimalConversion conversion = new Conversions.DecimalConversion(); - Schema resolvedNullableSchema = AvroSchemaUtils.resolveNullableSchema(schema.getField("createTimeDecimal").schema()); + Schema resolvedNullableSchema = AvroSchemaUtils.getNonNullTypeFromUnion(schema.getField("createTimeDecimal").schema()); GenericFixed avroDecimal = conversion.toFixed(decimal, resolvedNullableSchema, LogicalTypes.decimal(20, 4)); baseRecord.put("createTimeDecimal", avroDecimal); properties = getBaseKeyConfig("createTimeDecimal", "EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); @@ -209,6 +209,31 @@ public void testTimestampBasedKeyGenerator() throws 
IOException { assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(baseRow)); internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow); assertEquals(UTF8String.fromString("1970-01-01 12:00:00"), keyGen.getPartitionPath(internalRow, baseRow.schema())); + + // Timestamp field is in long type, with `EPOCHMICROSECONDS` timestamp type in the key generator + baseRecord.put("createTime", 1578283932123456L); + properties = getBaseKeyConfig("createTime", "EPOCHMICROSECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); + keyGen = new TimestampBasedKeyGenerator(properties); + HoodieKey key = keyGen.getKey(baseRecord); + assertEquals("2020-01-06 12", key.getPartitionPath()); + baseRow = genericRecordToRow(baseRecord); + assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); + internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow); + assertEquals(UTF8String.fromString("2020-01-06 12"), keyGen.getPartitionPath(internalRow, baseRow.schema())); + + // Timestamp field is in decimal type, with `EPOCHMICROSECONDS` timestamp type in the key generator + decimal = new BigDecimal("1578283932123456.0001"); + resolvedNullableSchema = AvroSchemaUtils.getNonNullTypeFromUnion( + schema.getField("createTimeDecimal").schema()); + avroDecimal = conversion.toFixed(decimal, resolvedNullableSchema, LogicalTypes.decimal(20, 4)); + baseRecord.put("createTimeDecimal", avroDecimal); + properties = getBaseKeyConfig( + "createTimeDecimal", "EPOCHMICROSECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); + keyGen = new TimestampBasedKeyGenerator(properties); + bigDecimalKey = keyGen.getKey(baseRecord); + assertEquals("2020-01-06 12", bigDecimalKey.getPartitionPath()); + baseRow = genericRecordToRow(baseRecord); + assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); } @Test diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v6.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v6.zip new 
file mode 100644 index 0000000000000..1de8eadd923e6 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v6.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v8.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v8.zip new file mode 100644 index 0000000000000..165dd4376cf29 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v8.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v9.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v9.zip new file mode 100644 index 0000000000000..c38421631cb0a Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_cow_read_v9.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6.zip new file mode 100644 index 0000000000000..e75f8c4909133 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6_parquet_log.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6_parquet_log.zip new file mode 100644 index 0000000000000..22c849cc5ed37 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v6_parquet_log.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8.zip new file mode 100644 index 
0000000000000..7d774b7349580 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8_parquet_log.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8_parquet_log.zip new file mode 100644 index 0000000000000..230d2b6945a09 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v8_parquet_log.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9.zip new file mode 100644 index 0000000000000..3a93b28687189 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9_parquet_log.zip b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9_parquet_log.zip new file mode 100644 index 0000000000000..1f5d2e8de56d4 Binary files /dev/null and b/hudi-spark-datasource/hudi-spark/src/test/resources/trips_logical_types_json_mor_read_v9_parquet_log.zip differ diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroSchemaResolutionSupport.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroSchemaResolutionSupport.scala index a8f7c3c10ee1f..d0cff9650eedb 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroSchemaResolutionSupport.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroSchemaResolutionSupport.scala @@ -23,9 +23,13 @@ import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.config.HoodieWriteConfig import 
org.apache.hudi.exception.SchemaCompatibilityException import org.apache.hudi.testutils.HoodieClientTestBase +import org.apache.spark.SparkException +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.junit.jupiter.api.{AfterEach, BeforeEach} +import org.junit.jupiter.api.Assertions.assertDoesNotThrow +import org.junit.jupiter.api.function.Executable import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{CsvSource, ValueSource} @@ -808,4 +812,87 @@ class TestAvroSchemaResolutionSupport extends HoodieClientTestBase with ScalaAss readDf.show(false) readDf.foreach(_ => {}) } + + @ParameterizedTest + @ValueSource(strings = Array("COPY_ON_WRITE", "MERGE_ON_READ")) + def testNestedTypeVectorizedReadWithTypeChange(tableType: String): Unit = { + // test to change the value type of a MAP in a column of ARRAY< MAP > type + val tempRecordPath = basePath + "/record_tbl/" + val arrayMapData = Seq( + Row(1, 100, List(Map("2022-12-01" -> 120), Map("2022-12-02" -> 130)), "aaa") + ) + val arrayMapSchema = new StructType() + .add("id", IntegerType) + .add("userid", IntegerType) + .add("salesMap", ArrayType( + new MapType(StringType, IntegerType, true))) + .add("name", StringType) + val df1 = spark.createDataFrame(spark.sparkContext.parallelize(arrayMapData), arrayMapSchema) + df1.printSchema() + df1.show(false) + + // recreate table + initialiseTable(df1, tempRecordPath, tableType.equals("COPY_ON_WRITE")) + + // read out the table, will not throw any exception + readTable(tempRecordPath) + + // change value type from integer to long + val newArrayMapData = Seq( + Row(2, 200, List(Map("2022-12-01" -> 220L), Map("2022-12-02" -> 230L)), "bbb") + ) + val newArrayMapSchema = new StructType() + .add("id", IntegerType) + .add("userid", IntegerType) + .add("salesMap", ArrayType( + new MapType(StringType, LongType, true))) + .add("name", 
StringType) + val df2 = spark.createDataFrame(spark.sparkContext.parallelize(newArrayMapData), newArrayMapSchema) + df2.printSchema() + df2.show(false) + // upsert + upsertData(df2, tempRecordPath, tableType.equals("COPY_ON_WRITE")) + + // after implicit type change, read the table with vectorized read enabled + if (HoodieSparkUtils.gteqSpark3_3) { + assertThrows(classOf[SparkException]) { + withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "true") { + readTable(tempRecordPath) + } + } + } else { + withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "true") { + readTable(tempRecordPath) + } + } + + withSQLConf("spark.sql.parquet.enableNestedColumnVectorizedReader" -> "false") { + readTable(tempRecordPath) + } + } + + + private def readTable(path: String): Unit = { + // read out the table + val readDf = spark.read.format("hudi").load(path) + readDf.printSchema() + readDf.show(false) + readDf.foreach(_ => {}) + } + + protected def withSQLConf[T](pairs: (String, String)*)(f: => T): T = { + val conf = spark.sessionState.conf + val currentValues = pairs.unzip._1.map { k => + if (conf.contains(k)) { + Some(conf.getConfString(k)) + } else None + } + pairs.foreach { case (k, v) => conf.setConfString(k, v) } + try f finally { + pairs.unzip._1.zip(currentValues).foreach { + case (key, Some(value)) => conf.setConfString(key, value) + case (key, None) => conf.unsetConf(key) + } + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala index 38221cc05c7ea..03673c7b47e20 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala @@ -17,8 +17,6 @@ package org.apache.hudi -import org.apache.avro.Schema -import org.apache.commons.io.FileUtils 
import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieSparkUtils.gteqSpark3_0 import org.apache.hudi.client.SparkRDDWriteClient @@ -33,6 +31,9 @@ import org.apache.hudi.functional.TestBootstrap import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.testutils.DataSourceTestUtils import org.apache.hudi.testutils.HoodieClientTestUtils.getSparkConfForTest + +import org.apache.avro.Schema +import org.apache.commons.io.FileUtils import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql._ diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala index 6a9efb3371d89..70623814a1fbd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/ColumnStatIndexTestBase.scala @@ -27,6 +27,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase import org.apache.hudi.testutils.HoodieSparkClientTestBase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} +import org.apache.hudi.AvroConversionUtils import org.apache.spark.sql._ import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.types._ @@ -55,6 +56,8 @@ class ColumnStatIndexTestBase extends HoodieSparkClientTestBase { .add("c7", BinaryType) .add("c8", ByteType) + val sourceTableAvroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(sourceTableSchema, "reocrd", "") + @BeforeEach override def setUp() { initPath() diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index f500ea83120dc..d16509a7eb526 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -19,6 +19,7 @@ package org.apache.hudi.functional import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkUtils, QuickstartUtils, ScalaAssertionSupport} import org.apache.hudi.DataSourceWriteOptions.{INLINE_CLUSTERING_ENABLE, KEYGENERATOR_CLASS_NAME} import org.apache.hudi.HoodieConversionUtils.toJavaOption import org.apache.hudi.QuickstartUtils.{convertToStringList, getQuickstartWriteConfigs} @@ -30,7 +31,7 @@ import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType import org.apache.hudi.common.model.{HoodieRecord, WriteOperationType} import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline, TimelineUtils} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import org.apache.hudi.common.testutils.HoodieTestDataGenerator +import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} import org.apache.hudi.common.testutils.RawTripTestPayload.{deleteRecordsToStrings, recordsToStrings} import org.apache.hudi.common.util import org.apache.hudi.config.HoodieWriteConfig @@ -46,6 +47,11 @@ import org.apache.hudi.metrics.{Metrics, MetricsReporterType} import org.apache.hudi.testutils.HoodieSparkClientTestBase import org.apache.hudi.util.JFunction import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, QuickstartUtils, ScalaAssertionSupport} +import org.apache.hudi.table.HoodieSparkTable +import 
org.apache.hudi.testutils.HoodieSparkClientTestBase +import org.apache.hudi.util.JFunction +import org.apache.hadoop.fs.FileSystem +import org.apache.spark.sql.functions.{col, concat, lit, udf, when} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.hudi.HoodieSparkSessionExtension @@ -57,7 +63,10 @@ import org.junit.jupiter.api.function.Executable import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{CsvSource, EnumSource, ValueSource} +import org.slf4j.LoggerFactory +import java.net.URI +import java.nio.file.Paths import java.sql.{Date, Timestamp} import java.util.concurrent.{CountDownLatch, TimeUnit} import java.util.function.Consumer @@ -1678,7 +1687,81 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup }) } - def getWriterReaderOpts(recordType: HoodieRecordType, + @ParameterizedTest + @CsvSource(Array("true, 6", "false, 6")) + def testLogicalTypesReadRepair(vectorizedReadEnabled: Boolean, tableVersion: Int): Unit = { + if (HoodieSparkUtils.gteqSpark3_4) { + // Note: for spark 3.4 we should fall back to nonvectorized reader + // if that is not happening then this test will fail + val prevValue = spark.conf.get("spark.sql.parquet.enableVectorizedReader") + val prevTimezone = spark.conf.get("spark.sql.session.timeZone") + val propertyValue: String = System.getProperty("spark.testing") + try { + if (HoodieSparkUtils.isSpark3_3) { + System.setProperty("spark.testing", "true") + } + spark.conf.set("spark.sql.parquet.enableVectorizedReader", vectorizedReadEnabled.toString) + spark.conf.set("spark.sql.session.timeZone", "UTC") + spark.conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true") + val tableName = "trips_logical_types_json_cow_read_v" + tableVersion + val dataPath = "file://" + basePath + "/" + tableName + val zipOutput = Paths.get(new URI(dataPath)) + 
HoodieTestUtils.extractZipToDirectory("/" + tableName + ".zip", zipOutput, getClass) + val tableBasePath = zipOutput.toString + + val df = spark.read.format("hudi").load(tableBasePath) + + val rows = df.collect() + assertEquals(20, rows.length) + for (row <- rows) { + val hash = row.get(6).asInstanceOf[String].hashCode() + if ((hash & 1) == 0) { + assertEquals("2020-01-01T00:00:00.001Z", row.get(14).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2020-06-01T12:00:00.000001Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2015-05-20T12:34:56.001", row.get(16).toString) + assertEquals("2017-07-07T07:07:07.000001", row.get(17).toString) + } else { + assertEquals("2019-12-31T23:59:59.999Z", row.get(14).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2020-06-01T11:59:59.999999Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2015-05-20T12:34:55.999", row.get(16).toString) + assertEquals("2017-07-07T07:07:06.999999", row.get(17).toString) + } + } + + assertEquals(10, df.filter("ts_millis > timestamp('2020-01-01 00:00:00Z')").count()) + assertEquals(10, df.filter("ts_millis < timestamp('2020-01-01 00:00:00Z')").count()) + assertEquals(0, df.filter("ts_millis > timestamp('2020-01-01 00:00:00.001Z')").count()) + assertEquals(0, df.filter("ts_millis < timestamp('2019-12-31 23:59:59.999Z')").count()) + + assertEquals(10, df.filter("ts_micros > timestamp('2020-06-01 12:00:00Z')").count()) + assertEquals(10, df.filter("ts_micros < timestamp('2020-06-01 12:00:00Z')").count()) + assertEquals(0, df.filter("ts_micros > timestamp('2020-06-01 12:00:00.000001Z')").count()) + assertEquals(0, df.filter("ts_micros < timestamp('2020-06-01 11:59:59.999999Z')").count()) + + assertEquals(10, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()) + assertEquals(10, df.filter("local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, 
df.filter("local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)").count()) + + assertEquals(10, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()) + assertEquals(10, df.filter("local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)").count()) + } finally { + spark.conf.set("spark.sql.parquet.enableVectorizedReader", prevValue) + spark.conf.set("spark.sql.session.timeZone", prevTimezone) + if (HoodieSparkUtils.isSpark3_3) { + if (propertyValue == null) { + System.clearProperty("spark.testing") + } else { + System.setProperty("spark.testing", propertyValue) + } + } + } + } + } + + def getWriterReaderOpts(recordType: HoodieRecordType = HoodieRecordType.AVRO, opt: Map[String, String] = commonOpts, enableFileIndex: Boolean = DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX.defaultValue()): (Map[String, String], Map[String, String]) = { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index ac83cf81918bb..7daa2dc69b9a6 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -20,6 +20,7 @@ package org.apache.hudi.functional import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.apache.hudi.{AvroConversionUtils, ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import 
org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties @@ -29,7 +30,6 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase -import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, Literal, Or} @@ -109,10 +109,11 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" ) ++ metadataOpts - val schema = StructType(StructField("c1", IntegerType, false) :: StructField("c2", StringType, true) :: Nil) + val structSchema = StructType(StructField("c1", IntegerType, false) :: StructField("c2", StringType, true) :: Nil) + val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(structSchema, "record", "") val inputDF = spark.createDataFrame( spark.sparkContext.parallelize(Seq(Row(1, "v1"), Row(2, "v2"), Row(3, null), Row(4, "v4"))), - schema) + structSchema) inputDF .sort("c1", "c2") @@ -129,7 +130,7 @@ class TestColumnStatsIndex extends ColumnStatIndexTestBase { .fromProperties(toProperties(metadataOpts)) .build() - val columnStatsIndex = new ColumnStatsIndexSupport(spark, schema, metadataConfig, metaClient) + val columnStatsIndex = new ColumnStatsIndexSupport(spark, structSchema, metadataConfig, metaClient) columnStatsIndex.loadTransposed(Seq("c2"), false) { transposedDF => val result = transposedDF.select("valueCount", "c2_nullCount") .collect().head diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala index 9c4099035b12d..82f0342f729ec 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndexWithSQL.scala @@ -19,6 +19,7 @@ package org.apache.hudi.functional import org.apache.hudi.DataSourceWriteOptions.{DELETE_OPERATION_OPT_VAL, PRECOMBINE_FIELD, RECORDKEY_FIELD} +import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieFileIndex} import org.apache.hudi.client.SparkRDDWriteClient import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.client.utils.MetadataConversionUtils @@ -32,9 +33,9 @@ import org.apache.hudi.functional.ColumnStatIndexTestBase.ColumnStatsTestCase import org.apache.hudi.index.HoodieIndex.IndexType.INMEMORY import org.apache.hudi.metadata.HoodieMetadataFileSystemView import org.apache.hudi.util.JavaConversions -import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieFileIndex} import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, Expression, GreaterThan, Literal} +import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, Expression, GreaterThan, Literal} +import org.apache.spark.sql.test.LastOptions.saveMode import org.apache.spark.sql.types.StringType import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue} import org.junit.jupiter.params.ParameterizedTest @@ -205,16 +206,24 @@ class TestColumnStatsIndexWithSQL extends ColumnStatIndexTestBase { verifyFileIndexAndSQLQueries(commonOpts) } - private def setupTable(testCase: ColumnStatsTestCase, metadataOpts: Map[String, String], commonOpts: Map[String, String], shouldValidate: Boolean): Unit = { + private def setupTable(testCase: ColumnStatsTestCase, 
metadataOpts: Map[String, String], commonOpts: Map[String, String], + shouldValidate: Boolean, useShortSchema: Boolean = false, + validationSortColumns : Seq[String] = Seq("c1_maxValue", "c1_minValue", "c2_maxValue", + "c2_minValue", "c3_maxValue", "c3_minValue", "c5_maxValue", "c5_minValue")): Unit = { + val filePostfix = if (useShortSchema) { + "-short-schema" + } else { + "" + } doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/input-table-json", - expectedColStatsSourcePath = "index/colstats/column-stats-index-table.json", + expectedColStatsSourcePath = s"index/colstats/column-stats-index-table${filePostfix}.json", operation = DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Overwrite) doWriteAndValidateColumnStats(testCase, metadataOpts, commonOpts, dataSourcePath = "index/colstats/another-input-table-json", - expectedColStatsSourcePath = "index/colstats/updated-column-stats-index-table.json", + expectedColStatsSourcePath = s"index/colstats/updated-column-stats-index-table${filePostfix}.json", operation = DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, saveMode = SaveMode.Append) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index b1d3a17004bb1..6751aaca33305 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ -18,6 +18,8 @@ package org.apache.hudi.functional import org.apache.hadoop.fs.Path + +import org.apache.hudi.{AvroConversionUtils, ColumnStatsIndexSupport, DataSourceReadOptions, DataSourceUtils, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkRecordMerger, HoodieSparkUtils, SparkDatasetMixin} import org.apache.hudi.DataSourceWriteOptions._ 
import org.apache.hudi.HoodieConversionUtils.toJavaOption import org.apache.hudi.client.SparkRDDWriteClient @@ -29,6 +31,11 @@ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.{recordToString, recordsToStrings} import org.apache.hudi.common.util +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord, HoodieRecordPayload, HoodieTableType, OverwriteWithLatestAvroPayload} +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} +import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.common.util.Option import org.apache.hudi.config.{HoodieCompactionConfig, HoodieIndexConfig, HoodieWriteConfig} import org.apache.hudi.functional.TestCOWDataSource.convertColumnsToNullable import org.apache.hudi.hadoop.config.HoodieRealtimeConfig @@ -37,6 +44,10 @@ import org.apache.hudi.table.action.compact.CompactionTriggerStrategy import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieSparkClientTestBase} import org.apache.hudi.util.JFunction import org.apache.hudi.{DataSourceReadOptions, DataSourceUtils, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkRecordMerger, SparkDatasetMixin} +import org.apache.hudi.table.action.compact.CompactionTriggerStrategy +import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieSparkClientTestBase} +import org.apache.hudi.util.{JavaConversions, JFunction} +import org.apache.hadoop.fs.Path import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.hudi.HoodieSparkSessionExtension @@ -47,8 +58,12 @@ import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{CsvSource, EnumSource} import org.slf4j.LoggerFactory +import java.net.URI +import java.nio.file.Paths 
+import java.sql.Timestamp import java.util.function.Consumer import scala.collection.JavaConversions.mapAsJavaMap +import java.util.stream.Collectors import scala.collection.JavaConverters._ /** @@ -880,7 +895,11 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin @ParameterizedTest @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) def testReadPathsForOnlyLogFiles(recordType: HoodieRecordType): Unit = { - val (writeOpts, readOpts) = getWriterReaderOpts(recordType) + var (writeOpts, readOpts) = getWriterReaderOpts(recordType) + + writeOpts += ( + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true", + HoodieMetadataConfig.ENABLE.key -> "true") initMetaClient(HoodieTableType.MERGE_ON_READ) val records1 = dataGen.generateInsertsContainsAllPartitions("000", 20) @@ -1079,7 +1098,7 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin @ParameterizedTest @EnumSource(value = classOf[HoodieRecordType], names = Array("AVRO", "SPARK")) - def testHoodieIsDeletedMOR(recordType: HoodieRecordType): Unit = { + def testHoodieIsDeletedMOR(recordType: HoodieRecordType): Unit = { val (writeOpts, readOpts) = getWriterReaderOpts(recordType) val numRecords = 100 @@ -1121,6 +1140,78 @@ class TestMORDataSource extends HoodieSparkClientTestBase with SparkDatasetMixin assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count()) } + @ParameterizedTest + @CsvSource(Array("avro, 6", "parquet, 6")) + def testLogicalTypesReadRepair(logBlockFormat: String, tableVersion: Int): Unit = { + if (HoodieSparkUtils.gteqSpark3_4) { + val logBlockString = if (logBlockFormat == "avro") { + "" + } else { + "_parquet_log" + } + val prevTimezone = spark.conf.get("spark.sql.session.timeZone") + val propertyValue: String = System.getProperty("spark.testing") + try { + if (HoodieSparkUtils.isSpark3_3) { + System.setProperty("spark.testing", "true") + } + spark.conf.set("spark.sql.session.timeZone", 
"UTC") + val tableName = "trips_logical_types_json_mor_read_v" + tableVersion + logBlockString + val dataPath = "file://" + basePath + "/" + tableName + val zipOutput = Paths.get(new URI(dataPath)) + HoodieTestUtils.extractZipToDirectory("/" + tableName + ".zip", zipOutput, getClass) + val tableBasePath = zipOutput.toString + + val df = spark.read.format("hudi").load(tableBasePath) + + val rows = df.collect() + assertEquals(20, rows.length) + for (row <- rows) { + val hash = row.get(6).asInstanceOf[String].hashCode() + if ((hash & 1) == 0) { + assertEquals("2020-01-01T00:00:00.001Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2020-06-01T12:00:00.000001Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2015-05-20T12:34:56.001", row.get(17).toString) + assertEquals("2017-07-07T07:07:07.000001", row.get(18).toString) + } else { + assertEquals("2019-12-31T23:59:59.999Z", row.get(15).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2020-06-01T11:59:59.999999Z", row.get(16).asInstanceOf[Timestamp].toInstant.toString) + assertEquals("2015-05-20T12:34:55.999", row.get(17).toString) + assertEquals("2017-07-07T07:07:06.999999", row.get(18).toString) + } + } + assertEquals(10, df.filter("ts_millis > timestamp('2020-01-01 00:00:00Z')").count()) + assertEquals(10, df.filter("ts_millis < timestamp('2020-01-01 00:00:00Z')").count()) + assertEquals(0, df.filter("ts_millis > timestamp('2020-01-01 00:00:00.001Z')").count()) + assertEquals(0, df.filter("ts_millis < timestamp('2019-12-31 23:59:59.999Z')").count()) + + assertEquals(10, df.filter("ts_micros > timestamp('2020-06-01 12:00:00Z')").count()) + assertEquals(10, df.filter("ts_micros < timestamp('2020-06-01 12:00:00Z')").count()) + assertEquals(0, df.filter("ts_micros > timestamp('2020-06-01 12:00:00.000001Z')").count()) + assertEquals(0, df.filter("ts_micros < timestamp('2020-06-01 11:59:59.999999Z')").count()) + + assertEquals(10, df.filter("local_ts_millis > 
CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()) + assertEquals(10, df.filter("local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)").count()) + + assertEquals(10, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()) + assertEquals(10, df.filter("local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)").count()) + assertEquals(0, df.filter("local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)").count()) + } finally { + spark.conf.set("spark.sql.session.timeZone", prevTimezone) + if (HoodieSparkUtils.isSpark3_3) { + if (propertyValue == null) { + System.clearProperty("spark.testing") + } else { + System.setProperty("spark.testing", propertyValue) + } + } + } + } + } + /** * This tests the case that query by with a specified partition condition on hudi table which is * different between the value of the partition field and the actual partition path, diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala index ee1edbcccb296..5eb75476cab2e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala @@ -17,8 +17,6 @@ package org.apache.hudi.functional -import org.apache.avro.Schema -import org.apache.calcite.runtime.SqlFunctions.abs import org.apache.hudi.HoodieBaseRelation.projectSchema import 
org.apache.hudi.common.config.{HoodieMetadataConfig, HoodieStorageConfig} import org.apache.hudi.common.model.{HoodieRecord, OverwriteNonDefaultsWithLatestAvroPayload} @@ -28,11 +26,14 @@ import org.apache.hudi.config.{HoodieCompactionConfig, HoodieWriteConfig} import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.hudi.testutils.SparkClientFunctionalTestHarness.getSparkSqlConf import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, DefaultSource, HoodieBaseRelation, HoodieSparkUtils, HoodieUnsafeRDD} + +import org.apache.avro.Schema import org.apache.parquet.hadoop.util.counters.BenchmarkCounter import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.{Dataset, HoodieUnsafeUtils, Row, SaveMode} + import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue, fail} import org.junit.jupiter.api.{Disabled, Tag, Test} @@ -398,7 +399,7 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with assertEquals(expectedRecordCount, rows.length) // verify within 10% of margin. 
- assertTrue((abs(expectedBytesRead - bytesRead) / expectedBytesRead) < 0.1) + assertTrue((Math.abs(expectedBytesRead - bytesRead) / expectedBytesRead) < 0.1) val readColumns = targetColumns ++ relation.mandatoryFields val (_, projectedStructType, _) = projectSchema(Left(tableState.schema), readColumns) diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala index ec275a1d3fdc2..47f2ce46c7bf9 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala @@ -23,6 +23,15 @@ import org.apache.hadoop.fs.Path import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.{AvroConversionUtils, DefaultSource, Spark2HoodieFileScanRDD, Spark2RowSerDe} +import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.{AvroConversionUtils, DefaultSource, Spark2HoodieFileScanRDD, Spark2RowSerDe} + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.FileStatus +import org.apache.hadoop.fs.Path +import org.apache.parquet.schema.MessageType import org.apache.spark.sql._ import org.apache.spark.sql.avro._ import org.apache.spark.sql.catalyst.InternalRow @@ -74,6 +83,23 @@ class Spark2Adapter extends SparkAdapter { throw new UnsupportedOperationException("Catalog utilities are not supported in Spark 2.x"); } + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType.getClass.getSimpleName.startsWith("TimestampNTZType") + } + + override def getParquetReadSupport(messageScheme: org.apache.hudi.common.util.Option[MessageType]): + org.apache.parquet.hadoop.api.ReadSupport[_] = { 
+ // ParquetReadSupport is package-private in Spark 2.4, so we use reflection to instantiate it + val clazz = Class.forName("org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport") + clazz.getDeclaredConstructor().newInstance().asInstanceOf[org.apache.parquet.hadoop.api.ReadSupport[_]] + } + + override def repairSchemaIfSpecified(shouldRepair: Boolean, + fileSchema: MessageType, + tableSchemaOpt: org.apache.hudi.common.util.Option[MessageType]): MessageType = { + fileSchema + } + override def getCatalystPlanUtils: HoodieCatalystPlansUtils = HoodieSpark2CatalystPlanUtils override def getCatalystExpressionUtils: HoodieCatalystExpressionUtils = HoodieSpark2CatalystExpressionUtils @@ -144,7 +170,7 @@ class Spark2Adapter extends SparkAdapter { partitions.toSeq } - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { Some(new Spark24LegacyHoodieParquetFileFormat(appendPartitionValues)) } @@ -207,4 +233,12 @@ class Spark2Adapter extends SparkAdapter { batch.setNumRows(numRows) batch } + + override def getReaderSchemas(conf: Configuration, readerSchema: Schema, requestedSchema: Schema, fileSchema: MessageType): + org.apache.hudi.common.util.collection.Pair[StructType, StructType] = { + org.apache.hudi.common.util.collection.Pair.of( + HoodieInternalRowUtils.getCachedSchema(readerSchema), + HoodieInternalRowUtils.getCachedSchema(requestedSchema) + ) + } } diff --git a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/BaseSpark3Adapter.scala b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/BaseSpark3Adapter.scala index b2a9a529511ec..17de9e00fea45 100644 --- a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/BaseSpark3Adapter.scala +++ 
b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/BaseSpark3Adapter.scala @@ -17,24 +17,29 @@ package org.apache.spark.sql.adapter -import org.apache.avro.Schema -import org.apache.hadoop.fs.Path import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.util.JsonUtils import org.apache.hudi.spark3.internal.ReflectUtil import org.apache.hudi.{AvroConversionUtils, DefaultSource, HoodieSparkUtils, Spark3RowSerDe} + +import org.apache.avro.Schema +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.schema.MessageType import org.apache.spark.internal.Logging import org.apache.spark.sql.avro.{HoodieAvroSchemaConverters, HoodieSparkAvroSchemaConverters} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedPredicate, Predicate} import org.apache.spark.sql.catalyst.util.DateFormatter import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport import org.apache.spark.sql.hudi.SparkAdapter import org.apache.spark.sql.sources.{BaseRelation, Filter} import org.apache.spark.sql.{HoodieSpark3CatalogUtils, SQLContext, SparkSession} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.sql.HoodieInternalRowUtils import org.apache.spark.storage.StorageLevel import java.time.ZoneId @@ -102,4 +107,22 @@ abstract class BaseSpark3Adapter extends SparkAdapter with Logging { override def makeColumnarBatch(vectors: Array[ColumnVector], numRows: Int): ColumnarBatch = { new ColumnarBatch(vectors, numRows) } + + override def repairSchemaIfSpecified(shouldRepair: Boolean, + fileSchema: MessageType, + tableSchemaOpt: org.apache.hudi.common.util.Option[MessageType]): MessageType = { + fileSchema + } + 
+ override def getParquetReadSupport(messageSchema: org.apache.hudi.common.util.Option[MessageType]): ParquetReadSupport = { + new ParquetReadSupport() + } + + override def getReaderSchemas(conf: Configuration, readerSchema: Schema, requestedSchema: Schema, fileSchema: MessageType): + org.apache.hudi.common.util.collection.Pair[StructType, StructType] = { + org.apache.hudi.common.util.collection.Pair.of( + HoodieInternalRowUtils.getCachedSchema(readerSchema), + HoodieInternalRowUtils.getCachedSchema(requestedSchema) + ) + } } diff --git a/hudi-spark-datasource/hudi-spark3.0.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_0Adapter.scala b/hudi-spark-datasource/hudi-spark3.0.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_0Adapter.scala index 22a9f090fb33e..e593cc5e78681 100644 --- a/hudi-spark-datasource/hudi-spark3.0.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_0Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3.0.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_0Adapter.scala @@ -60,6 +60,10 @@ class Spark3_0Adapter extends BaseSpark3Adapter { override def isColumnarBatchRow(r: InternalRow): Boolean = ColumnarUtils.isColumnarBatchRow(r) + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType.getClass.getSimpleName.startsWith("TimestampNTZType") + } + def createCatalystMetadataForMetaField: Metadata = // NOTE: Since [[METADATA_COL_ATTR_KEY]] flag is not available in Spark 2.x, // we simply produce an empty [[Metadata]] instance @@ -84,7 +88,7 @@ class Spark3_0Adapter extends BaseSpark3Adapter { override def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface = new HoodieSpark3_0ExtendedSqlParser(spark, delegate) - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { 
Some(new Spark30LegacyHoodieParquetFileFormat(appendPartitionValues)) } diff --git a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala index 8ca072333d0e3..bf5e9d219abed 100644 --- a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala @@ -61,6 +61,10 @@ class Spark3_1Adapter extends BaseSpark3Adapter { override def isColumnarBatchRow(r: InternalRow): Boolean = ColumnarUtils.isColumnarBatchRow(r) + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType.getClass.getSimpleName.startsWith("TimestampNTZType") + } + def createCatalystMetadataForMetaField: Metadata = // NOTE: Since [[METADATA_COL_ATTR_KEY]] flag is not available in Spark 2.x, // we simply produce an empty [[Metadata]] instance @@ -85,7 +89,7 @@ class Spark3_1Adapter extends BaseSpark3Adapter { override def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface = new HoodieSpark3_1ExtendedSqlParser(spark, delegate) - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { Some(new Spark31LegacyHoodieParquetFileFormat(appendPartitionValues)) } diff --git a/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala b/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala index 3a5812a5faa40..02237949a4866 100644 --- a/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala +++ 
b/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala @@ -60,6 +60,10 @@ class Spark3_2Adapter extends BaseSpark3Adapter { override def isColumnarBatchRow(r: InternalRow): Boolean = ColumnarUtils.isColumnarBatchRow(r) + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType.getClass.getSimpleName.startsWith("TimestampNTZType") + } + def createCatalystMetadataForMetaField: Metadata = new MetadataBuilder() .putBoolean(METADATA_COL_ATTR_KEY, value = true) @@ -84,7 +88,7 @@ class Spark3_2Adapter extends BaseSpark3Adapter { override def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface = new HoodieSpark3_2ExtendedSqlParser(spark, delegate) - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { Some(new Spark32LegacyHoodieParquetFileFormat(appendPartitionValues)) } diff --git a/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32LegacyHoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32LegacyHoodieParquetFileFormat.scala index c88c35b5eeb4e..dcaab6734e292 100644 --- a/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32LegacyHoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark3.2.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32LegacyHoodieParquetFileFormat.scala @@ -340,9 +340,10 @@ class Spark32LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) val datetimeRebaseSpec = 
DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - new ParquetReadSupport( + new HoodieParquetReadSupport( convertTz, enableVectorizedReader = false, + enableTimestampFieldRepair = false, datetimeRebaseSpec, int96RebaseSpec) } else { diff --git a/hudi-spark-datasource/hudi-spark3.2plus-common/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark32PlusAnalysis.scala b/hudi-spark-datasource/hudi-spark3.2plus-common/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark32PlusAnalysis.scala index d64bc94301a12..181a9faf55102 100644 --- a/hudi-spark-datasource/hudi-spark3.2plus-common/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark32PlusAnalysis.scala +++ b/hudi-spark-datasource/hudi-spark3.2plus-common/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark32PlusAnalysis.scala @@ -138,7 +138,14 @@ case class HoodieSpark32PlusResolveReferences(spark: SparkSession) extends Rule[ lazy val analyzer = spark.sessionState.analyzer val targetTable = if (targetTableO.resolved) targetTableO else analyzer.execute(targetTableO) val sourceTable = if (sourceTableO.resolved) sourceTableO else analyzer.execute(sourceTableO) - val m = mO.asInstanceOf[MergeIntoTable].copy(targetTable = targetTable, sourceTable = sourceTable) + val mergeIntoTable = mO.asInstanceOf[MergeIntoTable] + // Use positional parameters to avoid NoSuchMethodError when method signature changes between Spark versions + val m = mergeIntoTable.copy( + targetTable = targetTable, + sourceTable = sourceTable, + mergeCondition = mergeIntoTable.mergeCondition, + matchedActions = mergeIntoTable.matchedActions, + notMatchedActions = mergeIntoTable.notMatchedActions) // END: custom Hudi change EliminateSubqueryAliases(targetTable) match { case r: NamedRelation if r.skipSchemaResolution => diff --git a/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_3Adapter.scala 
b/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_3Adapter.scala index e3d2cc9cd185e..11cc753d5e401 100644 --- a/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_3Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_3Adapter.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hudi.analysis.TableValuedFunctions import org.apache.spark.sql.parser.{HoodieExtendedParserInterface, HoodieSpark3_3ExtendedSqlParser} -import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder, StructType} +import org.apache.spark.sql.types.{DataType, DataTypes, Metadata, MetadataBuilder, StructType} import org.apache.spark.sql.vectorized.ColumnarBatchRow import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel._ @@ -85,7 +85,7 @@ class Spark3_3Adapter extends BaseSpark3Adapter { override def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface = new HoodieSpark3_3ExtendedSqlParser(spark, delegate) - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { Some(new Spark33LegacyHoodieParquetFileFormat(appendPartitionValues)) } @@ -124,4 +124,8 @@ class Spark3_3Adapter extends BaseSpark3Adapter { case OFF_HEAP => "OFF_HEAP" case _ => throw new IllegalArgumentException(s"Invalid StorageLevel: $level") } + + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType.getClass.getSimpleName.startsWith("TimestampNTZType") + } } diff --git 
a/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark33LegacyHoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark33LegacyHoodieParquetFileFormat.scala index de6cbff90ca54..add53e12e2493 100644 --- a/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark33LegacyHoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark33LegacyHoodieParquetFileFormat.scala @@ -342,9 +342,10 @@ class Spark33LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) val datetimeRebaseSpec = DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - new ParquetReadSupport( + new HoodieParquetReadSupport( convertTz, enableVectorizedReader = false, + enableTimestampFieldRepair = true, datetimeRebaseSpec, int96RebaseSpec) } else { diff --git a/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_4Adapter.scala b/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_4Adapter.scala index 0ae5ef3dbf34a..b7f1e69d60888 100644 --- a/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_4Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_4Adapter.scala @@ -17,8 +17,14 @@ package org.apache.spark.sql.adapter +import org.apache.hudi.avro.AvroSchemaUtils +import org.apache.hudi.{Spark34HoodieFileScanRDD, SparkAdapterSupport$} +import org.apache.hudi.io.storage.HoodieSparkParquetReader + import org.apache.avro.Schema -import org.apache.hudi.Spark34HoodieFileScanRDD +import 
org.apache.hadoop.conf.Configuration +import org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter +import org.apache.parquet.schema.{MessageType, SchemaRepair} import org.apache.spark.sql.avro._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases @@ -27,21 +33,26 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.METADATA_COL_ATTR_KEY +import org.apache.spark.sql.catalyst.util.{METADATA_COL_ATTR_KEY, RebaseDateTime} import org.apache.spark.sql.connector.catalog.V2TableWithV1Fallback -import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark34LegacyHoodieParquetFileFormat} +import org.apache.spark.sql.execution.datasources.parquet.{HoodieParquetReadSupport, ParquetFileFormat, ParquetReadSupport, ParquetToSparkSchemaConverter, Spark34LegacyHoodieParquetFileFormat, SparkBasicSchemaEvolution} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hudi.analysis.TableValuedFunctions import org.apache.spark.sql.parser.{HoodieExtendedParserInterface, HoodieSpark3_4ExtendedSqlParser} -import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder, StructType} +import org.apache.spark.sql.types.{DataType, DataTypes, Metadata, MetadataBuilder, StructType} import org.apache.spark.sql.vectorized.ColumnarBatchRow import org.apache.spark.sql._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel._ +import java.time.ZoneId +import scala.Option + /** - * 
Implementation of [[SparkAdapter]] for Spark 3.4.x branch + * Implementation of [[org.apache.spark.sql.hudi.SparkAdapter]] for Spark 3.4.x branch */ class Spark3_4Adapter extends BaseSpark3Adapter { @@ -66,6 +77,31 @@ class Spark3_4Adapter extends BaseSpark3Adapter { .putBoolean(METADATA_COL_ATTR_KEY, value = true) .build() + override def isTimestampNTZType(dataType: DataType): Boolean = { + dataType == DataTypes.TimestampNTZType + } + + override def getParquetReadSupport(messageSchema: org.apache.hudi.common.util.Option[MessageType]): ParquetReadSupport = { + new HoodieParquetReadSupport( + Option.empty[ZoneId], + enableVectorizedReader = true, + enableTimestampFieldRepair = true, + RebaseDateTime.RebaseSpec(LegacyBehaviorPolicy.withName("CORRECTED")), + RebaseDateTime.RebaseSpec(LegacyBehaviorPolicy.withName("LEGACY")), + messageSchema + ) + } + + override def repairSchemaIfSpecified(shouldRepair: Boolean, + fileSchema: MessageType, + tableSchemaOpt: org.apache.hudi.common.util.Option[MessageType]): MessageType = { + if (shouldRepair) { + SchemaRepair.repairLogicalTypes(fileSchema, tableSchemaOpt) + } else { + fileSchema + } + } + override def getCatalogUtils: HoodieSpark3CatalogUtils = HoodieSpark34CatalogUtils override def getCatalystExpressionUtils: HoodieCatalystExpressionUtils = HoodieSpark34CatalystExpressionUtils @@ -85,8 +121,8 @@ class Spark3_4Adapter extends BaseSpark3Adapter { override def createExtendedSparkParser(spark: SparkSession, delegate: ParserInterface): HoodieExtendedParserInterface = new HoodieSpark3_4ExtendedSqlParser(spark, delegate) - override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { - Some(new Spark34LegacyHoodieParquetFileFormat(appendPartitionValues)) + override def createLegacyHoodieParquetFileFormat(appendPartitionValues: Boolean, tableAvroSchema: Schema): Option[ParquetFileFormat] = { + Some(new Spark34LegacyHoodieParquetFileFormat(appendPartitionValues, tableAvroSchema)) 
} override def createHoodieFileScanRDD(sparkSession: SparkSession, @@ -124,4 +160,19 @@ class Spark3_4Adapter extends BaseSpark3Adapter { case OFF_HEAP => "OFF_HEAP" case _ => throw new IllegalArgumentException(s"Invalid StorageLevel: $level") } + + override def getReaderSchemas(conf: Configuration, readerSchema: Schema, requestedSchema: Schema, fileSchema: MessageType): + org.apache.hudi.common.util.collection.Pair[StructType, StructType] = { + val nonNullRequestedSchema = AvroSchemaUtils.getNonNullTypeFromUnion(requestedSchema) + val cachedRequestedSchema = HoodieInternalRowUtils.getCachedSchema(nonNullRequestedSchema) + val requestedSchemaInMessageType = org.apache.hudi.common.util.Option.of(getAvroSchemaConverter(conf).convert(nonNullRequestedSchema)) + val enableTimestampFieldRepair = conf.getBoolean(HoodieSparkParquetReader.ENABLE_LOGICAL_TIMESTAMP_REPAIR, true) + val repairedRequestedSchema = repairSchemaIfSpecified(enableTimestampFieldRepair, fileSchema, requestedSchemaInMessageType) + val repairedRequestedStructType = new ParquetToSparkSchemaConverter(conf).convert(repairedRequestedSchema) + val evolution = new SparkBasicSchemaEvolution(repairedRequestedStructType, cachedRequestedSchema, SQLConf.get.sessionLocalTimeZone) + val readerStructType = evolution.getRequestSchema + org.apache.hudi.common.util.collection.Pair.of( + readerStructType, readerStructType + ) + } } diff --git a/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark34LegacyHoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark34LegacyHoodieParquetFileFormat.scala index 6de8ded06ec00..fd8e52ea1e92c 100644 --- a/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark34LegacyHoodieParquetFileFormat.scala +++ 
b/hudi-spark-datasource/hudi-spark3.4.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark34LegacyHoodieParquetFileFormat.scala @@ -21,7 +21,8 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} -import org.apache.hudi.HoodieSparkUtils +import org.apache.avro.Schema +import org.apache.hudi.common.util.{Option => HOption} import org.apache.hudi.client.utils.SparkInternalSchemaConverter import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.util.InternalSchemaCache @@ -30,23 +31,29 @@ import org.apache.hudi.common.util.collection.Pair import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.action.InternalSchemaMerger import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} +import org.apache.hudi.io.storage.HoodieSparkParquetReader.ENABLE_LOGICAL_TIMESTAMP_REPAIR +import org.apache.hudi.SparkAdapterSupport.sparkAdapter +import org.apache.hudi.common.table.ParquetTableSchemaResolver import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.filter2.predicate.FilterApi import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS +import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata} import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetRecordReader} +import org.apache.parquet.schema.{AvroSchemaRepair, MessageType, SchemaRepair} import org.apache.spark.TaskContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow} +import org.apache.spark.sql.catalyst.expressions.JoinedRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import 
org.apache.spark.sql.execution.WholeStageCodegenExec -import org.apache.spark.sql.execution.datasources.parquet.Spark34LegacyHoodieParquetFileFormat._ +import org.apache.spark.sql.execution.datasources.parquet.Spark34LegacyHoodieParquetFileFormat.{pruneInternalSchema, rebuildFilterFromParquet, repairFooterSchema} import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType} +import org.apache.spark.sql.types.{AtomicType, DataType, StructType} import org.apache.spark.util.SerializableConfiguration + /** * This class is an extension of [[ParquetFileFormat]] overriding Spark-specific behavior * that's not possible to customize in any other way @@ -57,11 +64,30 @@ import org.apache.spark.util.SerializableConfiguration *

  • Schema on-read
  • * */ -class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { +class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean, + avroTableSchema: Schema) extends ParquetFileFormat { + private lazy val tableSchemaAsMessageType: HOption[MessageType] = { + if (avroTableSchema == null) { + HOption.empty() + } else { + HOption.ofNullable( + ParquetTableSchemaResolver.convertAvroSchemaToParquet(avroTableSchema, new Configuration()) + ) + } + } + private lazy val hasTimestampMillisFieldInTableSchema = if (avroTableSchema == null) { + true + } else { + AvroSchemaRepair.hasTimestampMillisField(avroTableSchema) + } + private lazy val supportBatchWithTableSchema = !hasTimestampMillisFieldInTableSchema override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { val conf = sparkSession.sessionState.conf - conf.parquetVectorizedReaderEnabled && schema.forall(_.dataType.isInstanceOf[AtomicType]) + conf.parquetVectorizedReaderEnabled && + schema.forall(_.dataType.isInstanceOf[AtomicType]) && + ParquetUtils.isBatchReadSupportedForSchema(conf, schema) && + supportBatchWithTableSchema } def supportsColumnar(sparkSession: SparkSession, schema: StructType): Boolean = { @@ -113,8 +139,10 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key, SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.defaultValueString).toBoolean ) - hadoopConf.setBoolean(SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key, sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled) - hadoopConf.setBoolean(SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key, sparkSession.sessionState.conf.legacyParquetNanosAsLong) + hadoopConf.setBoolean(SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key, + sparkSession.sessionState.conf.parquetInferTimestampNTZEnabled) + hadoopConf.setBoolean(SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key, + 
sparkSession.sessionState.conf.legacyParquetNanosAsLong) val internalSchemaStr = hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) // For Spark DataSource v1, there's no Physical Plan projection/schema pruning w/in Spark itself, // therefore it's safe to do schema projection here @@ -123,6 +151,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu pruneInternalSchema(internalSchemaStr, requiredSchema) hadoopConf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, prunedInternalSchemaStr) } + hadoopConf.set(ENABLE_LOGICAL_TIMESTAMP_REPAIR, hasTimestampMillisFieldInTableSchema.toString) val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) @@ -133,9 +162,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled - val enableVectorizedReader: Boolean = - sqlConf.parquetVectorizedReaderEnabled && - resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) + val enableVectorizedReader: Boolean = supportsColumnar(sparkSession, resultSchema) val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion val capacity = sqlConf.parquetVectorizedReaderBatchSize @@ -155,22 +182,18 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu val returningBatch = sparkSession.sessionState.conf.parquetVectorizedReaderEnabled && supportsColumnar(sparkSession, resultSchema).toString.equals("true") - (file: PartitionedFile) => { assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size) val filePath = file.filePath.toPath val split = new FileSplit(filePath, file.start, file.length, Array.empty[String]) - val sharedConf = 
broadcastedHadoopConf.value.value // Fetch internal schema val internalSchemaStr = sharedConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) // Internal schema has to be pruned at this point val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) - var shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent - val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH) val fileSchema = if (shouldUseInternalSchema) { val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong; @@ -180,12 +203,18 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu null } - lazy val footerFileMetaData = - ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData + val originalFooter = ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS) + val fileFooter = if (hasTimestampMillisFieldInTableSchema) { + repairFooterSchema(originalFooter, tableSchemaAsMessageType); + } else { + originalFooter + } + lazy val footerFileMetaData: FileMetaData = fileFooter.getFileMetaData + // Try to push down filters when filter push-down is enabled. 
val pushed = if (enableParquetFilterPushDown) { val parquetSchema = footerFileMetaData.getSchema - val parquetFilters = if (HoodieSparkUtils.gteqSpark3_2_1) { + val parquetFilters = { // NOTE: Below code could only be compiled against >= Spark 3.2.1, // and unfortunately won't compile against Spark 3.2.0 // However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1 @@ -200,19 +229,6 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu pushDownInFilterThreshold, isCaseSensitive, datetimeRebaseSpec) - } else { - // Spark 3.2.0 - val datetimeRebaseMode = - Spark32PlusDataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - createParquetFilters( - parquetSchema, - pushDownDate, - pushDownTimestamp, - pushDownDecimal, - pushDownStringStartWith, - pushDownInFilterThreshold, - isCaseSensitive, - datetimeRebaseMode) } filters.map(rebuildFilterFromParquet(_, fileSchema, querySchemaOption.orElse(null))) // Collects all converted Parquet filter predicates. 
Notice that not all predicates can be @@ -246,9 +262,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu val typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = if (shouldUseInternalSchema) { val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema() val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema) - hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json) - SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema) } else { val (implicitTypeChangeInfo, sparkRequestSchema) = HoodieParquetFileFormatHelper.buildImplicitSchemaChangeInfo(hadoopAttemptConf, footerFileMetaData, requiredSchema) @@ -284,7 +298,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu enableOffHeapColumnVector && taskContext.isDefined, capacity, typeChangeInfos) - } else if (HoodieSparkUtils.gteqSpark3_2_1) { + } else { // NOTE: Below code could only be compiled against >= Spark 3.2.1, // and unfortunately won't compile against Spark 3.2.0 // However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1 @@ -300,18 +314,6 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu int96RebaseSpec.timeZone, enableOffHeapColumnVector && taskContext.isDefined, capacity) - } else { - // Spark 3.2.0 - val datetimeRebaseMode = - Spark32PlusDataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - val int96RebaseMode = - Spark32PlusDataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) - createVectorizedParquetRecordReader( - convertTz.orNull, - datetimeRebaseMode.toString, - int96RebaseMode.toString, - enableOffHeapColumnVector && taskContext.isDefined, - capacity) } // SPARK-37089: We cannot register a 
task completion listener to close this iterator here @@ -324,7 +326,6 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu val iter = new RecordReaderIterator(vectorizedReader) try { vectorizedReader.initialize(split, hadoopAttemptContext) - // NOTE: We're making appending of the partitioned values to the rows read from the // data file configurable if (shouldAppendPartitionValues) { @@ -333,7 +334,6 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu } else { vectorizedReader.initBatch(StructType(Nil), InternalRow.empty) } - if (returningBatch) { vectorizedReader.enableReturningBatches() } @@ -349,7 +349,7 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu } } else { logDebug(s"Falling back to parquet-mr") - val readSupport = if (HoodieSparkUtils.gteqSpark3_2_1) { + val readSupport = { // ParquetRecordReader returns InternalRow // NOTE: Below code could only be compiled against >= Spark 3.2.1, // and unfortunately won't compile against Spark 3.2.0 @@ -358,21 +358,13 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) val datetimeRebaseSpec = DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - new ParquetReadSupport( + new HoodieParquetReadSupport( convertTz, enableVectorizedReader = false, + enableTimestampFieldRepair = true, datetimeRebaseSpec, - int96RebaseSpec) - } else { - val datetimeRebaseMode = - Spark32PlusDataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - val int96RebaseMode = - Spark32PlusDataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) - createParquetReadSupport( - convertTz, - /* enableVectorizedReader = */ false, - datetimeRebaseMode, - int96RebaseMode) + 
int96RebaseSpec, + tableSchemaAsMessageType) } val reader = if (pushed.isDefined && enableRecordFilter) { @@ -389,21 +381,8 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu val unsafeProjection = if (typeChangeInfos.isEmpty) { GenerateUnsafeProjection.generate(fullSchema, fullSchema) } else { - // find type changed. - val newFullSchema = new StructType(requiredSchema.fields.zipWithIndex.map { case (f, i) => - if (typeChangeInfos.containsKey(i)) { - StructField(f.name, typeChangeInfos.get(i).getRight, f.nullable, f.metadata) - } else f - }).toAttributes ++ partitionSchema.toAttributes - val castSchema = newFullSchema.zipWithIndex.map { case (attr, i) => - if (typeChangeInfos.containsKey(i)) { - val srcType = typeChangeInfos.get(i).getRight - val dstType = typeChangeInfos.get(i).getLeft - val needTimeZone = Cast.needsTimeZone(srcType, dstType) - Cast(attr, dstType, if (needTimeZone) timeZoneId else None) - } else attr - } - GenerateUnsafeProjection.generate(castSchema, newFullSchema) + HoodieLegacyParquetFileFormatHelper.generateUnsafeProjection( + fullSchema, timeZoneId, typeChangeInfos, requiredSchema, partitionSchema, sparkAdapter.getSchemaUtils) } // NOTE: We're making appending of the partitioned values to the rows read from the @@ -428,43 +407,6 @@ class Spark34LegacyHoodieParquetFileFormat(private val shouldAppendPartitionValu } object Spark34LegacyHoodieParquetFileFormat { - - /** - * NOTE: This method is specific to Spark 3.2.0 - */ - private def createParquetFilters(args: Any*): ParquetFilters = { - // NOTE: ParquetFilters ctor args contain Scala enum, therefore we can't look it - // up by arg types, and have to instead rely on the number of args based on individual class; - // the ctor order is not guaranteed - val ctor = classOf[ParquetFilters].getConstructors.maxBy(_.getParameterCount) - ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) - .asInstanceOf[ParquetFilters] - } - - /** - * NOTE: This method is specific 
to Spark 3.2.0 - */ - private def createParquetReadSupport(args: Any*): ParquetReadSupport = { - // NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it - // up by arg types, and have to instead rely on the number of args based on individual class; - // the ctor order is not guaranteed - val ctor = classOf[ParquetReadSupport].getConstructors.maxBy(_.getParameterCount) - ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) - .asInstanceOf[ParquetReadSupport] - } - - /** - * NOTE: This method is specific to Spark 3.2.0 - */ - private def createVectorizedParquetRecordReader(args: Any*): VectorizedParquetRecordReader = { - // NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it - // up by arg types, and have to instead rely on the number of args based on individual class; - // the ctor order is not guaranteed - val ctor = classOf[VectorizedParquetRecordReader].getConstructors.maxBy(_.getParameterCount) - ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) - .asInstanceOf[VectorizedParquetRecordReader] - } - def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = { val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) if (querySchemaOption.isPresent && requiredSchema.nonEmpty) { @@ -531,4 +473,20 @@ object Spark34LegacyHoodieParquetFileFormat { } } } + + // Helper to repair the schema if needed + private def repairFooterSchema(original: ParquetMetadata, + tableSchemaOpt: HOption[org.apache.parquet.schema.MessageType]): ParquetMetadata = { + val repairedSchema = SchemaRepair.repairLogicalTypes(original.getFileMetaData.getSchema, tableSchemaOpt) + val oldMeta = original.getFileMetaData + new ParquetMetadata( + new FileMetaData( + repairedSchema, + oldMeta.getKeyValueMetaData, + oldMeta.getCreatedBy, + oldMeta.getFileDecryptor + ), + original.getBlocks + ) + } } diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java 
b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java index ad1918eabf8b2..e22fefa436803 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestService.java @@ -29,10 +29,10 @@ import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; import org.apache.hadoop.hive.metastore.IHMSHandler; import org.apache.hadoop.hive.metastore.RetryingHMSHandler; +import java.lang.reflect.Constructor; import org.apache.hadoop.hive.metastore.TSetIpAddressProcessor; import org.apache.hadoop.hive.metastore.TUGIBasedProcessor; import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.thrift.TUGIContainingTransport; import org.apache.hive.service.server.HiveServer2; import org.apache.thrift.TProcessor; import org.apache.thrift.protocol.TBinaryProtocol; @@ -41,7 +41,6 @@ import org.apache.thrift.transport.TFramedTransport; import org.apache.thrift.transport.TServerSocket; import org.apache.thrift.transport.TServerTransport; -import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TTransport; import org.apache.thrift.transport.TTransportException; import org.apache.thrift.transport.TTransportFactory; @@ -51,7 +50,6 @@ import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; -import java.net.SocketException; import java.nio.file.Files; import java.util.HashMap; import java.util.Map; @@ -149,7 +147,15 @@ public HiveConf configureHive(Configuration hadoopConf, String localHiveLocation hadoopConf.set("datanucleus.schema.autoCreateTables", "true"); hadoopConf.set("datanucleus.autoCreateSchema", "true"); hadoopConf.set("datanucleus.fixedDatastore", "false"); + // Additional DataNucleus properties for Hive 3.x compatibility + hadoopConf.set("datanucleus.schema.autoCreateAll", "true"); + hadoopConf.set("datanucleus.validateTables", "false"); + 
hadoopConf.set("datanucleus.validateConstraints", "false"); HiveConf conf = new HiveConf(hadoopConf, HiveConf.class); + // Also set in HiveConf for Hive 3.x + conf.set("datanucleus.schema.autoCreateAll", "true"); + conf.set("datanucleus.validateTables", "false"); + conf.set("datanucleus.validateConstraints", "false"); conf.setBoolVar(ConfVars.HIVE_IN_TEST, true); conf.setBoolVar(ConfVars.METASTORE_SCHEMA_VERIFICATION, false); final int hs2ThriftPort = hadoopConf.getInt(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, HS2_THRIFT_PORT); @@ -244,7 +250,18 @@ private ChainedTTransportFactory(TTransportFactory parentTransFactory, TTranspor @Override public TTransport getTransport(TTransport trans) { - return childTransFactory.getTransport(parentTransFactory.getTransport(trans)); + try { + TTransport parentTransport = parentTransFactory.getTransport(trans); + return childTransFactory.getTransport(parentTransport); + } catch (Exception e) { + // In Hive 2.x, TTransportFactory.getTransport() doesn't throw checked exceptions, + // but the underlying methods might throw TTransportException or other exceptions. + // Wrap any exception in RuntimeException for compatibility with Hive 2.x signature. 
+ if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException("Failed to get transport", e); + } } } @@ -257,17 +274,6 @@ public TServerSocketKeepAlive(int port) throws TTransportException { public TServerSocketKeepAlive(InetSocketAddress address) throws TTransportException { super(address, 0); } - - @Override - protected TSocket acceptImpl() throws TTransportException { - TSocket ts = super.acceptImpl(); - try { - ts.getSocket().setKeepAlive(true); - } catch (SocketException e) { - throw new TTransportException(e); - } - return ts; - } } private TServer startMetaStore(HiveConf conf) throws IOException { @@ -288,13 +294,20 @@ private TServer startMetaStore(HiveConf conf) throws IOException { TProcessor processor; TTransportFactory transFactory; - HiveMetaStore.HMSHandler baseHandler = new HiveMetaStore.HMSHandler("new db based metaserver", conf, false); + // Use reflection to handle different HMSHandler constructor signatures between Hive 2.x and 3.x + // Hive 2.x: HMSHandler(String name, HiveConf conf, boolean allowEmbedded) + // Hive 3.x: HMSHandler(String name, HiveConf conf) + HiveMetaStore.HMSHandler baseHandler = createHMSHandler(conf); IHMSHandler handler = RetryingHMSHandler.getProxy(conf, baseHandler, true); if (conf.getBoolVar(ConfVars.METASTORE_EXECUTE_SET_UGI)) { + // Use reflection to handle different TUGIContainingTransport classes between Hive 2.x and 3.x + // Hive 2.x uses: org.apache.hadoop.hive.thrift.TUGIContainingTransport + // Hive 3.x uses: org.apache.hadoop.hive.metastore.security.TUGIContainingTransport + TTransportFactory tugiFactory = createTUGIContainingTransportFactory(); transFactory = useFramedTransport - ? new ChainedTTransportFactory(new TFramedTransport.Factory(), new TUGIContainingTransport.Factory()) - : new TUGIContainingTransport.Factory(); + ? 
new ChainedTTransportFactory(new TFramedTransport.Factory(), tugiFactory) + : tugiFactory; processor = new TUGIBasedProcessor<>(handler); LOG.info("Starting DB backed MetaStore Server with SetUGI enabled"); @@ -315,4 +328,85 @@ private TServer startMetaStore(HiveConf conf) throws IOException { throw new IOException(x); } } + + /** + * Creates an HMSHandler instance using reflection to support both Hive 2.x and 3.x. + * Hive 3.1.3 uses: HMSHandler(String name, Configuration conf) or HMSHandler(String name, Configuration conf, boolean) + * Hive 2.x uses: HMSHandler(String name, HiveConf conf, boolean allowEmbedded) + * Some versions may use: HMSHandler(String name, HiveConf conf) + */ + private HiveMetaStore.HMSHandler createHMSHandler(HiveConf conf) throws IOException { + String handlerName = "new db based metaserver"; + Class hmsHandlerClass = HiveMetaStore.HMSHandler.class; + + // Try Hive 3.x constructor with Configuration (2 parameters: String, Configuration) + try { + Constructor constructor = hmsHandlerClass.getConstructor(String.class, Configuration.class); + return (HiveMetaStore.HMSHandler) constructor.newInstance(handlerName, conf); + } catch (NoSuchMethodException e) { + // Continue to next option + } catch (Exception e) { + throw new IOException("Failed to create HMSHandler using (String, Configuration) constructor", e); + } + + // Try Hive 3.x constructor with Configuration (3 parameters: String, Configuration, boolean) + try { + Constructor constructor = hmsHandlerClass.getConstructor(String.class, Configuration.class, boolean.class); + return (HiveMetaStore.HMSHandler) constructor.newInstance(handlerName, conf, false); + } catch (NoSuchMethodException e) { + // Continue to next option + } catch (Exception e) { + throw new IOException("Failed to create HMSHandler using (String, Configuration, boolean) constructor", e); + } + + // Try Hive 3.x constructor with HiveConf (2 parameters: String, HiveConf) + try { + Constructor constructor = 
hmsHandlerClass.getConstructor(String.class, HiveConf.class); + return (HiveMetaStore.HMSHandler) constructor.newInstance(handlerName, conf); + } catch (NoSuchMethodException e) { + // Continue to next option + } catch (Exception e) { + throw new IOException("Failed to create HMSHandler using (String, HiveConf) constructor", e); + } + + // Try Hive 2.x constructor (3 parameters: String, HiveConf, boolean) + try { + Constructor constructor = hmsHandlerClass.getConstructor(String.class, HiveConf.class, boolean.class); + return (HiveMetaStore.HMSHandler) constructor.newInstance(handlerName, conf, false); + } catch (NoSuchMethodException e) { + throw new IOException("Failed to create HMSHandler. No compatible constructor found. " + + "Available constructors: " + java.util.Arrays.toString(hmsHandlerClass.getConstructors()), e); + } catch (Exception e) { + throw new IOException("Failed to create HMSHandler using (String, HiveConf, boolean) constructor", e); + } + } + + /** + * Creates a TUGIContainingTransport.Factory instance using reflection to support both Hive 2.x and 3.x. 
+ * Hive 2.x uses: org.apache.hadoop.hive.thrift.TUGIContainingTransport + * Hive 3.x uses: org.apache.hadoop.hive.metastore.security.TUGIContainingTransport + */ + private TTransportFactory createTUGIContainingTransportFactory() throws IOException { + // Try Hive 3.x first (metastore.security package) + try { + Class factoryClass = Class.forName("org.apache.hadoop.hive.metastore.security.TUGIContainingTransport$Factory"); + Constructor factoryConstructor = factoryClass.getConstructor(); + return (TTransportFactory) factoryConstructor.newInstance(); + } catch (ClassNotFoundException e) { + // Hive 3.x class not found, try Hive 2.x + } catch (Exception e) { + throw new IOException("Failed to create TUGIContainingTransport.Factory using Hive 3.x class", e); + } + + // Try Hive 2.x (thrift package) + try { + Class factoryClass = Class.forName("org.apache.hadoop.hive.thrift.TUGIContainingTransport$Factory"); + Constructor factoryConstructor = factoryClass.getConstructor(); + return (TTransportFactory) factoryConstructor.newInstance(); + } catch (ClassNotFoundException e) { + throw new IOException("Failed to create TUGIContainingTransport.Factory. Neither Hive 2.x nor 3.x class found", e); + } catch (Exception e) { + throw new IOException("Failed to create TUGIContainingTransport.Factory using Hive 2.x class", e); + } + } } diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index f771f29154ebf..b462b5d18b278 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml @@ -88,10 +88,38 @@ kryo-shaded
    + + + org.eclipse.jetty + jetty-server + + + org.eclipse.jetty + jetty-servlet + + + org.eclipse.jetty + jetty-http + + + org.eclipse.jetty + jetty-io + org.eclipse.jetty jetty-util - ${jetty.version} + + + org.eclipse.jetty + jetty-webapp + + + org.eclipse.jetty + jetty-xml + + + org.eclipse.jetty + jetty-security @@ -118,6 +146,13 @@ io.javalin javalin ${javalin.version} + + + + org.eclipse.jetty + * + + @@ -139,6 +174,15 @@ javax.servlet * + + + org.eclipse.jetty + * + + + org.eclipse.jetty.aggregate + * + diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml index c61cf64968c2e..cb28e3a3ec264 100644 --- a/hudi-utilities/pom.xml +++ b/hudi-utilities/pom.xml @@ -66,6 +66,28 @@ com.github.os72 protoc-jar-maven-plugin + 3.11.4 + + + proto-test-compile + generate-test-sources + + run + + + test + ${project.build.directory}/generated-test-sources/protobuf + + src/test/resources + + + + + + com.google.protobuf:protoc:${proto.version} + ${protoc.version} + true + @@ -132,6 +154,12 @@ + + org.apache.hudi + hudi-spark-common_${scala.binary.version} + ${project.version} + + @@ -186,6 +214,14 @@ org.apache.kafka kafka-clients ${kafka.version} + + + + + io.netty + * + + @@ -202,6 +238,12 @@ com.google.protobuf protobuf-java + + + + io.netty + * + @@ -368,6 +410,11 @@ kafka-schema-registry-client ${confluent.version} + + io.confluent + kafka-protobuf-serializer + ${confluent.version} + @@ -487,5 +534,19 @@ + + + + org.apache.thrift + libthrift + ${thrift.version} + test + + + org.apache.hudi + hudi-spark-common_2.12 + 0.14.2-SNAPSHOT + test + diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java index 856b5266c97cb..79cfa0b1e9c2a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ 
-174,6 +174,8 @@ public class HoodieMetadataTableValidator implements Serializable { private final String taskLabels; + private final List throwables = new ArrayList<>(); + public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { this.jsc = jsc; this.cfg = cfg; @@ -191,6 +193,27 @@ public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { this.taskLabels = generateValidationTaskLabels(); } + /** + * Returns list of Throwable which were encountered during validation. This method is useful + * when ignoreFailed parameter is set to true. + */ + public List getThrowables() { + return throwables; + } + + /** + * Returns true if there is a validation failure encountered during validation. + * This method is useful when ignoreFailed parameter is set to true. + */ + public boolean hasValidationFailure() { + for (Throwable throwable : throwables) { + if (throwable instanceof HoodieValidationException) { + return true; + } + } + return false; + } + private String generateValidationTaskLabels() { List labelList = new ArrayList<>(); if (cfg.validateLatestBaseFiles) { @@ -430,6 +453,7 @@ private boolean doHoodieMetadataTableValidationOnce() { if (!cfg.ignoreFailed) { throw e; } + throwables.add(e); return false; } } @@ -494,12 +518,12 @@ public boolean doMetadataTableValidation() { HoodieMetadataValidationContext fsBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false)) { Set finalBaseFilesForCleaning = baseFilesForCleaning; - List> result = new ArrayList<>( + List> result = new ArrayList<>( engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> { try { validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning); LOG.info(String.format("Metadata table validation succeeded for partition %s (partition %s)", partitionPath, taskLabels)); - return Pair.of(true, ""); + return Pair.of(true, null); } catch (HoodieValidationException e) { 
LOG.error( String.format("Metadata table validation failed for partition %s due to HoodieValidationException (partition %s)", @@ -507,26 +531,25 @@ public boolean doMetadataTableValidation() { if (!cfg.ignoreFailed) { throw e; } - return Pair.of(false, e.getMessage() + " for partition: " + partitionPath); + return Pair.of(false, new HoodieValidationException("Validation failed for partition: " + partitionPath, e)); } }).collectAsList()); try { validateRecordIndex(engineContext, metaClient, metadataTableBasedContext.getTableMetadata()); - result.add(Pair.of(true, "")); + result.add(Pair.of(true, null)); } catch (HoodieValidationException e) { - LOG.error( - "Metadata table validation failed due to HoodieValidationException in record index validation", e); - if (!cfg.ignoreFailed) { - throw e; - } - result.add(Pair.of(false, e.getMessage())); + handleValidationException( + e, result, "Metadata table validation failed due to HoodieValidationException in partition stats validation"); } - for (Pair res : result) { + for (Pair res : result) { finalResult &= res.getKey(); if (res.getKey().equals(false)) { - LOG.error("Metadata Validation failed for table: " + cfg.basePath + " with error: " + res.getValue()); + LOG.error("Metadata Validation failed for table: {}", cfg.basePath, res.getValue()); + if (res.getRight() != null) { + throwables.add(res.getRight()); + } } } @@ -544,6 +567,14 @@ public boolean doMetadataTableValidation() { } } + private void handleValidationException(HoodieValidationException e, List> result, String errorMsg) { + LOG.error("{} for table: {} ", errorMsg, cfg.basePath, e); + if (!cfg.ignoreFailed) { + throw e; + } + result.add(Pair.of(false, e)); + } + /** * Check metadata is initialized and available to ready. * If not we will log.warn and skip current validation. @@ -577,7 +608,7 @@ private boolean checkMetadataTableIsAvailable() { /** * Compare the listing partitions result between metadata table and fileSystem. 
*/ - private List validatePartitions(HoodieSparkEngineContext engineContext, String basePath) { + List validatePartitions(HoodieSparkEngineContext engineContext, String basePath) { // compare partitions List allPartitionPathsFromFS = FSUtils.getAllPartitionPaths(engineContext, basePath, false, cfg.assumeDatePartitioning); HoodieTimeline completedTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deser/KafkaAvroSchemaDeserializer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deser/KafkaAvroSchemaDeserializer.java index 246be5f8ec614..4673eceed1577 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deser/KafkaAvroSchemaDeserializer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deser/KafkaAvroSchemaDeserializer.java @@ -60,7 +60,6 @@ public void configure(Map configs, boolean isKey) { /** * We need to inject sourceSchema instead of reader schema during deserialization or later stages of the pipeline. 
* - * @param includeSchemaAndVersion * @param topic * @param isKey * @param payload @@ -70,13 +69,12 @@ public void configure(Map configs, boolean isKey) { */ @Override protected Object deserialize( - boolean includeSchemaAndVersion, String topic, Boolean isKey, byte[] payload, Schema readerSchema) throws SerializationException { - return super.deserialize(includeSchemaAndVersion, topic, isKey, payload, sourceSchema); + return super.deserialize(topic, isKey, payload, sourceSchema); } protected TypedProperties getConvertToTypedProperties(Map configs) { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/ChainedSchemaPostProcessor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/ChainedSchemaPostProcessor.java index 0295e80bed8be..b5cb01d72a69a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/ChainedSchemaPostProcessor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/ChainedSchemaPostProcessor.java @@ -48,6 +48,6 @@ public Schema processSchema(Schema schema) { for (SchemaPostProcessor processor : processors) { targetSchema = processor.processSchema(targetSchema); } - return targetSchema; + return schema; } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java index 11998f2cfacdc..8ed11bd323e85 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java @@ -19,8 +19,7 @@ package org.apache.hudi.utilities.streamer; -import org.apache.hudi.DataSourceWriteOptions; -import org.apache.hudi.HoodieWriterUtils; +import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.async.AsyncClusteringService; import org.apache.hudi.async.AsyncCompactService; import 
org.apache.hudi.async.SparkAsyncClusteringService; @@ -51,6 +50,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.HoodieWriterUtils; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringUpdateException; import org.apache.hudi.exception.HoodieException; @@ -176,7 +176,7 @@ private static TypedProperties combineProperties(Config cfg, Option writeOptions = new HashMap<>(); + writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table"); + writeOptions.put("hoodie.table.name", "test_table"); + writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ"); + writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp"); + writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition_path"); + + Dataset inserts = makeInsertDf("000", 5).cache(); + inserts.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value()) + .option(HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key(), "true") + .option(HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key(), "1") + .option(HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key(), "1") + .mode(SaveMode.Overwrite) + .save(basePath); + Dataset updates = makeUpdateDf("001", 5).cache(); + updates.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.UPSERT.value()) + .option(HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key(), "true") + .option(HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key(), "1") + .option(HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key(), "1") + .mode(SaveMode.Append) + .save(basePath); + + // validate MDT + 
HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config(); + config.basePath = basePath; + config.validateLatestFileSlices = true; + config.validateAllFileGroups = true; + HoodieMetadataTableValidator validator = new HoodieMetadataTableValidator(jsc, config); + assertTrue(validator.run()); + assertFalse(validator.hasValidationFailure()); + assertTrue(validator.getThrowables().isEmpty()); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testAdditionalPartitionsinMDT(boolean testFailureCase) throws InterruptedException { + Map writeOptions = new HashMap<>(); + writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table"); + writeOptions.put("hoodie.table.name", "test_table"); + writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ"); + writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp"); + writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition_path"); + + // constructor of HoodieMetadataValidator instantiates HoodieTableMetaClient. hence creating an actual table. but rest of tests is mocked. 
+ Dataset inserts = makeInsertDf("000", 5).cache(); + inserts.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value()) + .mode(SaveMode.Overwrite) + .save(basePath); + + HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config(); + config.basePath = basePath; + config.validateLatestFileSlices = true; + config.validateAllFileGroups = true; + MockHoodieMetadataTableValidator validator = new MockHoodieMetadataTableValidator(jsc, config); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + HoodieTableMetaClient metaClient = mock(HoodieTableMetaClient.class); + + String partition1 = "PARTITION1"; + String partition2 = "PARTITION2"; + String partition3 = "PARTITION3"; + + // mock list of partitions to return from MDT to have 1 additional partition compared to FS based listing. + List mdtPartitions = Arrays.asList(partition1, partition2, partition3); + validator.setMetadataPartitionsToReturn(mdtPartitions); + List fsPartitions = Arrays.asList(partition1, partition2); + validator.setFsPartitionsToReturn(fsPartitions); + + // mock completed timeline. + HoodieTimeline commitsTimeline = mock(HoodieTimeline.class); + HoodieTimeline completedTimeline = mock(HoodieTimeline.class); + when(metaClient.getCommitsTimeline()).thenReturn(commitsTimeline); + when(commitsTimeline.filterCompletedInstants()).thenReturn(completedTimeline); + + if (testFailureCase) { + // 3rd partition which is additional in MDT should have creation time before last instant in timeline. 
+ + String partition3CreationTime = HoodieActiveTimeline.createNewInstantTime(); + Thread.sleep(100); + String lastIntantCreationTime = HoodieActiveTimeline.createNewInstantTime(); + + HoodieInstant lastInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, lastIntantCreationTime); + when(completedTimeline.lastInstant()).thenReturn(Option.of(lastInstant)); + validator.setPartitionCreationTime(Option.of(partition3CreationTime)); + // validate that exception is thrown since MDT has one additional partition. + assertThrows(HoodieValidationException.class, () -> { + validator.validatePartitions(engineContext, basePath); + }); + } else { + // 3rd partition creation time is > last completed instant + HoodieInstant lastInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, HoodieActiveTimeline.createNewInstantTime()); + when(completedTimeline.lastInstant()).thenReturn(Option.of(lastInstant)); + Thread.sleep(100); + validator.setPartitionCreationTime(Option.of(HoodieActiveTimeline.createNewInstantTime())); + + // validate that all 3 partitions are returned + assertEquals(mdtPartitions, validator.validatePartitions(engineContext, basePath)); + } + } + + class MockHoodieMetadataTableValidator extends HoodieMetadataTableValidator { + + private List metadataPartitionsToReturn; + private List fsPartitionsToReturn; + private Option partitionCreationTime; + + public MockHoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { + super(jsc, cfg); + } + + void setMetadataPartitionsToReturn(List metadataPartitionsToReturn) { + this.metadataPartitionsToReturn = metadataPartitionsToReturn; + } + + void setFsPartitionsToReturn(List fsPartitionsToReturn) { + this.fsPartitionsToReturn = fsPartitionsToReturn; + } + + void setPartitionCreationTime(Option partitionCreationTime) { + this.partitionCreationTime = partitionCreationTime; + } + } + + @Test + public void testRliValidationFalsePositiveCase() throws 
IOException { + Map writeOptions = new HashMap<>(); + writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table"); + writeOptions.put("hoodie.table.name", "test_table"); + writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ"); + writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key"); + writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp"); + writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition_path"); + + Dataset inserts = makeInsertDf("000", 5).cache(); + inserts.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value()) + .option(HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key(), "true") + .option(HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key(), "1") + .option(HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key(), "1") + .mode(SaveMode.Overwrite) + .save(basePath); + Dataset updates = makeUpdateDf("001", 5).cache(); + updates.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.UPSERT.value()) + .option(HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key(), "true") + .option(HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key(), "1") + .option(HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key(), "1") + .mode(SaveMode.Append) + .save(basePath); + + Dataset inserts2 = makeInsertDf("002", 5).cache(); + inserts2.write().format("hudi").options(writeOptions) + .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value()) + .option(HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key(), "true") + .option(HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key(), "1") + .option(HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key(), "1") + .mode(SaveMode.Append) + .save(basePath); + + // validate MDT + 
HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config(); + config.basePath = "file://" + basePath; + config.validateLatestFileSlices = true; + config.validateAllFileGroups = true; + + // lets ensure we have a pending commit when FS based polling is done. and the commit completes when MDT is polled. + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(basePath).setConf( + new Configuration(jsc.hadoopConfiguration())).build(); + // moving out the completed commit meta file to a temp location + HoodieInstant lastInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().get(); + String latestCompletedCommitMetaFile = basePath + "/.hoodie/" + lastInstant.getFileName(); + String tempDir = getTempLocation(); + String destFilePath = tempDir + "/" + lastInstant.getFileName(); + new File(latestCompletedCommitMetaFile).renameTo(new File(destFilePath)); + + MockHoodieMetadataTableValidatorForRli validator = new MockHoodieMetadataTableValidatorForRli(jsc, config); + validator.setOriginalFilePath(latestCompletedCommitMetaFile); + validator.setDestFilePath(destFilePath); + assertTrue(validator.run()); + assertFalse(validator.hasValidationFailure()); + assertTrue(validator.getThrowables().isEmpty()); + } + + /** + * Class to assist with testing a false positive case with RLI validation. 
+ */ + static class MockHoodieMetadataTableValidatorForRli extends HoodieMetadataTableValidator { + + private String destFilePath; + private String originalFilePath; + + public MockHoodieMetadataTableValidatorForRli(JavaSparkContext jsc, Config cfg) { + super(jsc, cfg); + } + + public void setDestFilePath(String destFilePath) { + this.destFilePath = destFilePath; + } + + public void setOriginalFilePath(String originalFilePath) { + this.originalFilePath = originalFilePath; + } + } + + private String getTempLocation() { + try { + String folderName = "temp_location"; + java.nio.file.Path tempPath = tempDir.resolve(folderName); + java.nio.file.Files.createDirectories(tempPath); + return tempPath.toAbsolutePath().toString(); + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + + protected Dataset makeInsertDf(String instantTime, Integer n) { + List records = dataGen.generateInserts(instantTime, n).stream() + .map(r -> recordToString(r).get()).collect(Collectors.toList()); + JavaRDD rdd = jsc.parallelize(records); + return sparkSession.read().json(rdd); + } + + protected Dataset makeUpdateDf(String instantTime, Integer n) { + try { + List records = dataGen.generateUpdates(instantTime, n).stream() + .map(r -> recordToString(r).get()).collect(Collectors.toList()); + JavaRDD rdd = jsc.parallelize(records); + return sparkSession.read().json(rdd); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java index 87f875642be33..343ac46a6e8a9 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerTestBase.java @@ -161,6 +161,7 @@ protected static void 
prepareInitialConfigs(FileSystem dfs, String dfsBasePath, UtilitiesTestBase.Helpers.copyToDFS("streamer-config/target.avsc", dfs, dfsBasePath + "/target.avsc"); UtilitiesTestBase.Helpers.copyToDFS("streamer-config/target-flattened.avsc", dfs, dfsBasePath + "/target-flattened.avsc"); + UtilitiesTestBase.Helpers.copyToDFS("streamer-config/source-timestamp-millis.avsc", dfs, dfsBasePath + "/source-timestamp-millis.avsc"); UtilitiesTestBase.Helpers.copyToDFS("streamer-config/source_short_trip_uber.avsc", dfs, dfsBasePath + "/source_short_trip_uber.avsc"); UtilitiesTestBase.Helpers.copyToDFS("streamer-config/source_uber.avsc", dfs, dfsBasePath + "/source_uber.avsc"); UtilitiesTestBase.Helpers.copyToDFS("streamer-config/target_short_trip_uber.avsc", dfs, dfsBasePath + "/target_short_trip_uber.avsc"); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java index 60ed1b6732a58..7b21ef0880239 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamer.java @@ -22,6 +22,7 @@ import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.HoodieSparkRecordMerger; +import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.HoodieSparkUtils$; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.transaction.lock.InProcessLockProvider; @@ -31,7 +32,10 @@ import org.apache.hudi.common.config.HoodieStorageConfig; import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.DefaultHoodieRecordPayload; +import 
org.apache.hudi.common.model.HoodieAvroRecordMerger; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; @@ -45,10 +49,12 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineUtils; +import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; @@ -91,6 +97,7 @@ import org.apache.hudi.utilities.streamer.NoNewDataTerminationStrategy; import org.apache.hudi.utilities.testutils.JdbcTestUtils; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; +import org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource; import org.apache.hudi.utilities.testutils.sources.DistributedTestDataSource; import org.apache.hudi.utilities.transform.SqlQueryBasedTransformer; import org.apache.hudi.utilities.transform.Transformer; @@ -131,6 +138,8 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.net.URI; +import java.nio.file.Paths; import java.sql.Connection; import java.sql.DriverManager; import java.time.Instant; @@ -637,6 +646,503 @@ public void testSchemaEvolution(String tableType, boolean useUserProvidedSchema, defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName(); } + @Test + public void testTimestampMillis() throws Exception { + String tableBasePath = basePath + "/testTimestampMillis"; + 
defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName(); + // Insert data produced with Schema A, pass Schema A + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), + PROPS_FILENAME_TEST_SOURCE, false, true, false, null, HoodieTableType.MERGE_ON_READ.name()); + cfg.payloadClassName = DefaultHoodieRecordPayload.class.getName(); + cfg.configs.add("hoodie.streamer.schemaprovider.source.schema.file=" + basePath + "/source-timestamp-millis.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.target.schema.file=" + basePath + "/source-timestamp-millis.avsc"); + cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); + cfg.configs.add("hoodie.datasource.write.row.writer.enable=false"); + + new HoodieDeltaStreamer(cfg, jsc).sync(); + assertRecordCount(1000, tableBasePath, sqlContext); + TestHelpers.assertCommitMetadata("00000", tableBasePath, fs, 1); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setBasePath(cfg.targetBasePath) + .setConf(new Configuration()).build(); + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableSchema = tableSchemaResolver.getTableAvroSchema(false); + assertEquals("timestamp-millis", tableSchema.getField("current_ts").schema().getLogicalType().getName()); + assertEquals(1000, sparkSession.read().options(hudiOpts).format("org.apache.hudi").load(tableBasePath).filter("current_ts > '1980-01-01'").count()); + + cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), + PROPS_FILENAME_TEST_SOURCE, false, true, false, null, HoodieTableType.MERGE_ON_READ.name()); + cfg.payloadClassName = DefaultHoodieRecordPayload.class.getName(); + cfg.configs.add("hoodie.streamer.schemaprovider.source.schema.file=" + basePath + 
"/source-timestamp-millis.avsc"); + cfg.configs.add("hoodie.streamer.schemaprovider.target.schema.file=" + basePath + "/source-timestamp-millis.avsc"); + cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); + cfg.configs.add("hoodie.datasource.write.row.writer.enable=false"); + + new HoodieDeltaStreamer(cfg, jsc).sync(); + assertRecordCount(1450, tableBasePath, sqlContext); + TestHelpers.assertCommitMetadata("00001", tableBasePath, fs, 2); + tableSchema = tableSchemaResolver.getTableAvroSchema(false); + assertEquals("timestamp-millis", tableSchema.getField("current_ts").schema().getLogicalType().getName()); + sqlContext.clearCache(); + assertEquals(1450, sqlContext.read().options(hudiOpts).format("org.apache.hudi").load(tableBasePath).filter("current_ts > '1980-01-01'").count()); + assertEquals(1450, sqlContext.read().options(hudiOpts).format("org.apache.hudi").load(tableBasePath).filter("current_ts < '2080-01-01'").count()); + assertEquals(0, sqlContext.read().options(hudiOpts).format("org.apache.hudi").load(tableBasePath).filter("current_ts < '1980-01-01'").count()); + } + + @Test + public void testLogicalTypes() throws Exception { + try { + String tableBasePath = basePath + "/testTimestampMillis"; + defaultSchemaProviderClassName = TestHoodieDeltaStreamerSchemaEvolutionBase.TestSchemaProvider.class.getName(); + + if (HoodieSparkUtils.isSpark3_3()) { + TestHoodieDeltaStreamerSchemaEvolutionBase.TestSchemaProvider.sourceSchema = HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS; + TestHoodieDeltaStreamerSchemaEvolutionBase.TestSchemaProvider.targetSchema = HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS; + AbstractBaseTestSource.schemaStr = HoodieTestDataGenerator.TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS; + AbstractBaseTestSource.avroSchema = HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA_NO_LTS; + } else { + TestHoodieDeltaStreamerSchemaEvolutionBase.TestSchemaProvider.sourceSchema = 
HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA; + TestHoodieDeltaStreamerSchemaEvolutionBase.TestSchemaProvider.targetSchema = HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA; + AbstractBaseTestSource.schemaStr = HoodieTestDataGenerator.TRIP_LOGICAL_TYPES_SCHEMA; + AbstractBaseTestSource.avroSchema = HoodieTestDataGenerator.AVRO_TRIP_LOGICAL_TYPES_SCHEMA; + } + + // Insert data produced with Schema A, pass Schema A + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), + PROPS_FILENAME_TEST_SOURCE, false, true, false, null, HoodieTableType.MERGE_ON_READ.name()); + cfg.payloadClassName = DefaultHoodieRecordPayload.class.getName(); + cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); + cfg.configs.add("hoodie.datasource.write.row.writer.enable=false"); + + new HoodieDeltaStreamer(cfg, jsc).sync(); + assertRecordCount(1000, tableBasePath, sqlContext); + TestHelpers.assertCommitMetadata("00000", tableBasePath, fs, 1); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setBasePath(cfg.targetBasePath) + .setConf(new Configuration()).build(); + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableSchema = tableSchemaResolver.getTableAvroSchema(false); + Map hudiOpts = new HashMap<>(); + hudiOpts.put("hoodie.datasource.write.recordkey.field", "id"); + logicalAssertions(tableSchema, tableBasePath, hudiOpts, HoodieTableVersion.current().versionCode()); + + cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, Collections.singletonList(TestIdentityTransformer.class.getName()), + PROPS_FILENAME_TEST_SOURCE, false, true, false, null, HoodieTableType.MERGE_ON_READ.name()); + cfg.payloadClassName = DefaultHoodieRecordPayload.class.getName(); + cfg.configs.add(String.format("%s=%s", 
HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); + cfg.configs.add("hoodie.datasource.write.row.writer.enable=false"); + + new HoodieDeltaStreamer(cfg, jsc).sync(); + assertRecordCount(1450, tableBasePath, sqlContext); + TestHelpers.assertCommitMetadata("00001", tableBasePath, fs, 2); + tableSchemaResolver = new TableSchemaResolver(metaClient); + tableSchema = tableSchemaResolver.getTableAvroSchema(false); + logicalAssertions(tableSchema, tableBasePath, hudiOpts, HoodieTableVersion.current().versionCode()); + } finally { + defaultSchemaProviderClassName = FilebasedSchemaProvider.class.getName(); + AbstractBaseTestSource.schemaStr = HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; + AbstractBaseTestSource.avroSchema = HoodieTestDataGenerator.AVRO_SCHEMA; + } + } + + private void logicalAssertions(Schema tableSchema, String tableBasePath, Map hudiOpts, int tableVersion) { + assertEquals("timestamp-micros", tableSchema.getField("ts_micros").schema().getLogicalType().getName()); + assertEquals("date", tableSchema.getField("event_date").schema().getLogicalType().getName()); + + sqlContext.clearCache(); + Dataset df = sqlContext.read() + .options(hudiOpts) + .format("org.apache.hudi") + .load(tableBasePath); + + long totalCount = df.count(); + long expectedHalf = totalCount / 2; + long tolerance = totalCount / 20; + if (totalCount < 100) { + tolerance = totalCount / 4; + } + + assertHalfSplit(df, "ts_micros > timestamp('2020-06-01 12:00:00Z')", expectedHalf, tolerance, "ts_micros > threshold"); + assertHalfSplit(df, "ts_micros < timestamp('2020-06-01 12:00:00Z')", expectedHalf, tolerance, "ts_micros < threshold"); + assertBoundaryCounts(df, "ts_micros > timestamp('2020-06-01 12:00:00.000001Z')", "ts_micros <= timestamp('2020-06-01 12:00:00.000001Z')", totalCount); + assertBoundaryCounts(df, "ts_micros < timestamp('2020-06-01 11:59:59.999999Z')", "ts_micros >= timestamp('2020-06-01 11:59:59.999999Z')", totalCount); + + if (!HoodieSparkUtils.isSpark3_3()) { + 
assertHalfSplit(df, "local_ts_millis > CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)", expectedHalf, tolerance, "local_ts_millis > threshold"); + assertHalfSplit(df, "local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)", expectedHalf, tolerance, "local_ts_millis < threshold"); + assertBoundaryCounts(df, "local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)", "local_ts_millis <= CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)", totalCount); + assertBoundaryCounts(df, "local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)", "local_ts_millis >= CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)", totalCount); + + assertHalfSplit(df, "local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)", expectedHalf, tolerance, "local_ts_micros > threshold"); + assertHalfSplit(df, "local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)", expectedHalf, tolerance, "local_ts_micros < threshold"); + assertBoundaryCounts(df, "local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)", "local_ts_micros <= CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)", totalCount); + assertBoundaryCounts(df, "local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)", "local_ts_micros >= CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)", totalCount); + } + + assertHalfSplit(df, "event_date > date('2000-01-01')", expectedHalf, tolerance, "event_date > threshold"); + assertHalfSplit(df, "event_date < date('2000-01-01')", expectedHalf, tolerance, "event_date < threshold"); + assertBoundaryCounts(df, "event_date > date('2000-01-02')", "event_date <= date('2000-01-02')", totalCount); + assertBoundaryCounts(df, "event_date < date('1999-12-31')", "event_date >= date('1999-12-31')", totalCount); + } + + private void assertHalfSplit(Dataset df, String filterExpr, long expectedHalf, long tolerance, String msg) { + long count = df.filter(filterExpr).count(); + assertTrue(Math.abs(count - expectedHalf) <= tolerance, msg + " (got=" + 
count + ", expected=" + expectedHalf + ")"); + } + + private void assertBoundaryCounts(Dataset df, String exprZero, String exprTotal, long totalCount) { + assertEquals(0, df.filter(exprZero).count(), exprZero); + assertEquals(totalCount, df.filter(exprTotal).count(), exprTotal); + } + + @ParameterizedTest + @CsvSource(value = {"SIX,AVRO,CLUSTER", "CURRENT,AVRO,NONE", "CURRENT,AVRO,CLUSTER", "CURRENT,SPARK,NONE", "CURRENT,SPARK,CLUSTER"}) + void testCOWLogicalRepair(String tableVersion, String recordType, String operation) throws Throwable { + timestampNTZCompatibility(() -> { + try { + String dirName = "trips_logical_types_json_cow_write"; + String dataPath = basePath + "/" + dirName; + java.nio.file.Path zipOutput = Paths.get(new URI(dataPath)); + HoodieTestUtils.extractZipToDirectory("logical-repair/" + dirName + ".zip", zipOutput, getClass()); + String tableBasePath = zipOutput.toString(); + + TypedProperties properties = new TypedProperties(); + String schemaPath = getClass().getClassLoader().getResource("logical-repair/schema.avsc").toURI().toString(); + properties.setProperty("hoodie.streamer.schemaprovider.source.schema.file", schemaPath); + properties.setProperty("hoodie.streamer.schemaprovider.target.schema.file", schemaPath); + String inputDataPath = getClass().getClassLoader().getResource("logical-repair/cow_write_updates/2").toURI().toString(); + properties.setProperty("hoodie.streamer.source.dfs.root", inputDataPath); + properties.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + properties.setProperty("hoodie.datasource.write.precombine.field", "timestamp"); + properties.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); + properties.setProperty("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator"); + properties.setProperty("hoodie.cleaner.policy", "KEEP_LATEST_COMMITS"); + properties.setProperty("hoodie.compact.inline", "false"); + 
properties.setProperty("hoodie.metatata.enable", "true"); + properties.setProperty("hoodie.parquet.small.file.limit", "-1"); + properties.setProperty("hoodie.cleaner.commits.retained", "10"); + Option propt = Option.of(properties); + + new HoodieStreamer(prepCfgForCowLogicalRepair(tableBasePath, "456"), jsc, propt).sync(); + inputDataPath = getClass().getClassLoader().getResource("logical-repair/cow_write_updates/3").toURI().toString(); + propt.get().setProperty("hoodie.streamer.source.dfs.root", inputDataPath); + if ("CLUSTER".equals(operation)) { + propt.get().setProperty("hoodie.clustering.inline", "true"); + propt.get().setProperty("hoodie.clustering.inline.max.commits", "1"); + propt.get().setProperty("hoodie.clustering.plan.strategy.single.group.clustering.enabled", "true"); + propt.get().setProperty("hoodie.clustering.plan.strategy.sort.columns", "ts_millis,_row_key"); + } + new HoodieStreamer(prepCfgForCowLogicalRepair(tableBasePath, "789"), jsc, propt).sync(); + String prevTimezone = sparkSession.conf().get("spark.sql.session.timeZone"); + try { + sparkSession.conf().set("spark.sql.session.timeZone", "UTC"); + sparkSession.conf().set("spark.sql.parquet.enableVectorizedReader", "false"); + Dataset df = sparkSession.read().format("hudi").load(tableBasePath); + assertDataframe(df, 16, 16); + + if ("CLUSTER".equals(operation)) { + // after we cluster, the raw parquet should be correct + + // Validate raw parquet files + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(tableBasePath) + .build(); + + HoodieTimeline completedCommitsTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + Option latestInstant = completedCommitsTimeline.lastInstant(); + assertTrue(latestInstant.isPresent(), "No completed commits found"); + + List baseFilePaths = collectLatestBaseFilePaths(metaClient); + + assertEquals(4, baseFilePaths.size()); + + // Read raw parquet files + Dataset rawParquetDf = 
sparkSession.read().parquet(baseFilePaths.toArray(new String[0])); + assertDataframe(rawParquetDf, 15, 15); + } + } finally { + sparkSession.conf().set("spark.sql.session.timeZone", prevTimezone); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + @ParameterizedTest + @CsvSource(value = { + "SIX,AVRO,CLUSTER,AVRO", + "CURRENT,AVRO,NONE,AVRO", + "CURRENT,AVRO,CLUSTER,AVRO", + "CURRENT,AVRO,COMPACT,AVRO", + "CURRENT,AVRO,NONE,PARQUET", + "CURRENT,AVRO,CLUSTER,PARQUET", + "CURRENT,AVRO,COMPACT,PARQUET", + "CURRENT,SPARK,NONE,PARQUET", + "CURRENT,SPARK,CLUSTER,PARQUET", + "CURRENT,SPARK,COMPACT,PARQUET"}) + void testMORLogicalRepair(String tableVersion, String recordType, String operation, String logBlockType) throws Throwable { + timestampNTZCompatibility(() -> { + try { + String tableSuffix; + String logFormatValue; + if ("AVRO".equals(logBlockType)) { + logFormatValue = "avro"; + tableSuffix = "avro_log"; + } else { + logFormatValue = "parquet"; + tableSuffix = "parquet_log"; + } + + String dirName = "trips_logical_types_json_mor_write_" + tableSuffix; + String dataPath = basePath + "/" + dirName; + java.nio.file.Path zipOutput = Paths.get(new URI(dataPath)); + HoodieTestUtils.extractZipToDirectory("logical-repair/" + dirName + ".zip", zipOutput, getClass()); + String tableBasePath = zipOutput.toString(); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(tableBasePath) + .build(); + + // validate no compaction and clustering instants present in the timeline + HoodieTimeline completedTimeline = metaClient.getActiveTimeline().filterCompletedInstants(); + assertFalse(completedTimeline.getInstants().stream().anyMatch(i -> i.getAction().equals(HoodieTimeline.COMPACTION_ACTION))); + assertFalse(completedTimeline.getInstants().stream().anyMatch(i -> i.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION))); + + TypedProperties properties = new TypedProperties(); + String schemaPath 
= getClass().getClassLoader().getResource("logical-repair/schema.avsc").toURI().toString(); + properties.setProperty("hoodie.streamer.schemaprovider.source.schema.file", schemaPath); + properties.setProperty("hoodie.streamer.schemaprovider.target.schema.file", schemaPath); + String inputDataPath = getClass().getClassLoader().getResource("logical-repair/mor_write_updates/5").toURI().toString(); + properties.setProperty("hoodie.streamer.source.dfs.root", inputDataPath); + String mergerClass = getMergerClassForRecordType(recordType); + String tableVersionString = getTableVersionCode(tableVersion); + + properties.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + properties.setProperty("hoodie.datasource.write.precombine.field", "timestamp"); + properties.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); + properties.setProperty("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator"); + properties.setProperty("hoodie.cleaner.policy", "KEEP_LATEST_COMMITS"); + properties.setProperty("hoodie.metatata.enable", "true"); + properties.setProperty("hoodie.parquet.small.file.limit", "-1"); + properties.setProperty("hoodie.cleaner.commits.retained", "10"); + properties.setProperty(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT.key(), logFormatValue); + + boolean disableCompaction; + if ("COMPACT".equals(operation)) { + properties.setProperty("hoodie.compact.inline", "true"); + properties.setProperty("hoodie.compact.inline.max.delta.commits", "1"); + disableCompaction = false; + // validate that there are no completed compaction (commit) instants in timeline. 
+ } else { + properties.setProperty("hoodie.compact.inline", "false"); + disableCompaction = true; + } + + if ("CLUSTER".equals(operation)) { + properties.setProperty("hoodie.clustering.inline", "true"); + properties.setProperty("hoodie.clustering.inline.max.commits", "1"); + properties.setProperty("hoodie.clustering.plan.strategy.single.group.clustering.enabled", "true"); + properties.setProperty("hoodie.clustering.plan.strategy.sort.columns", "ts_millis,_row_key"); + } + + Option propt = Option.of(properties); + + new HoodieStreamer(prepCfgForMorLogicalRepair(tableBasePath, dirName, "123", disableCompaction), jsc, propt).sync(); + + String prevTimezone = sparkSession.conf().get("spark.sql.session.timeZone"); + try { + sparkSession.conf().set("spark.sql.parquet.enableVectorizedReader", "false"); + sparkSession.conf().set("spark.sql.session.timeZone", "UTC"); + Dataset df = sparkSession.read().format("hudi").load(tableBasePath); + + assertDataframe(df, 12, 14); + + metaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(tableBasePath) + .build(); + + if ("CLUSTER".equals(operation)) { + // after we cluster, the raw parquet should be correct + + // Validate raw parquet files + HoodieTimeline completedCommitsTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + Option latestInstant = completedCommitsTimeline.lastInstant(); + assertTrue(latestInstant.isPresent(), "No completed commits found"); + + List baseFilePaths = collectLatestBaseFilePaths(metaClient); + assertEquals(3, baseFilePaths.size()); + + // Read raw parquet files + Dataset rawParquetDf = sparkSession.read().parquet(baseFilePaths.toArray(new String[0])); + assertDataframe(rawParquetDf, 12, 14); + } else if ("COMPACT".equals(operation)) { + // after compaction some files should be ok + // Validate raw parquet files + HoodieTimeline completedCommitsTimeline = metaClient.getCommitsTimeline().filterCompletedInstants(); + Option latestInstant = 
completedCommitsTimeline.lastInstant(); + assertTrue(latestInstant.isPresent(), "No completed commits found"); + + List baseFilePaths = collectLatestBaseFilePaths(metaClient); + assertEquals(7, baseFilePaths.size()); + + // Read raw parquet files + Dataset rawParquetDf = sparkSession.read().parquet(baseFilePaths.stream() + // only read the compacted ones, the others are still incorrect + .filter(path -> path.contains(latestInstant.get().getTimestamp())) + .toArray(String[]::new)); + assertDataframe(rawParquetDf, 8, 8); + } + } finally { + sparkSession.conf().set("spark.sql.session.timeZone", prevTimezone); + sparkSession.conf().set("spark.sql.parquet.enableVectorizedReader", "true"); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + public static void assertDataframe(Dataset df, int above, int below) { + List rows = df.collectAsList(); + assertEquals(above + below, rows.size()); + + for (Row row : rows) { + String val = row.getString(6); + int hash = val.hashCode(); + if ((hash & 1) == 0) { + assertEquals("2020-01-01T00:00:00.001Z", row.getTimestamp(15).toInstant().toString()); + assertEquals("2020-06-01T12:00:00.000001Z", row.getTimestamp(16).toInstant().toString()); + assertEquals("2015-05-20T12:34:56.001", row.get(17).toString()); + assertEquals("2017-07-07T07:07:07.000001", row.get(18).toString()); + } else { + assertEquals("2019-12-31T23:59:59.999Z", row.getTimestamp(15).toInstant().toString()); + assertEquals("2020-06-01T11:59:59.999999Z", row.getTimestamp(16).toInstant().toString()); + assertEquals("2015-05-20T12:34:55.999", row.get(17).toString()); + assertEquals("2017-07-07T07:07:06.999999", row.get(18).toString()); + } + } + + assertEquals(above, df.filter("ts_millis > timestamp('2020-01-01 00:00:00Z')").count()); + assertEquals(below, df.filter("ts_millis < timestamp('2020-01-01 00:00:00Z')").count()); + assertEquals(0, df.filter("ts_millis > timestamp('2020-01-01 00:00:00.001Z')").count()); + assertEquals(0, 
df.filter("ts_millis < timestamp('2019-12-31 23:59:59.999Z')").count()); + + assertEquals(above, df.filter("ts_micros > timestamp('2020-06-01 12:00:00Z')").count()); + assertEquals(below, df.filter("ts_micros < timestamp('2020-06-01 12:00:00Z')").count()); + assertEquals(0, df.filter("ts_micros > timestamp('2020-06-01 12:00:00.000001Z')").count()); + assertEquals(0, df.filter("ts_micros < timestamp('2020-06-01 11:59:59.999999Z')").count()); + + assertEquals(above, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()); + assertEquals(below, df.filter("local_ts_millis < CAST('2015-05-20 12:34:56' AS TIMESTAMP_NTZ)").count()); + assertEquals(0, df.filter("local_ts_millis > CAST('2015-05-20 12:34:56.001' AS TIMESTAMP_NTZ)").count()); + assertEquals(0, df.filter("local_ts_millis < CAST('2015-05-20 12:34:55.999' AS TIMESTAMP_NTZ)").count()); + + assertEquals(above, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()); + assertEquals(below, df.filter("local_ts_micros < CAST('2017-07-07 07:07:07' AS TIMESTAMP_NTZ)").count()); + assertEquals(0, df.filter("local_ts_micros > CAST('2017-07-07 07:07:07.000001' AS TIMESTAMP_NTZ)").count()); + assertEquals(0, df.filter("local_ts_micros < CAST('2017-07-07 07:07:06.999999' AS TIMESTAMP_NTZ)").count()); + } + + private List collectLatestBaseFilePaths(HoodieTableMetaClient metaClient) { + List baseFilePaths = new ArrayList<>(); + HoodieTableFileSystemView fsView = null; + try { + fsView = FileSystemViewManager.createInMemoryFileSystemView( + new HoodieLocalEngineContext(metaClient.getHadoopConf()), + metaClient, HoodieMetadataConfig.newBuilder().enable(false).build()); + fsView.loadAllPartitions(); + final HoodieTableFileSystemView fileSystemView = fsView; + fsView.getPartitionNames().forEach(partitionName -> + fileSystemView.getLatestFileSlices(partitionName).forEach(fileSlice -> { + assertTrue(fileSlice.getLogFiles().collect(Collectors.toList()).isEmpty(), "File slice 
should not have log files"); + Option latestBaseFile = fileSlice.getBaseFile(); + assertTrue(latestBaseFile.isPresent(), "Base file should be present"); + baseFilePaths.add(latestBaseFile.get().getPath()); + })); + } finally { + if (fsView != null) { + fsView.close(); + } + } + return baseFilePaths; + } + + private String getMergerClassForRecordType(String recordType) { + switch (recordType) { + case "AVRO": + return HoodieAvroRecordMerger.class.getName(); + case "SPARK": + return HoodieSparkRecordMerger.class.getName(); + default: + throw new IllegalArgumentException("Invalid record type: " + recordType); + } + } + + private String getTableVersionCode(String tableVersion) { + switch (tableVersion) { + case "SIX": + return String.valueOf(HoodieTableVersion.SIX.versionCode()); + case "CURRENT": + return String.valueOf(HoodieTableVersion.current().versionCode()); + default: + throw new IllegalArgumentException("Invalid table version: " + tableVersion); + } + } + + private HoodieStreamer.Config prepCfgForCowLogicalRepair(String tableBasePath, + String ignoreCheckpoint) throws Exception { + HoodieStreamer.Config cfg = new HoodieStreamer.Config(); + cfg.targetBasePath = tableBasePath; + cfg.tableType = "COPY_ON_WRITE"; + cfg.targetTableName = "trips_logical_types_json_cow_write"; + cfg.sourceClassName = "org.apache.hudi.utilities.sources.JsonDFSSource"; + cfg.schemaProviderClassName = "org.apache.hudi.utilities.schema.FilebasedSchemaProvider"; + cfg.sourceOrderingField = "timestamp"; + cfg.ignoreCheckpoint = ignoreCheckpoint; + cfg.operation = WriteOperationType.UPSERT; + cfg.forceDisableCompaction = true; + return cfg; + } + + private HoodieStreamer.Config prepCfgForMorLogicalRepair(String tableBasePath, + String tableName, + String ignoreCheckpoint, + boolean disableCompaction) throws Exception { + HoodieStreamer.Config cfg = new HoodieStreamer.Config(); + cfg.targetBasePath = tableBasePath; + cfg.tableType = "MERGE_ON_READ"; + cfg.targetTableName = tableName; + 
cfg.sourceClassName = "org.apache.hudi.utilities.sources.JsonDFSSource"; + cfg.schemaProviderClassName = "org.apache.hudi.utilities.schema.FilebasedSchemaProvider"; + cfg.sourceOrderingField = "timestamp"; + cfg.ignoreCheckpoint = ignoreCheckpoint; + cfg.operation = WriteOperationType.UPSERT; + cfg.forceDisableCompaction = disableCompaction; + return cfg; + } + + private static Stream continuousModeArgs() { + return Stream.of( + Arguments.of("AVRO", "CURRENT"), + Arguments.of("SPARK", "CURRENT"), + Arguments.of("AVRO", "EIGHT"), + Arguments.of("SPARK", "EIGHT"), + Arguments.of("AVRO", "SIX") + ); + } + + private static Stream continuousModeMorArgs() { + return Stream.of( + Arguments.of("AVRO", "CURRENT"), + Arguments.of("AVRO", "EIGHT"), + Arguments.of("AVRO", "SIX") + ); + } + @Timeout(600) @ParameterizedTest @EnumSource(value = HoodieRecordType.class, names = {"AVRO", "SPARK"}) @@ -2302,7 +2808,9 @@ public void testCsvDFSSourceNoHeaderWithoutSchemaProviderAndWithTransformer() th testCsvDFSSource(false, '\t', false, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); }, "Should error out when doing the transformation."); LOG.debug("Expected error during transformation", e); - assertTrue(e.getMessage().contains("cannot resolve 'begin_lat' given input columns:")); + // first version for Spark >= 3.3, the second one is for Spark < 3.3 + assertTrue(e.getMessage().contains("A column or function parameter with name `begin_lat` cannot be resolved. 
Did you mean one of the following?") + || e.getMessage().contains("cannot resolve 'begin_lat' given input columns:")); } @Test @@ -2890,7 +3398,7 @@ public static class TestSpecificPartitionTransformer implements Transformer { @Override public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, TypedProperties properties) { - Dataset toReturn = rowDataset.filter("partition_path == '" + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "'"); + Dataset toReturn = rowDataset.filter("partition_path == '" + org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "'"); return toReturn; } } @@ -2929,4 +3437,26 @@ private static Stream testORCDFSSource() { arguments(true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())) ); } + + public static void timestampNTZCompatibility(Runnable r) throws Throwable { + // TODO: Remove this when we get rid of spark3.3. TimestampNTZ needs this config + // to be set to true to work. 
+ boolean isSpark33 = HoodieSparkUtils.isSpark3_3(); + String propertyValue = null; + if (isSpark33) { + propertyValue = System.getProperty("spark.testing"); + System.setProperty("spark.testing", "true"); + } + try { + r.run(); + } finally { + if (isSpark33) { + if (propertyValue == null) { + System.clearProperty("spark.testing"); + } else { + System.setProperty("spark.testing", propertyValue); + } + } + } + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java index de21b33fff4e6..be0b2f78924b1 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestHoodieDeltaStreamerSchemaEvolutionQuick.java @@ -59,25 +59,13 @@ public void teardown() throws Exception { protected static Stream testArgs() { Stream.Builder b = Stream.builder(); - //only testing row-writer enabled for now - for (Boolean rowWriterEnable : new Boolean[] {true}) { - for (Boolean nullForDeletedCols : new Boolean[] {false, true}) { - for (Boolean useKafkaSource : new Boolean[] {false, true}) { - for (Boolean addFilegroups : new Boolean[] {false, true}) { - for (Boolean multiLogFiles : new Boolean[] {false, true}) { - for (Boolean shouldCluster : new Boolean[] {false, true}) { - for (String tableType : new String[] {"COPY_ON_WRITE", "MERGE_ON_READ"}) { - if (!multiLogFiles || tableType.equals("MERGE_ON_READ")) { - b.add(Arguments.of(tableType, shouldCluster, false, rowWriterEnable, addFilegroups, multiLogFiles, useKafkaSource, nullForDeletedCols)); - } - } - } - b.add(Arguments.of("MERGE_ON_READ", false, true, rowWriterEnable, addFilegroups, multiLogFiles, useKafkaSource, nullForDeletedCols)); - } - } - } - } - } + b.add(Arguments.of("COPY_ON_WRITE", true, 
false, true, false, false, true, false, true)); + b.add(Arguments.of("COPY_ON_WRITE", true, false, true, false, false, true, true, false)); + b.add(Arguments.of("COPY_ON_WRITE", true, false, false, false, false, true, true, true)); + b.add(Arguments.of("MERGE_ON_READ", true, false, false, true, true, true, true, true)); + b.add(Arguments.of("MERGE_ON_READ", false, true, true, true, true, true, true, false)); + b.add(Arguments.of("MERGE_ON_READ", false, true, true, true, true, true, true, false)); + b.add(Arguments.of("MERGE_ON_READ", false, false, true, true, true, false, true, false)); return b.build(); } @@ -119,13 +107,14 @@ protected static Stream testParamsWithSchemaTransformer() { @ParameterizedTest @MethodSource("testArgs") public void testBase(String tableType, - Boolean shouldCluster, - Boolean shouldCompact, - Boolean rowWriterEnable, - Boolean addFilegroups, - Boolean multiLogFiles, - Boolean useKafkaSource, - Boolean allowNullForDeletedCols) throws Exception { + Boolean shouldCluster, + Boolean shouldCompact, + Boolean rowWriterEnable, + Boolean addFilegroups, + Boolean multiLogFiles, + Boolean useKafkaSource, + Boolean allowNullForDeletedCols, + Boolean useVectorization) throws Exception { this.tableType = tableType; this.shouldCluster = shouldCluster; this.shouldCompact = shouldCompact; @@ -141,6 +130,9 @@ public void testBase(String tableType, PARQUET_SOURCE_ROOT = basePath + "parquetFilesDfs" + ++testNum; tableBasePath = basePath + "test_parquet_table" + testNum; this.deltaStreamer = new HoodieDeltaStreamer(getDeltaStreamerConfig(allowNullForDeletedCols), jsc); + if (!useVectorization) { + sparkSession.conf().set("spark.sql.parquet.enableVectorizedReader", "false"); + } //first write String datapath = String.class.getResource("/data/schema-evolution/startTestEverything.json").getPath(); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deser/TestKafkaAvroSchemaDeserializer.java 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deser/TestKafkaAvroSchemaDeserializer.java index 16d190ac45d15..ab2a89c62c6a2 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deser/TestKafkaAvroSchemaDeserializer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deser/TestKafkaAvroSchemaDeserializer.java @@ -105,7 +105,7 @@ public void testKafkaAvroSchemaDeserializer() { avroDeserializer.configure(new HashMap(config), false); bytesOrigRecord = avroSerializer.serialize(topic, avroRecord); // record is serialized in orig schema and deserialized using same schema. - assertEquals(avroRecord, avroDeserializer.deserialize(false, topic, false, bytesOrigRecord, origSchema)); + assertEquals(avroRecord, avroDeserializer.deserialize(topic, false, bytesOrigRecord, origSchema)); IndexedRecord avroRecordWithAllField = createExtendUserRecord(); byte[] bytesExtendedRecord = avroSerializer.serialize(topic, avroRecordWithAllField); @@ -115,12 +115,12 @@ public void testKafkaAvroSchemaDeserializer() { avroDeserializer = new KafkaAvroSchemaDeserializer(schemaRegistry, new HashMap(config)); avroDeserializer.configure(new HashMap(config), false); // record is serialized w/ evolved schema, and deserialized w/ evolved schema - IndexedRecord avroRecordWithAllFieldActual = (IndexedRecord) avroDeserializer.deserialize(false, topic, false, bytesExtendedRecord, evolSchema); + IndexedRecord avroRecordWithAllFieldActual = (IndexedRecord) avroDeserializer.deserialize(topic, false, bytesExtendedRecord, evolSchema); assertEquals(avroRecordWithAllField, avroRecordWithAllFieldActual); assertEquals(avroRecordWithAllFieldActual.getSchema(), evolSchema); // read old record w/ evolved schema. 
- IndexedRecord actualRec = (IndexedRecord) avroDeserializer.deserialize(false, topic, false, bytesOrigRecord, origSchema); + IndexedRecord actualRec = (IndexedRecord) avroDeserializer.deserialize(topic, false, bytesOrigRecord, origSchema); // record won't be equal to original record as we read w/ evolved schema. "age" will be added w/ default value of null assertNotEquals(avroRecord, actualRec); GenericRecord genericRecord = (GenericRecord) actualRec; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java index 56d435ddf0f17..8611eb36c7416 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/sources/AbstractBaseTestSource.java @@ -29,6 +29,7 @@ import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.AvroSource; +import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Row; @@ -49,6 +50,9 @@ public abstract class AbstractBaseTestSource extends AvroSource { + public static String schemaStr = HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; + public static Schema avroSchema = HoodieTestDataGenerator.AVRO_SCHEMA; + private static final Logger LOG = LoggerFactory.getLogger(AbstractBaseTestSource.class); public static final int DEFAULT_PARTITION_NUM = 0; @@ -112,8 +116,8 @@ protected static Stream fetchNextBatch(TypedProperties props, int HoodieTestDataGenerator dataGenerator = dataGeneratorMap.get(partition); // generate `sourceLimit` number of upserts each time. 
- int numExistingKeys = dataGenerator.getNumExistingKeys(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA); - LOG.info("NumExistingKeys=" + numExistingKeys); + int numExistingKeys = dataGenerator.getNumExistingKeys(schemaStr); + LOG.info("NumExistingKeys={}", numExistingKeys); int numUpdates = Math.min(numExistingKeys, sourceLimit / 2); int numInserts = sourceLimit - numUpdates; @@ -140,15 +144,15 @@ protected static Stream fetchNextBatch(TypedProperties props, int LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + (numUpdates - 50) + ", NumDeletes=50, maxUniqueRecords=" + maxUniqueKeys); // if we generate update followed by deletes -> some keys in update batch might be picked up for deletes. Hence generating delete batch followed by updates - deleteStream = dataGenerator.generateUniqueDeleteRecordStream(instantTime, 50).map(AbstractBaseTestSource::toGenericRecord); - updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates - 50, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + deleteStream = dataGenerator.generateUniqueDeleteRecordStream(instantTime, 50, schemaStr).map(AbstractBaseTestSource::toGenericRecord); + updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates - 50, schemaStr) .map(AbstractBaseTestSource::toGenericRecord); } else { - LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + numUpdates + ", maxUniqueRecords=" + maxUniqueKeys); - updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + LOG.info("After adjustments => NumInserts={}, NumUpdates={}, maxUniqueRecords={}", numInserts, numUpdates, maxUniqueKeys); + updateStream = dataGenerator.generateUniqueUpdatesStream(instantTime, numUpdates, schemaStr) .map(AbstractBaseTestSource::toGenericRecord); } - Stream insertStream = dataGenerator.generateInsertsStream(instantTime, numInserts, false, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + 
Stream insertStream = dataGenerator.generateInsertsStream(instantTime, numInserts, false, schemaStr, true) .map(AbstractBaseTestSource::toGenericRecord); if (Boolean.valueOf(props.getOrDefault("hoodie.test.source.generate.inserts", "false").toString())) { return insertStream; @@ -159,7 +163,7 @@ protected static Stream fetchNextBatch(TypedProperties props, int private static GenericRecord toGenericRecord(HoodieRecord hoodieRecord) { try { RawTripTestPayload payload = (RawTripTestPayload) hoodieRecord.getData(); - return (GenericRecord) payload.getRecordToInsert(HoodieTestDataGenerator.AVRO_SCHEMA); + return (GenericRecord) payload.getRecordToInsert(avroSchema); } catch (IOException e) { return null; } diff --git a/hudi-utilities/src/test/resources/logical-repair/README.md b/hudi-utilities/src/test/resources/logical-repair/README.md new file mode 100644 index 0000000000000..3aea5f2ae6906 --- /dev/null +++ b/hudi-utilities/src/test/resources/logical-repair/README.md @@ -0,0 +1,88 @@ + + + +Test assets + +trips_logical_types_json_cow_write.zip: + +this table was created with two deltastreamer writes: + +write 0 with 0.15.0: +inserts to partition 1, partition 2, partition 3 + +write 1 with 0.15.0: +inserts to partition 3 + +this gives us a table with 3 partitions, partition 1 and 2 have 1 file each and partition 3 has 2. 
+ +Then we provide updates in cow_write_updates: + +write 2 done in the test: +inserts to partition 3, partition 4 + +write 3 done in the test: +updates to partition 3 + +this gives us a final table: + +partition 1: +1 base file written with 0.15.0 +partition 2: +1 base file written with 0.15.0 +1 base file written with 1.1 +partition 3: +1 base file written with 1.1 that contains some 0.15.0 written records +1 base file written with 0.15.0 +1 base file written with 1.1 +partition 4: +1 base file written with 1.1 + + +trips_logical_types_json_mor_write_avro_log.zip/trips_logical_types_json_mor_write_parquet_log.zip +the two tables were created with the same steps, but the avro table uses avro log files and the parquet table uses parquet files + +write 0 with 0.15.0: +inserts to partition 1, 2, 3 + +write 1 with 0.15.0: +inserts to partition 3 + +write 2 with 0.15.0: +updates to 1 file in partition 3 and 1 file in partition 2 + +write 3 with 0.15.0: +inserts to partition 3 + +write 4 with 0.15.0 +inserts to partition 3 and updates to 1 file in partition 3 + +write 5 done in the tests: +updates to 2 files in partition 3 and inserts to partition 3 + +The final table will be + +partition 1: +fg1: base file with 0.15.0 +partition 2: +fg1: base file with 0.15.0 + log file with 0.15.0 +partition 3: +fg1: base file with 0.15.0 + log file with 0.15.0 + log file with 1.1 +fg2: base file with 0.15.0 + log file with 1.1 +fg3: base file with 1.1 +fg4: base file with 0.15 + log file with 0.15 +fg5: base file with 0.15 diff --git a/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/2/data.json b/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/2/data.json new file mode 100644 index 0000000000000..dd6e2a05616be --- /dev/null +++ b/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/2/data.json @@ -0,0 +1,6 @@ +{"timestamp": 1761335069636, "_row_key": "3f3ef947-c3e9-48a5-b08f-6cabbc6d6533", "partition_path": "2016/03/15", 
"trip_type": "BLACK", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.3017074196681322, "begin_lon": 0.5626109152945691, "end_lat": 0.5649785382157525, "end_lon": 0.6160079798524531, "ts_millis": 1577836799999, "ts_micros": 1591012799999999, "local_ts_millis": 1432125295999, "local_ts_micros": 1499411226999999, "event_date": 10956, "dec_fixed_small": [0, -44, 48], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 20], "_hoodie_is_deleted": false, "partition": "2015/03/16"} +{"timestamp": 1761335069636, "_row_key": "d8725d53-826a-45d8-9b70-b812d06d9dd0", "partition_path": "2016/03/15", "trip_type": "UBERX", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.27641854803317645, "begin_lon": 0.31700440770954075, "end_lat": 0.16654733508021524, "end_lon": 0.3555821110759497, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/17"} +{"timestamp": 1761335069636, "_row_key": "1d4f0480-5300-4f68-8ebb-1ff70ff5c6ea", "partition_path": "2016/03/15", "trip_type": "UBERX", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.9074176919785227, "begin_lon": 0.4117236492462387, "end_lat": 0.6994811148788228, "end_lon": 0.3772709703853857, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/16"} +{"timestamp": 1761335069636, "_row_key": "a6816ca4-60c8-4bab-a77a-31fa2c000987", "partition_path": "2015/03/18", "trip_type": "BLACK", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.4444402031840541, "begin_lon": 0.33692826304653933, "end_lat": 0.3043284603831268, 
"end_lon": 0.11042503421042937, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/17"} +{"timestamp": 1761335069636, "_row_key": "b1555002-2fe1-4687-bb60-406a52f16bb5", "partition_path": "2015/03/18", "trip_type": "BLACK", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.43246488469100974, "begin_lon": 0.3648005645136184, "end_lat": 0.3781839595846225, "end_lon": 0.4638740649211893, "ts_millis": 1577836799999, "ts_micros": 1591012799999999, "local_ts_millis": 1432125295999, "local_ts_micros": 1499411226999999, "event_date": 10956, "dec_fixed_small": [0, -44, 48], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 20], "_hoodie_is_deleted": false, "partition": "2015/03/16"} +{"timestamp": 1761335069636, "_row_key": "615c45b5-57ad-489d-aad8-6fe563b513f7", "partition_path": "2015/03/18", "trip_type": "BLACK", "rider": "rider-002", "driver": "driver-002", "begin_lat": 0.75916203985879, "begin_lon": 0.49855855157343465, "end_lat": 0.7432577319020379, "end_lon": 0.33072999799294, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/16"} diff --git a/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/3/data.json b/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/3/data.json new file mode 100644 index 0000000000000..105ce19370e16 --- /dev/null +++ b/hudi-utilities/src/test/resources/logical-repair/cow_write_updates/3/data.json @@ -0,0 +1,3 @@ +{"timestamp": 1761335069637, "_row_key": "fe818873-3af0-4fcd-90ea-f5ad0e7565fb", "partition_path": "2016/03/15", 
"trip_type": "BLACK", "rider": "rider-003", "driver": "driver-003", "begin_lat": 0.9122211080491403, "begin_lon": 0.23232697706220873, "end_lat": 0.8967870566670471, "end_lon": 0.05065495500664263, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2016/03/15"} +{"timestamp": 1761335069637, "_row_key": "0af34a1a-c231-4c6b-8a5c-50d8e36a0ff1", "partition_path": "2016/03/15", "trip_type": "BLACK", "rider": "rider-003", "driver": "driver-003", "begin_lat": 0.26483577112225265, "begin_lon": 0.26862954952340434, "end_lat": 0.2727902211619275, "end_lon": 0.9138712331657564, "ts_millis": 1577836799999, "ts_micros": 1591012799999999, "local_ts_millis": 1432125295999, "local_ts_micros": 1499411226999999, "event_date": 10956, "dec_fixed_small": [0, -44, 48], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 20], "_hoodie_is_deleted": false, "partition": "2016/03/15"} +{"timestamp": 1761335069637, "_row_key": "f448d495-69cf-4d28-afa8-3af2459636ee", "partition_path": "2016/03/15", "trip_type": "UBERX", "rider": "rider-003", "driver": "driver-003", "begin_lat": 0.8873308149149347, "begin_lon": 0.358940823441969, "end_lat": 0.2646927323955117, "end_lon": 0.4091537968746116, "ts_millis": 1577836799999, "ts_micros": 1591012799999999, "local_ts_millis": 1432125295999, "local_ts_micros": 1499411226999999, "event_date": 10956, "dec_fixed_small": [0, -44, 48], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 20], "_hoodie_is_deleted": false, "partition": "2016/03/15"} diff --git a/hudi-utilities/src/test/resources/logical-repair/mor_write_updates/5/data.json b/hudi-utilities/src/test/resources/logical-repair/mor_write_updates/5/data.json new file mode 100644 index 0000000000000..48109422242fe --- /dev/null +++ 
b/hudi-utilities/src/test/resources/logical-repair/mor_write_updates/5/data.json @@ -0,0 +1,3 @@ +{"timestamp": 1761341703299, "_row_key": "092ed4ad-0e67-4df7-a051-c66ff30f08a7", "partition_path": "2015/03/17", "trip_type": "UBERX", "rider": "rider-005", "driver": "driver-005", "begin_lat": 0.20841305367042184, "begin_lon": 0.41269017191959156, "end_lat": 0.6266431410358951, "end_lon": 0.4514006891788409, "ts_millis": 1577836799999, "ts_micros": 1591012799999999, "local_ts_millis": 1432125295999, "local_ts_micros": 1499411226999999, "event_date": 10956, "dec_fixed_small": [0, -44, 48], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 20], "_hoodie_is_deleted": false, "partition": "2015/03/17"} +{"timestamp": 1761341703299, "_row_key": "5affef5c-f36f-4374-80f3-e7c6d3c38d25", "partition_path": "2015/03/17", "trip_type": "BLACK", "rider": "rider-005", "driver": "driver-005", "begin_lat": 0.3287214805934826, "begin_lon": 0.4292459075453131, "end_lat": 0.8027879467022967, "end_lon": 0.07863000273562926, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/17"} +{"timestamp": 1761341703299, "_row_key": "5b22bcfd-e2a2-4cb9-b6c5-f643223c48c8", "partition_path": "2015/03/17", "trip_type": "UBERX", "rider": "rider-005", "driver": "driver-005", "begin_lat": 0.2218333443775823, "begin_lon": 0.441127835026775, "end_lat": 0.39946947397642374, "end_lon": 0.5064153585372088, "ts_millis": 1577836800001, "ts_micros": 1591012800000001, "local_ts_millis": 1432125296001, "local_ts_micros": 1499411227000001, "event_date": 10958, "dec_fixed_small": [0, -44, 50], "dec_fixed_large": [13, -76, -38, 95, 75, 113, 119, 22], "_hoodie_is_deleted": false, "partition": "2015/03/17"} diff --git a/hudi-utilities/src/test/resources/logical-repair/schema.avsc 
b/hudi-utilities/src/test/resources/logical-repair/schema.avsc new file mode 100644 index 0000000000000..6392fa92d7826 --- /dev/null +++ b/hudi-utilities/src/test/resources/logical-repair/schema.avsc @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ { + "name" : "timestamp", + "type" : "long" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "partition_path", + "type" : [ "null", "string" ], + "default" : null + }, { + "name" : "trip_type", + "type" : { + "type" : "enum", + "name" : "TripType", + "symbols" : [ "UNKNOWN", "UBERX", "BLACK" ], + "default" : "UNKNOWN" + } + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "ts_millis", + "type" : { + "type" : "long", + "logicalType" : "timestamp-millis" + } + }, { + "name" : "ts_micros", + "type" : { + "type" : "long", + "logicalType" : "timestamp-micros" + } + }, { + "name" : "local_ts_millis", + "type" : { + "type" : "long", + "logicalType" : "local-timestamp-millis" + } + }, { + "name" : "local_ts_micros", + "type" : { + "type" : "long", + "logicalType" : "local-timestamp-micros" + } + }, { + "name" : "event_date", + "type" : { + "type" : "int", + "logicalType" : "date" + } + }, { + "name" : "dec_fixed_small", + "type" : { + "type" : "fixed", + "name" : "decFixedSmall", + "size" : 3, + "logicalType" : "decimal", + "precision" : 5, + "scale" : 2 + } + }, { + "name" : "dec_fixed_large", + "type" : { + "type" : "fixed", + "name" : "decFixedLarge", + "size" : 8, + "logicalType" : "decimal", + "precision" : 18, + "scale" : 9 + } + }, { + "name" : "_hoodie_is_deleted", + "type" : "boolean", + "default" : false + } ] +} diff --git a/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_cow_write.zip b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_cow_write.zip new file mode 100644 index 0000000000000..901120035f1e1 Binary files /dev/null and 
b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_cow_write.zip differ diff --git a/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_avro_log.zip b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_avro_log.zip new file mode 100644 index 0000000000000..f6ef371ffea7e Binary files /dev/null and b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_avro_log.zip differ diff --git a/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_parquet_log.zip b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_parquet_log.zip new file mode 100644 index 0000000000000..c3d02f98eaa12 Binary files /dev/null and b/hudi-utilities/src/test/resources/logical-repair/trips_logical_types_json_mor_write_parquet_log.zip differ diff --git a/hudi-utilities/src/test/resources/streamer-config/source-timestamp-millis.avsc b/hudi-utilities/src/test/resources/streamer-config/source-timestamp-millis.avsc new file mode 100644 index 0000000000000..5d47b96aa9379 --- /dev/null +++ b/hudi-utilities/src/test/resources/streamer-config/source-timestamp-millis.avsc @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name": "timestamp", + "type": "long" + }, + { + "name": "_row_key", + "type": "string" + }, + { + "name": "partition_path", + "type": "string" + }, + { + "name": "trip_type", + "type": { + "type": "enum", + "name": "TripType", + "symbols": [ + "UNKNOWN", + "UBERX", + "BLACK" + ], + "default": "UNKNOWN" + } + }, + { + "name": "rider", + "type": "string" + }, + { + "name": "driver", + "type": "string" + }, + { + "name": "begin_lat", + "type": "double" + }, + { + "name": "begin_lon", + "type": "double" + }, + { + "name": "end_lat", + "type": "double" + }, + { + "name": "end_lon", + "type": "double" + }, + { + "name": "distance_in_meters", + "type": "int" + }, + { + "name": "seconds_since_epoch", + "type": "long" + }, + { + "name": "weight", + "type": "float" + }, + { + "name": "nation", + "type": "bytes" + }, + { + "name": "current_date", + "type": { + "type": "int", + "logicalType": "date" + } + }, + { + "name": "current_ts", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "height", + "type": { + "type": "fixed", + "name": "abc", + "size": 5, + "logicalType": "decimal", + "precision": 10, + "scale": 6 + } + }, + { + "name": "city_to_state", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "fare", + "type": { + "type": "record", + "name": "fare", + "fields": [ + { + "name": "amount", + "type": "double" + }, + { + "name": "currency", + "type": "string" + } + ] + } + }, + { + "name": "tip_history", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "tip_history", + "fields": [ + { + "name": "amount", + "type": "double" + }, + { + "name": "currency", + "type": "string" + } + ] + } + } + }, + { + "name": "_hoodie_is_deleted", + "type": "boolean", + "default": false + } + ] +} diff --git a/pom.xml b/pom.xml 
index 4e9424e86654c..9f6e1eef7bb8a 100644 --- a/pom.xml +++ b/pom.xml @@ -100,11 +100,12 @@ ${fasterxml.spark3.version} 2.0.0 2.8.0 + 3.0.0 2.8.1 ${pulsar.spark.scala12.version} 2.4.5 3.1.1.4 - 5.3.4 + 5.5.0 2.17 3.0.1-b12 1.10.1 @@ -228,6 +229,8 @@ 2.7.3 2.1.1 1.1.8.3 + true + true @@ -963,6 +966,12 @@ orc-core ${orc.spark.version} compile + + + ${hive.groupid} + hive-exec + + @@ -994,6 +1003,12 @@ spark-sql_${scala.binary.version} ${spark.version} provided + + + org.apache.orc + orc-core + + org.apache.spark @@ -1013,6 +1028,16 @@ tests ${spark.version} test + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + org.apache.spark @@ -1037,6 +1062,16 @@ tests ${spark.version} test + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + org.apache.spark @@ -1516,6 +1551,18 @@ org.apache.hbase * + + org.apache.hive + hive-storage-api + + + org.codehaus.janino + commons-compiler + + + org.codehaus.janino + janino + @@ -1572,6 +1619,18 @@ org.apache.hbase * + + org.apache.hive + hive-storage-api + + + org.codehaus.janino + commons-compiler + + + org.codehaus.janino + janino + @@ -1736,6 +1795,75 @@ + + + + io.confluent + kafka-avro-serializer + ${confluent.version} + + + io.confluent + common-config + ${confluent.version} + + + io.confluent + common-utils + ${confluent.version} + + + io.confluent + kafka-schema-registry-client + ${confluent.version} + + + io.confluent + kafka-protobuf-serializer + ${confluent.version} + + + + + org.eclipse.jetty + jetty-server + ${jetty.version} + + + org.eclipse.jetty + jetty-servlet + ${jetty.version} + + + org.eclipse.jetty + jetty-http + ${jetty.version} + + + org.eclipse.jetty + jetty-io + ${jetty.version} + + + org.eclipse.jetty + jetty-util + ${jetty.version} + + + org.eclipse.jetty + jetty-webapp + ${jetty.version} + + + org.eclipse.jetty + jetty-xml + ${jetty.version} + + + org.eclipse.jetty + jetty-security + ${jetty.version} + @@ -2267,6 +2395,8 @@ 
hudi-spark3.2plus-common ${scalatest.spark3.version} ${kafka.spark3.version} + 2.3.10 + 2.8.1