apache · deniskuzZ · Dec 17, 2025 · Dec 15, 2025
diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java
@@ -431,6 +431,44 @@ public static void setDefaultValues(Record record, List<Types.NestedField> field
     }
   }
 
+  /**
+   * Sets a value into a {@link Record} using a struct-only field path (top-level column or nested
+   * through structs). Intermediate struct records are created as needed.
+   *
+   * <p>Ignored if {@code root} is null, {@code path} is null or empty, or a non-leaf element is missing/non-struct.
+   */
+  public static void setStructField(Record root, String[] path, Object value) {
+    if (root == null || path == null || path.length == 0) {
+      return;
+    }
+    Record current = root;
+    Types.StructType currentStruct = root.struct();
+
+    for (int i = 0; i < path.length - 1; i++) { // walk every path element except the leaf
+      String fieldName = path[i];
+      Types.NestedField field = currentStruct.field(fieldName);
+      if (field == null || !field.type().isStructType()) { // unknown name, or e.g. a list/map inside the path
+        return; // records this call already created for earlier elements stay attached
+      }
+      Types.StructType nestedStruct = field.type().asStructType();
+      current = getOrCreateStructRecord(current, fieldName, nestedStruct); // reuse or attach the nested record
+      currentStruct = nestedStruct;
+    }
+
+    current.setField(path[path.length - 1], value); // the leaf name is not checked against currentStruct here
+  }
+
+  private static Record getOrCreateStructRecord(
+      Record parent, String fieldName, Types.StructType structType) {
+    Object value = parent.getField(fieldName); // current content of the nested struct slot
+    if (value instanceof Record) {
+      return (Record) value; // already populated: reuse it
+    }
+    Record record = GenericRecord.create(structType); // null or non-Record value: replace it with a new record
+    parent.setField(fieldName, record); // attach it so the parent references the record returned below
+    return record;
+  }
+
   // Special method for nested structs that always applies defaults to null fields
   private static void setDefaultValuesForNestedStruct(Record record, List<Types.NestedField> fields) {
     for (Types.NestedField field : fields) {

diff --git a/...ceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java b/...ceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java
@@ -20,7 +20,6 @@
 package org.apache.iceberg.mr.hive.writer;
 
 import java.util.Map;
-import java.util.function.Supplier;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.SortOrder;
@@ -33,12 +32,13 @@
 import org.apache.iceberg.data.parquet.GenericParquetWriter;
 import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.parquet.VariantShreddingFunction;
 import org.apache.iceberg.parquet.VariantUtil;
 
 class HiveFileWriterFactory extends BaseFileWriterFactory<Record> {
 
   private final Map<String, String> properties;
-  private Supplier<Record> sampleRecord = null;
+  private Record sampleRecord = null;
 
   HiveFileWriterFactory(
       Table table,
@@ -85,9 +85,34 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) {
   @Override
   protected void configureDataWrite(Parquet.DataWriteBuilder builder) {
     builder.createWriterFunc(GenericParquetWriter::create);
-    // Configure variant shredding function if conditions are met:
-    VariantUtil.variantShreddingFunc(dataSchema(), sampleRecord, properties)
-        .ifPresent(builder::variantShreddingFunc);
+    // Configure variant shredding when enabled for this schema; sampleRecord is passed as-is (no null check here)
+    if (VariantUtil.shouldUseVariantShredding(properties, dataSchema())) {
+      setVariantShreddingFunc(builder, VariantUtil.variantShreddingFunc(sampleRecord, dataSchema()));
+    }
+  }
+
+  /**
+   * Sets a {@link VariantShreddingFunction} on the underlying Parquet write builder.
+   *
+   * <p>{@link Parquet.DataWriteBuilder} does not expose {@code variantShreddingFunc} directly; it is set on an
+   * internal write builder held in the private {@code appenderBuilder} field. This method uses reflection to
+   * access that internal builder and invoke {@code variantShreddingFunc(VariantShreddingFunction)}.
+   *
+   * <p>Reflective failures are rethrown wrapped in a {@link RuntimeException}. TODO: Replace this with
+   * {@code DataWriteBuilder.variantShreddingFunc(VariantShreddingFunction)} once it becomes publicly available.
+   */
+  private static void setVariantShreddingFunc(Parquet.DataWriteBuilder dataWriteBuilder,
+      VariantShreddingFunction fn) {
+    try {
+      java.lang.reflect.Field field = dataWriteBuilder.getClass().getDeclaredField("appenderBuilder");
+      field.setAccessible(true); // lift access checks so the non-public field can be read
+      Object writeBuilder = field.get(dataWriteBuilder);
+      writeBuilder.getClass()
+          .getMethod("variantShreddingFunc", VariantShreddingFunction.class) // looked up on the runtime type
+          .invoke(writeBuilder, fn);
+    } catch (ReflectiveOperationException e) { // missing field/method, illegal access, or the invoked method threw
+      throw new RuntimeException(e);
+    }
   }
 
   @Override
@@ -164,7 +189,7 @@ HiveFileWriterFactory build() {
    * Set a sample record to use for data-driven variant shredding schema generation.
    * Should be called before the Parquet writer is created.
    */
-  public void initialize(Supplier<Record> record) {
+  public void initialize(Record record) {
     if (sampleRecord == null) {
       sampleRecord = record;
     }

diff --git a/...r/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergCopyOnWriteRecordWriter.java b/...r/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergCopyOnWriteRecordWriter.java
@@ -21,48 +21,32 @@
 
 import java.io.IOException;
 import java.util.List;
-import java.util.Set;
 import org.apache.hadoop.io.Writable;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DataFiles;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.data.GenericRecord;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.deletes.PositionDelete;
-import org.apache.iceberg.hive.HiveSchemaUtil;
 import org.apache.iceberg.io.DataWriteResult;
 import org.apache.iceberg.io.OutputFileFactory;
 import org.apache.iceberg.mr.hive.FilesForCommit;
 import org.apache.iceberg.mr.hive.IcebergAcidUtil;
 import org.apache.iceberg.mr.hive.writer.WriterBuilder.Context;
 import org.apache.iceberg.mr.mapred.Container;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.types.Types;
 
-class HiveIcebergCopyOnWriteRecordWriter extends HiveIcebergWriterBase {
-
-  private final int currentSpecId;
-  private final Set<String> missingColumns;
-  private final List<Types.NestedField> missingOrStructFields;
+class HiveIcebergCopyOnWriteRecordWriter extends SchemaInferringDefaultsWriter {
 
   private final GenericRecord rowDataTemplate;
   private final List<DataFile> replacedDataFiles;
 
-  private final HiveFileWriterFactory fileWriterFactory;
-
   HiveIcebergCopyOnWriteRecordWriter(Table table, HiveFileWriterFactory writerFactory,
       OutputFileFactory deleteFileFactory, Context context) {
-    super(table, newDataWriter(table, writerFactory, deleteFileFactory, context));
+    super(table, writerFactory, deleteFileFactory, context); // writer and defaults setup moved to the base class
 
-    this.currentSpecId = table.spec().specId();
     this.rowDataTemplate = GenericRecord.create(table.schema());
     this.replacedDataFiles = Lists.newArrayList();
-
-    this.missingColumns = context.missingColumns();
-    this.missingOrStructFields = specs.get(currentSpecId).schema().asStruct().fields().stream()
-        .filter(field -> missingColumns.contains(field.name()) || field.type().isStructType())
-        .toList();
-    this.fileWriterFactory = writerFactory;
   }
 
   @Override
@@ -82,9 +66,7 @@ public void write(Writable row) throws IOException {
             .build();
       replacedDataFiles.add(dataFile);
     } else {
-      HiveSchemaUtil.setDefaultValues(rowData, missingOrStructFields, missingColumns);
-      fileWriterFactory.initialize(() -> rowData);
-      writer.write(rowData, specs.get(currentSpecId), partition(rowData, currentSpecId));
+      writeOrBuffer(rowData);
     }
   }
 

diff --git a/...berg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergRecordWriter.java b/...berg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergRecordWriter.java
@@ -21,45 +21,27 @@
 
 import java.io.IOException;
 import java.util.List;
-import java.util.Set;
 import org.apache.hadoop.io.Writable;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.data.Record;
-import org.apache.iceberg.hive.HiveSchemaUtil;
 import org.apache.iceberg.io.DataWriteResult;
 import org.apache.iceberg.io.OutputFileFactory;
 import org.apache.iceberg.mr.hive.FilesForCommit;
 import org.apache.iceberg.mr.hive.writer.WriterBuilder.Context;
 import org.apache.iceberg.mr.mapred.Container;
-import org.apache.iceberg.types.Types;
 
-class HiveIcebergRecordWriter extends HiveIcebergWriterBase {
-
-  private final int currentSpecId;
-  private final Set<String> missingColumns;
-  private final List<Types.NestedField> missingOrStructFields;
-
-  private final HiveFileWriterFactory fileWriterFactory;
+class HiveIcebergRecordWriter extends SchemaInferringDefaultsWriter {
 
   HiveIcebergRecordWriter(Table table, HiveFileWriterFactory fileWriterFactory,
       OutputFileFactory dataFileFactory, Context context) {
-    super(table, newDataWriter(table, fileWriterFactory, dataFileFactory, context));
-
-    this.currentSpecId = table.spec().specId();
-    this.missingColumns = context.missingColumns();
-    this.missingOrStructFields = specs.get(currentSpecId).schema().asStruct().fields().stream()
-        .filter(field -> missingColumns.contains(field.name()) || field.type().isStructType())
-        .toList();
-    this.fileWriterFactory = fileWriterFactory;
+    super(table, fileWriterFactory, dataFileFactory, context); // base class creates the data writer
   }
 
   @Override
   public void write(Writable row) throws IOException {
     Record record = ((Container<Record>) row).get();
-    HiveSchemaUtil.setDefaultValues(record, missingOrStructFields, missingColumns);
-    fileWriterFactory.initialize(() -> record);
-    writer.write(record, specs.get(currentSpecId), partition(record, currentSpecId));
+    writeOrBuffer(record); // base class applies defaults, then writes now or buffers during variant sampling
   }
 
   @Override

diff --git a/...ceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterBase.java b/...ceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveIcebergWriterBase.java
@@ -80,7 +80,7 @@ public void close(boolean abort) throws IOException {
           .retry(3)
           .suppressFailureWhenFinished()
           .onFailure((file, exception) -> LOG.debug("Failed on to remove file {} on abort", file, exception))
-          .run(file -> io.deleteFile(file.path().toString()));
+          .run(file -> io.deleteFile(file.location()));
       LOG.warn("HiveIcebergWriter is closed with abort");
     }
 

diff --git a/...andler/src/main/java/org/apache/iceberg/mr/hive/writer/SchemaInferringDefaultsWriter.java b/...andler/src/main/java/org/apache/iceberg/mr/hive/writer/SchemaInferringDefaultsWriter.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.mr.hive.writer;
+
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.hive.HiveSchemaUtil;
+import org.apache.iceberg.io.OutputFileFactory;
+import org.apache.iceberg.mr.hive.writer.WriterBuilder.Context;
+import org.apache.iceberg.parquet.VariantUtil;
+import org.apache.iceberg.parquet.VariantUtil.VariantField;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Types.NestedField;
+
+abstract class SchemaInferringDefaultsWriter extends HiveIcebergWriterBase {
+  // Applies defaults to records; with variant fields present, early records are buffered to build a sample record.
+  private static final int VARIANT_SAMPLE_BUFFER_SIZE = 100; // cap on records buffered while sampling
+
+  private final HiveFileWriterFactory fileWriterFactory;
+
+  private final int currentSpecId;
+  private final Set<String> missingColumns;
+  private final List<NestedField> missingOrStructFields;
+
+  private final List<VariantField> variantFields; // from VariantUtil.variantFieldsForShredding; empty = no sampling
+  private final BitSet sampledVariantFields; // bit i is set once variantFields.get(i) has a sampled value
+
+  private final List<Record> buffer; // record copies held back during sampling; null when not sampling
+  private final Record accumulatedSample; // variant values merged from sampled records; null when not sampling
+  private boolean sampleInitialized = false; // set once writeOrBuffer has passed the sample to the factory
+
+  SchemaInferringDefaultsWriter(
+      Table table,
+      HiveFileWriterFactory fileWriterFactory,
+      OutputFileFactory dataFileFactory,
+      Context context) {
+
+    super(table, newDataWriter(table, fileWriterFactory, dataFileFactory, context));
+    Schema schema = table.schema();
+    this.fileWriterFactory = fileWriterFactory;
+
+    this.currentSpecId = table.spec().specId();
+    this.missingColumns = context.missingColumns();
+    this.missingOrStructFields = schema.columns().stream()
+        .filter(field -> missingColumns.contains(field.name()) || field.type().isStructType())
+        .toList();
+
+    this.variantFields = VariantUtil.variantFieldsForShredding(table.properties(), schema);
+    this.sampledVariantFields = new BitSet(variantFields.size());
+
+    boolean shouldBuffer = !variantFields.isEmpty(); // buffer and sample only when there are variant fields
+    this.buffer = shouldBuffer ? Lists.newArrayListWithCapacity(VARIANT_SAMPLE_BUFFER_SIZE) : null;
+    this.accumulatedSample = shouldBuffer ? GenericRecord.create(schema) : null;
+  }
+
+  protected void writeOrBuffer(Record record) {
+    HiveSchemaUtil.setDefaultValues(record, missingOrStructFields, missingColumns);
+
+    if (buffer != null && !sampleInitialized) { // sampling phase: the factory has no sample yet
+      accumulateSample(record);
+
+      if (allVariantFieldsSampled() || buffer.size() >= VARIANT_SAMPLE_BUFFER_SIZE) { // sampling done or buffer full
+        // Use accumulated sample for schema inference; set it before any buffered record is written
+        fileWriterFactory.initialize(accumulatedSample);
+        sampleInitialized = true;
+
+        flushBufferedRecords(); // earlier records go first; the current one is written below
+      } else {
+        buffer.add(record.copy()); // copy: presumably the caller may reuse the record instance - TODO confirm
+        return;
+      }
+    }
+    writeRecord(record);
+  }
+
+  private void writeRecord(Record record) {
+    writer.write(record, specs.get(currentSpecId), partition(record, currentSpecId));
+  }
+
+  private void flushBufferedRecords() {
+    for (Record bufferedRecord : buffer) { // written in the order they were buffered
+      writeRecord(bufferedRecord);
+    }
+    buffer.clear();
+  }
+
+  private boolean allVariantFieldsSampled() {
+    return sampledVariantFields.nextClearBit(0) >= variantFields.size(); // no clear bit below the field count
+  }
+
+  private void accumulateSample(Record record) {
+    if (accumulatedSample == null || allVariantFieldsSampled()) {
+      return;
+    }
+    for (int fieldIndex = sampledVariantFields.nextClearBit(0); // visit only fields still lacking a sample
+         fieldIndex < variantFields.size();
+         fieldIndex = sampledVariantFields.nextClearBit(fieldIndex + 1)) {
+      trySampleVariantField(fieldIndex, record);
+    }
+  }
+
+  private void trySampleVariantField(int fieldIndex, Record record) {
+    VariantField variantField = variantFields.get(fieldIndex);
+    Object val = safeGet(variantField, record);
+    if (!VariantUtil.isShreddable(val)) { // no sample taken from this record; the field stays unsampled
+      return;
+    }
+    HiveSchemaUtil.setStructField(accumulatedSample, variantField.path(), val);
+    sampledVariantFields.set(fieldIndex); // this field is skipped for all later records
+  }
+
+  private static Object safeGet(VariantField variantField, Record record) {
+    try {
+      return variantField.accessor().get(record);
+    } catch (RuntimeException e) {
+      // Treat unexpected access failures as "no sample" and keep scanning.
+      return null;
+    }
+  }
+
+  @Override
+  public void close(boolean abort) throws IOException {
+    if (buffer != null) {
+      if (abort) {
+        // Don't write anything on abort. Just drop any buffered records.
+        buffer.clear();
+      } else if (!buffer.isEmpty()) {
+        if (!sampleInitialized) {
+          // Use whatever we have accumulated so far
+          fileWriterFactory.initialize(accumulatedSample);
+        }
+        flushBufferedRecords(); // records held back during sampling are written before closing
+      }
+    }
+    super.close(abort);
+  }
+
+}