Spark: Support writing shredded variant in Iceberg-Spark #14297
base: main
Changes from all commits: d903c5b, c570ed8, eeeb35c, 5c0533e, d7e15a7
```diff
@@ -51,7 +51,6 @@ class ParquetWriter<T> implements FileAppender<T>, Closeable {
   private final Map<String, String> metadata;
   private final ParquetProperties props;
   private final CodecFactory.BytesCompressor compressor;
-  private final MessageType parquetSchema;
   private final ParquetValueWriter<T> model;
   private final MetricsConfig metricsConfig;
   private final int columnIndexTruncateLength;
@@ -60,6 +59,7 @@ class ParquetWriter<T> implements FileAppender<T>, Closeable {
   private final Configuration conf;
   private final InternalFileEncryptor fileEncryptor;

+  private MessageType parquetSchema;
   private ColumnChunkPageWriteStore pageStore = null;
   private ColumnWriteStore writeStore;
   private long recordCount = 0;
@@ -134,6 +134,32 @@ private void ensureWriterInitialized() {

   @Override
   public void add(T value) {
+    if (model instanceof WriterLazyInitializable lazy) {
```
Member: This kind of feels to me like we should be making a subclass of ParquetWriter that only takes lazy models, rather than doing a special-case init here based on type matching.

Member: Another possibility: can we create the ParquetWriter after buffering? The current timeline seems a little odd: make the writer, start buffering data, then init the writer. I haven't walked through this, but I would have expected something like: buffer the data, then make the writer.

Contributor (Author): Let me try out such a refactoring.
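
As a rough illustration of the reviewer's suggested ordering, here is a minimal sketch; every name in it is hypothetical and none of this code is from the PR:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

// Hypothetical sketch of the "buffer data, then make the writer" timeline the
// reviewer describes: the sample is collected before any writer exists, so the
// writer can be constructed once, directly with the inferred schema.
final class BufferThenBuild {
  private BufferThenBuild() {}

  static <T, W> W bufferThenBuild(Iterable<T> sampleRows, Function<List<T>, W> writerFactory) {
    List<T> buffer = new ArrayList<>();
    for (T row : sampleRows) {
      buffer.add(row); // 1. buffer before any Parquet structures exist
    }
    // 2. the factory infers the schema from the sample and builds the writer
    //    exactly once; no stores need to be thrown away and rebuilt
    return writerFactory.apply(buffer);
  }
}
```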
```diff
+      if (lazy.needsInitialization()) {
+        model.write(0, value);
+        recordCount += 1;
+
+        if (!lazy.needsInitialization()) {
+          WriterLazyInitializable.InitializationResult result =
+              lazy.initialize(
+                  props, compressor, rowGroupOrdinal, columnIndexTruncateLength, fileEncryptor);
+          this.parquetSchema = result.getSchema();
+          this.pageStore.close();
+          this.pageStore = result.getPageStore();
+          this.writeStore.close();
+          this.writeStore = result.getWriteStore();
+
+          // Re-initialize the file writer with the new schema
+          ensureWriterInitialized();
+
+          // Buffered rows were already written with endRecord() calls
+          // in the lazy writer's initialization, so we don't call endRecord() here
+          checkSize();
+        }
+        return;
+      }
+    }
+
     recordCount += 1;
     model.write(0, value);
     writeStore.endRecord();
```
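
One subtlety in the lazy path above: needsInitialization() is checked a second time right after the write, because buffering the current value may be exactly what completes the sample. A condensed, hypothetical distillation of that flow follows; the class, threshold, and fields are illustrative stand-ins, not code from the PR:

```java
// Hypothetical distillation of the lazy add() path; SAMPLE_SIZE and the
// fields stand in for whatever policy the lazy model actually implements.
final class LazyAddFlow {
  private static final int SAMPLE_SIZE = 1_000; // assumed buffering threshold
  private int buffered = 0;
  private boolean initialized = false;

  boolean needsInitialization() {
    // true while still sampling; flips to false once the sample is complete
    return !initialized && buffered < SAMPLE_SIZE;
  }

  void add(Object value) {
    if (needsInitialization()) {
      buffered += 1; // model.write(0, value) buffers the row internally
      if (!needsInitialization()) {
        // this write completed the sample: infer the schema, rebuild the
        // stores, and replay the buffer (replay also ends each record)
        initialized = true;
      }
      return; // buffered rows are not individually ended on this path
    }
    // initialized: write through and end the record as usual
  }
}
```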
```diff
@@ -255,6 +281,24 @@ private void startRowGroup() {
   public void close() throws IOException {
     if (!closed) {
       this.closed = true;

+      if (model instanceof WriterLazyInitializable lazy) {
```
Member: This is a little confusing to me. Is this for the case in which our model has buffered data but not yet put the information into a row group? I.e., we are still deciding how to shred, but close has been called?

Contributor (Author): That's right. This handles the special case where only a few rows were written, which is not enough to trigger the schema inference logic before the file is closed, so we need to check here whether initialization has been done.

Contributor (Author): I updated the comment a little bit. Hope it makes sense.
```diff
+        // If initialization was not triggered because too little data was written,
+        // the lazy writer still needs to initialize and process the remaining buffered data
+        if (lazy.needsInitialization()) {
+          WriterLazyInitializable.InitializationResult result =
+              lazy.initialize(
+                  props, compressor, rowGroupOrdinal, columnIndexTruncateLength, fileEncryptor);
+          this.parquetSchema = result.getSchema();
+          this.pageStore.close();
+          this.pageStore = result.getPageStore();
+          this.writeStore.close();
+          this.writeStore = result.getWriteStore();
+
+          ensureWriterInitialized();
+        }
+      }
+
       flushRowGroup(true);
       writeStore.close();
       if (writer != null) {
```
New file: WriterLazyInitializable.java (93 lines)
```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.parquet;

import org.apache.parquet.column.ColumnWriteStore;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.hadoop.ColumnChunkPageWriteStore;
import org.apache.parquet.schema.MessageType;

/**
 * Interface for ParquetValueWriters that need to defer initialization until they can analyze the
 * data. This is useful for scenarios like variant shredding, where the schema needs to be inferred
 * from the actual data before creating the writer structures.
 *
 * <p>Writers implementing this interface can buffer initial rows and perform schema inference
 * before committing to a final Parquet schema.
 */
public interface WriterLazyInitializable {
  /**
   * Result returned by lazy initialization of a ParquetValueWriter, as required by ParquetWriter.
   * Contains the finalized schema and write stores after schema inference or other initialization
   * logic.
   */
  class InitializationResult {
    private final MessageType schema;
    private final ColumnChunkPageWriteStore pageStore;
    private final ColumnWriteStore writeStore;

    public InitializationResult(
        MessageType schema, ColumnChunkPageWriteStore pageStore, ColumnWriteStore writeStore) {
      this.schema = schema;
      this.pageStore = pageStore;
      this.writeStore = writeStore;
    }

    public MessageType getSchema() {
      return schema;
    }

    public ColumnChunkPageWriteStore getPageStore() {
      return pageStore;
    }

    public ColumnWriteStore getWriteStore() {
      return writeStore;
    }
  }

  /**
   * Checks if this writer still needs initialization. This will return true until the writer has
   * buffered enough data to perform initialization (e.g., schema inference).
   *
   * @return true if initialization is still needed, false if already initialized
   */
  boolean needsInitialization();

  /**
   * Performs initialization and returns the result containing the updated schema and write
   * stores. This method is called once: either as soon as {@link #needsInitialization()} starts
   * returning false because enough rows have been buffered, or from close() if too few rows
   * arrived to complete the sample.
   *
   * @param props Parquet properties needed for creating write stores
   * @param compressor Bytes compressor for compression
   * @param rowGroupOrdinal The ordinal number of the current row group
   * @param columnIndexTruncateLength The column index truncate length from the ParquetWriter
   *     config
   * @param fileEncryptor The file encryptor from the ParquetWriter, may be null if encryption is
   *     disabled
   * @return InitializationResult containing the finalized schema and write stores
   */
  InitializationResult initialize(
      ParquetProperties props,
      CompressionCodecFactory.BytesInputCompressor compressor,
      int rowGroupOrdinal,
      int columnIndexTruncateLength,
      InternalFileEncryptor fileEncryptor);
}
```
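
To make the contract concrete, here is a minimal, hypothetical skeleton of an implementing writer. The class name, the SAMPLE_SIZE threshold, and the elided helper methods are illustrative assumptions, not code from this PR:

```java
package org.apache.iceberg.parquet;

import org.apache.parquet.column.ColumnWriteStore;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.hadoop.ColumnChunkPageWriteStore;
import org.apache.parquet.schema.MessageType;

// Hypothetical sketch, not part of the PR: shows how an implementation could
// satisfy the needsInitialization()/initialize() contract used by ParquetWriter.
class BufferingVariantWriterSketch implements WriterLazyInitializable {
  private static final int SAMPLE_SIZE = 1_000; // assumed buffering threshold

  private int bufferedRows = 0; // incremented as the model buffers values
  private boolean initialized = false;

  @Override
  public boolean needsInitialization() {
    // True while still sampling; flips to false once enough rows are buffered,
    // which is what makes ParquetWriter.add() call initialize().
    return !initialized && bufferedRows < SAMPLE_SIZE;
  }

  @Override
  public InitializationResult initialize(
      ParquetProperties props,
      CompressionCodecFactory.BytesInputCompressor compressor,
      int rowGroupOrdinal,
      int columnIndexTruncateLength,
      InternalFileEncryptor fileEncryptor) {
    // 1. Infer the final (possibly shredded) Parquet schema from the sample.
    MessageType schema = inferSchemaFromBuffer();
    // 2. Build fresh stores for the inferred schema, replacing the ones the
    //    ParquetWriter created up front.
    ColumnChunkPageWriteStore pageStore =
        buildPageStore(schema, props, compressor, rowGroupOrdinal, columnIndexTruncateLength, fileEncryptor);
    ColumnWriteStore writeStore = buildWriteStore(pageStore, props);
    // 3. Replay the buffered rows into the new stores, ending each record, so
    //    ParquetWriter.add() does not call endRecord() for them again.
    replayBuffer(writeStore);
    this.initialized = true;
    return new InitializationResult(schema, pageStore, writeStore);
  }

  // The helpers below are deliberately elided; a real implementation would
  // mirror the store construction in ParquetWriter and re-drive its column
  // writers against the new stores.
  private MessageType inferSchemaFromBuffer() {
    throw new UnsupportedOperationException("sketch only");
  }

  private ColumnChunkPageWriteStore buildPageStore(
      MessageType schema,
      ParquetProperties props,
      CompressionCodecFactory.BytesInputCompressor compressor,
      int rowGroupOrdinal,
      int columnIndexTruncateLength,
      InternalFileEncryptor fileEncryptor) {
    throw new UnsupportedOperationException("sketch only");
  }

  private ColumnWriteStore buildWriteStore(
      ColumnChunkPageWriteStore pageStore, ParquetProperties props) {
    throw new UnsupportedOperationException("sketch only");
  }

  private void replayBuffer(ColumnWriteStore writeStore) {
    throw new UnsupportedOperationException("sketch only");
  }
}
```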
On another part of the diff:

Member: Any way we can avoid having to try/fail here? Just wondering, if it's only for decimals, can we do some decimal-specific check, or is this approach just much simpler?

Contributor (Author): This is simpler. We could probably introduce a DecimalWriter to handle that by checking whether there is a scale mismatch and falling back to writing to the value field. Do you prefer that approach?
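
For reference, a minimal sketch of the kind of decimal-specific check being discussed; the class, method, and parameter names are hypothetical, and in a real implementation the fallback decision would live inside the shredding writer:

```java
import java.math.BigDecimal;

// Hypothetical sketch of a scale-mismatch check: a decimal can only be
// shredded into the typed_value column if it fits the inferred decimal type;
// otherwise the writer would fall back to encoding it in the value field.
final class DecimalShreddingCheck {
  private DecimalShreddingCheck() {}

  static boolean fitsShreddedType(BigDecimal candidate, int inferredPrecision, int inferredScale) {
    // an exact scale match with enough precision is lossless;
    // anything else must fall back to the generic value field
    return candidate.scale() == inferredScale
        && candidate.precision() <= inferredPrecision;
  }
}
```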