Commit 5d65f8e

Spark: Don't create empty partition replace operations
When attempting an insert overwrite with an empty dataset, we would previously throw an error. This patch makes Spark skip no-op partition replacement operations.
1 parent b347252 commit 5d65f8e
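
For context, the write that previously failed looks roughly like the sketch below. It mirrors the new test case added in this commit; location, SimpleRecord, and the surrounding SparkSession setup come from the test fixture, so treat the exact names as assumptions rather than a fixed API.

// Dynamic insert-overwrite from an empty DataFrame; before this patch the
// resulting ReplacePartitions commit would throw instead of being a no-op.
Dataset<Row> empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class);
empty.select("id", "data").write()
    .format("iceberg")
    .mode(SaveMode.Overwrite)
    .option("overwrite-mode", "dynamic")
    .save(location.toString());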

2 files changed, +52 -1 lines changed

spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java

Lines changed: 45 additions & 0 deletions
@@ -33,6 +33,7 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.SparkWriteOptions;
 import org.apache.iceberg.types.Types;
@@ -188,6 +189,50 @@ public void testAppend() throws IOException {
     Assert.assertEquals("Result rows should match", expected, actual);
   }
 
+  @Test
+  public void testEmptyOverwrite() throws IOException {
+    File parent = temp.newFolder(format.toString());
+    File location = new File(parent, "test");
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build();
+    Table table = tables.create(SCHEMA, spec, location.toString());
+
+    List<SimpleRecord> records = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "b"),
+        new SimpleRecord(3, "c")
+    );
+
+    List<SimpleRecord> expected = records;
+    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
+
+    df.select("id", "data").write()
+        .format("iceberg")
+        .option(SparkWriteOptions.WRITE_FORMAT, format.toString())
+        .mode(SaveMode.Append)
+        .save(location.toString());
+
+    Dataset<Row> empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class);
+    // dynamic overwrite with an empty dataset should be a no-op and keep the appended rows
+    empty.select("id", "data").write()
+        .format("iceberg")
+        .option(SparkWriteOptions.WRITE_FORMAT, format.toString())
+        .mode(SaveMode.Overwrite)
+        .option("overwrite-mode", "dynamic")
+        .save(location.toString());
+
+    table.refresh();
+
+    Dataset<Row> result = spark.read()
+        .format("iceberg")
+        .load(location.toString());
+
+    List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
+    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
+    Assert.assertEquals("Result rows should match", expected, actual);
+  }
+
   @Test
   public void testOverwrite() throws IOException {
     File parent = temp.newFolder(format.toString());

spark3/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java

Lines changed: 7 additions & 1 deletion
@@ -262,10 +262,16 @@ public void commit(WriterCommitMessage[] messages) {
   private class DynamicOverwrite extends BaseBatchWrite {
     @Override
     public void commit(WriterCommitMessage[] messages) {
+      Iterable<DataFile> files = files(messages);
+      if (Iterables.size(files) == 0) {
+        LOG.info("Dynamic overwrite is empty, skipping commit");
+        return;
+      }
+
       ReplacePartitions dynamicOverwrite = table.newReplacePartitions();
 
       int numFiles = 0;
-      for (DataFile file : files(messages)) {
+      for (DataFile file : files) {
        numFiles += 1;
        dynamicOverwrite.addFile(file);
      }
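
A note on the design choice, not part of the commit: the emptiness check could equally be written with Guava's Iterables.isEmpty, assuming the relocated Guava classes used here expose it; a minimal sketch of that alternative guard:

// hypothetical alternative to checking Iterables.size(files) == 0
if (Iterables.isEmpty(files)) {
  LOG.info("Dynamic overwrite is empty, skipping commit");
  return;
}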

0 commit comments

Comments
 (0)