diff --git a/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java b/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java index 37e0c1fffab0..4d7a631ab559 100644 --- a/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java @@ -46,6 +46,23 @@ public static SortOrder buildSortOrder(Table table, SortOrder sortOrder) { return buildSortOrder(table.schema(), table.spec(), sortOrder); } + /** + * Attempts to match a user-supplied {@link SortOrder} with an equivalent sort order from a {@link + * Table}. + * + * @param table the table to try and match the sort order against + * @param userSuppliedSortOrder the user supplied sort order to try and match with a table sort + * order + * @return the matching {@link SortOrder} from the table (with the orderId set) or {@link + * SortOrder#unsorted()} if no match is found. + */ + public static SortOrder maybeFindTableSortOrder(Table table, SortOrder userSuppliedSortOrder) { + return table.sortOrders().values().stream() + .filter(sortOrder -> sortOrder.sameOrder(userSuppliedSortOrder)) + .findFirst() + .orElseGet(SortOrder::unsorted); + } + /** * Build a final sort order that satisfies the clustering required by the partition spec. * diff --git a/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java b/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java index 02c81de93222..3757b70dd334 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java @@ -287,4 +287,68 @@ public void testSortOrderClusteringWithRedundantPartitionFieldsMissing() { .as("Should add spec fields as prefix") .isEqualTo(expected); } + + @Test + public void testFindSortOrderForTable() { + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); + TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 2); + + SortOrder tableSortOrder = table.sortOrder(); + + SortOrder actualOrder = SortOrderUtil.maybeFindTableSortOrder(table, tableSortOrder); + + assertThat(actualOrder).as("Should find current table sort order").isEqualTo(table.sortOrder()); + } + + @Test + public void testFindSortOrderForTableWithoutFieldId() { + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); + TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 2); + + SortOrder userSuppliedOrder = + SortOrder.builderFor(table.schema()).asc("id", NULLS_LAST).build(); + + SortOrder actualOrder = SortOrderUtil.maybeFindTableSortOrder(table, userSuppliedOrder); + + assertThat(actualOrder).as("Should find current table sort order").isEqualTo(table.sortOrder()); + } + + @Test + public void testFindSortOrderForTableThatIsNotCurrentOrder() { + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); + TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 2); + + table.replaceSortOrder().asc("data").desc("ts").commit(); + + SortOrder userSuppliedOrder = + SortOrder.builderFor(table.schema()).asc("id", NULLS_LAST).build(); + + SortOrder actualOrder = SortOrderUtil.maybeFindTableSortOrder(table, userSuppliedOrder); + + assertThat(actualOrder) + .as("Should find first sorted 
table sort order") + .isEqualTo(table.sortOrders().get(1)); + } + + @Test + public void testReturnsEmptyForFindingNonMatchingSortOrder() { + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); + TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 2); + + table.replaceSortOrder().asc("data").desc("ts").commit(); + + SortOrder userSuppliedOrder = + SortOrder.builderFor(table.schema()).desc("id", NULLS_LAST).build(); + + SortOrder actualOrder = SortOrderUtil.maybeFindTableSortOrder(table, userSuppliedOrder); + + assertThat(actualOrder) + .as( + "Should return unsorted order if user supplied order does not match any table sort order") + .isEqualTo(SortOrder.unsorted()); + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 2788e160d526..b20ad1c86f71 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -268,6 +268,14 @@ public boolean preserveDataGrouping() { .parse(); } + public boolean preserveDataOrdering() { + return confParser + .booleanConf() + .sessionConf(SparkSQLProperties.PRESERVE_DATA_ORDERING) + .defaultValue(SparkSQLProperties.PRESERVE_DATA_ORDERING_DEFAULT) + .parse(); + } + public boolean aggregatePushDownEnabled() { return confParser .booleanConf() diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index 81139969f746..c5ff0609a051 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -43,6 +43,11 @@ private SparkSQLProperties() {} "spark.sql.iceberg.planning.preserve-data-grouping"; public static final boolean PRESERVE_DATA_GROUPING_DEFAULT = false; + // Controls whether to preserve data ordering and report it to Spark + public static final String PRESERVE_DATA_ORDERING = + "spark.sql.iceberg.planning.preserve-data-ordering"; + public static final boolean PRESERVE_DATA_ORDERING_DEFAULT = false; + // Controls whether to push down aggregate (MAX/MIN/COUNT) to Iceberg public static final String AGGREGATE_PUSH_DOWN_ENABLED = "spark.sql.iceberg.aggregate-push-down.enabled"; diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 96131e0e56dd..f85fb0dfb9ff 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -42,6 +42,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableUtil; @@ -164,6 +165,22 @@ public int outputSpecId() { return outputSpecId; } + public SortOrder outputSortOrder() { + int outputSortOrderId = + confParser + .intConf() + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID) + .defaultValue(SortOrder.unsorted().orderId()) + .parse(); + + Preconditions.checkArgument( + 
table.sortOrders().containsKey(outputSortOrderId), + "Output sort order id %s is not a valid sort order id for table", + outputSortOrderId); + + return table.sortOrders().get(outputSortOrderId); + } + public FileFormat dataFileFormat() { String valueAsString = confParser @@ -284,6 +301,21 @@ public SparkWriteRequirements writeRequirements() { table, distributionMode(), fanoutWriterEnabled(), dataAdvisoryPartitionSize()); } + public SparkWriteRequirements rewriteFilesWriteRequirements() { + Preconditions.checkNotNull( + rewrittenFileSetId(), "Can only use rewrite files write requirements during rewrite job!"); + + SortOrder outputSortOrder = outputSortOrder(); + if (outputSortOrder.isSorted()) { + LOG.info( + "Found explicit sort order {} set in job configuration. Going to apply that to the sort-order-id of the rewritten files", + Spark3Util.describe(outputSortOrder)); + return writeRequirements().withTableSortOrder(outputSortOrder); + } + + return writeRequirements(); + } + @VisibleForTesting DistributionMode distributionMode() { String modeName = diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 33db70bae587..1be02feaf0c0 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -54,6 +54,7 @@ private SparkWriteOptions() {} public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; public static final String OUTPUT_SPEC_ID = "output-spec-id"; + public static final String OUTPUT_SORT_ORDER_ID = "output-sort-order-id"; public static final String OVERWRITE_MODE = "overwrite-mode"; diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteRequirements.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteRequirements.java index 833e0e44e391..dd4bc863912f 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteRequirements.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteRequirements.java @@ -26,18 +26,32 @@ /** A set of requirements such as distribution and ordering reported to Spark during writes. */ public class SparkWriteRequirements { + public static final long NO_ADVISORY_PARTITION_SIZE = 0; public static final SparkWriteRequirements EMPTY = - new SparkWriteRequirements(Distributions.unspecified(), new SortOrder[0], 0); + new SparkWriteRequirements( + Distributions.unspecified(), + new SortOrder[0], + org.apache.iceberg.SortOrder.unsorted(), + NO_ADVISORY_PARTITION_SIZE); private final Distribution distribution; private final SortOrder[] ordering; + private final org.apache.iceberg.SortOrder icebergOrdering; private final long advisoryPartitionSize; SparkWriteRequirements( - Distribution distribution, SortOrder[] ordering, long advisoryPartitionSize) { + Distribution distribution, + SortOrder[] ordering, + org.apache.iceberg.SortOrder icebergOrdering, + long advisoryPartitionSize) { this.distribution = distribution; this.ordering = ordering; - this.advisoryPartitionSize = advisoryPartitionSize; + this.icebergOrdering = icebergOrdering; + // Spark prohibits requesting a particular advisory partition size without distribution + this.advisoryPartitionSize = + distribution instanceof UnspecifiedDistribution + ? 
NO_ADVISORY_PARTITION_SIZE + : advisoryPartitionSize; } public Distribution distribution() { @@ -48,12 +62,19 @@ public SortOrder[] ordering() { return ordering; } + public org.apache.iceberg.SortOrder icebergOrdering() { + return icebergOrdering; + } + public boolean hasOrdering() { return ordering.length != 0; } public long advisoryPartitionSize() { - // Spark prohibits requesting a particular advisory partition size without distribution - return distribution instanceof UnspecifiedDistribution ? 0 : advisoryPartitionSize; + return advisoryPartitionSize; + } + + public SparkWriteRequirements withTableSortOrder(org.apache.iceberg.SortOrder sortOrder) { + return new SparkWriteRequirements(distribution, ordering, sortOrder, advisoryPartitionSize); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteUtil.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteUtil.java index 0d68a0d8cdd0..535674aba977 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteUtil.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteUtil.java @@ -55,13 +55,23 @@ public class SparkWriteUtil { private static final Expression[] PARTITION_FILE_CLUSTERING = clusterBy(SPEC_ID, PARTITION, FILE_PATH); - private static final SortOrder[] EMPTY_ORDERING = new SortOrder[0]; - private static final SortOrder[] EXISTING_ROW_ORDERING = orderBy(FILE_PATH, ROW_POSITION); - private static final SortOrder[] PARTITION_ORDERING = orderBy(SPEC_ID, PARTITION); - private static final SortOrder[] PARTITION_FILE_ORDERING = orderBy(SPEC_ID, PARTITION, FILE_PATH); - private static final SortOrder[] POSITION_DELETE_ORDERING = + private static final SortOrder[] EMPTY_SPARK_ORDERING = new SortOrder[0]; + private static final SortOrder[] EXISTING_ROW_SPARK_ORDERING = orderBy(FILE_PATH, ROW_POSITION); + private static final SortOrder[] PARTITION_SPARK_ORDERING = orderBy(SPEC_ID, PARTITION); + private static final SortOrder[] PARTITION_FILE_SPARK_ORDERING = + orderBy(SPEC_ID, PARTITION, FILE_PATH); + private static final SortOrder[] POSITION_DELETE_SPARK_ORDERING = orderBy(SPEC_ID, PARTITION, FILE_PATH, ROW_POSITION); + private static final SparkAndIcebergOrdering EXISTING_ROW_ORDERING = + SparkAndIcebergOrdering.unsorted().prependOrder(EXISTING_ROW_SPARK_ORDERING); + private static final SparkAndIcebergOrdering PARTITION_ORDERING = + SparkAndIcebergOrdering.unsorted().prependOrder(PARTITION_SPARK_ORDERING); + private static final SparkAndIcebergOrdering PARTITION_FILE_ORDERING = + SparkAndIcebergOrdering.unsorted().prependOrder(PARTITION_FILE_SPARK_ORDERING); + private static final SparkAndIcebergOrdering POSITION_DELETE_ORDERING = + SparkAndIcebergOrdering.unsorted().prependOrder(POSITION_DELETE_SPARK_ORDERING); + private SparkWriteUtil() {} /** Builds requirements for batch and micro-batch writes such as append or overwrite. 
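 *
 * <p>A rough usage sketch (hedged: the methods shown come from this patch, but the mode and size
 * values are illustrative only):
 *
 * <pre>{@code
 * SparkWriteRequirements reqs =
 *     SparkWriteUtil.writeRequirements(table, DistributionMode.HASH, true, 64L * 1024 * 1024);
 * Distribution distribution = reqs.distribution();   // reported to Spark
 * SortOrder[] sparkOrdering = reqs.ordering();       // reported to Spark
 * org.apache.iceberg.SortOrder icebergOrdering = reqs.icebergOrdering(); // recorded as the written files' sort order
 * }</pre>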
*/ @@ -69,8 +79,9 @@ public static SparkWriteRequirements writeRequirements( Table table, DistributionMode mode, boolean fanoutEnabled, long advisoryPartitionSize) { Distribution distribution = writeDistribution(table, mode); - SortOrder[] ordering = writeOrdering(table, fanoutEnabled); - return new SparkWriteRequirements(distribution, ordering, advisoryPartitionSize); + SparkAndIcebergOrdering ordering = writeOrdering(table, fanoutEnabled); + return new SparkWriteRequirements( + distribution, ordering.sparkOrder(), ordering.icebergOrder(), advisoryPartitionSize); } private static Distribution writeDistribution(Table table, DistributionMode mode) { @@ -82,7 +93,7 @@ private static Distribution writeDistribution(Table table, DistributionMode mode return Distributions.clustered(clustering(table)); case RANGE: - return Distributions.ordered(ordering(table)); + return Distributions.ordered(ordering(table).sparkOrder()); default: throw new IllegalArgumentException("Unsupported distribution mode: " + mode); @@ -99,8 +110,9 @@ public static SparkWriteRequirements copyOnWriteRequirements( if (command == DELETE || command == UPDATE) { Distribution distribution = copyOnWriteDeleteUpdateDistribution(table, mode); - SortOrder[] ordering = writeOrdering(table, fanoutEnabled); - return new SparkWriteRequirements(distribution, ordering, advisoryPartitionSize); + SparkAndIcebergOrdering ordering = writeOrdering(table, fanoutEnabled); + return new SparkWriteRequirements( + distribution, ordering.sparkOrder(), ordering.icebergOrder(), advisoryPartitionSize); } else { return writeRequirements(table, mode, fanoutEnabled, advisoryPartitionSize); } @@ -122,9 +134,9 @@ private static Distribution copyOnWriteDeleteUpdateDistribution( case RANGE: if (table.spec().isPartitioned() || table.sortOrder().isSorted()) { - return Distributions.ordered(ordering(table)); + return Distributions.ordered(ordering(table).sparkOrder()); } else { - return Distributions.ordered(EXISTING_ROW_ORDERING); + return Distributions.ordered(EXISTING_ROW_ORDERING.sparkOrder()); } default: @@ -142,12 +154,15 @@ public static SparkWriteRequirements positionDeltaRequirements( if (command == UPDATE || command == MERGE) { Distribution distribution = positionDeltaUpdateMergeDistribution(table, mode); - SortOrder[] ordering = positionDeltaUpdateMergeOrdering(table, fanoutEnabled); - return new SparkWriteRequirements(distribution, ordering, advisoryPartitionSize); + SparkAndIcebergOrdering ordering = positionDeltaUpdateMergeOrdering(table, fanoutEnabled); + return new SparkWriteRequirements( + distribution, ordering.sparkOrder(), ordering.icebergOrder(), advisoryPartitionSize); } else { Distribution distribution = positionDeltaDeleteDistribution(table, mode); - SortOrder[] ordering = fanoutEnabled ? EMPTY_ORDERING : POSITION_DELETE_ORDERING; - return new SparkWriteRequirements(distribution, ordering, advisoryPartitionSize); + SparkAndIcebergOrdering ordering = + fanoutEnabled ? 
SparkAndIcebergOrdering.unsorted() : POSITION_DELETE_ORDERING; + return new SparkWriteRequirements( + distribution, ordering.sparkOrder(), ordering.icebergOrder(), advisoryPartitionSize); } } @@ -167,9 +182,15 @@ private static Distribution positionDeltaUpdateMergeDistribution( case RANGE: if (table.spec().isUnpartitioned()) { - return Distributions.ordered(concat(PARTITION_FILE_ORDERING, ordering(table))); + return Distributions.ordered( + SparkAndIcebergOrdering.forTable(table) + .prependOrder(PARTITION_FILE_SPARK_ORDERING) + .sparkOrder()); } else { - return Distributions.ordered(concat(PARTITION_ORDERING, ordering(table))); + return Distributions.ordered( + SparkAndIcebergOrdering.forTable(table) + .prependOrder(PARTITION_SPARK_ORDERING) + .sparkOrder()); } default: @@ -177,11 +198,12 @@ private static Distribution positionDeltaUpdateMergeDistribution( } } - private static SortOrder[] positionDeltaUpdateMergeOrdering(Table table, boolean fanoutEnabled) { + private static SparkAndIcebergOrdering positionDeltaUpdateMergeOrdering( + Table table, boolean fanoutEnabled) { if (fanoutEnabled && table.sortOrder().isUnsorted()) { - return EMPTY_ORDERING; + return SparkAndIcebergOrdering.unsorted(); } else { - return concat(POSITION_DELETE_ORDERING, ordering(table)); + return SparkAndIcebergOrdering.forTable(table).prependOrder(POSITION_DELETE_SPARK_ORDERING); } } @@ -199,9 +221,9 @@ private static Distribution positionDeltaDeleteDistribution(Table table, Distrib case RANGE: if (table.spec().isUnpartitioned()) { - return Distributions.ordered(PARTITION_FILE_ORDERING); + return Distributions.ordered(PARTITION_FILE_ORDERING.sparkOrder()); } else { - return Distributions.ordered(PARTITION_ORDERING); + return Distributions.ordered(PARTITION_ORDERING.sparkOrder()); } default: @@ -213,9 +235,9 @@ private static Distribution positionDeltaDeleteDistribution(Table table, Distrib // - there is a defined table sort order, so it is clear how the data should be ordered // - the table is partitioned and fanout writers are disabled, // so records for one partition must be co-located within a task - private static SortOrder[] writeOrdering(Table table, boolean fanoutEnabled) { + private static SparkAndIcebergOrdering writeOrdering(Table table, boolean fanoutEnabled) { if (fanoutEnabled && table.sortOrder().isUnsorted()) { - return EMPTY_ORDERING; + return SparkAndIcebergOrdering.unsorted(); } else { return ordering(table); } @@ -225,8 +247,8 @@ private static Expression[] clustering(Table table) { return Spark3Util.toTransforms(table.spec()); } - private static SortOrder[] ordering(Table table) { - return Spark3Util.toOrdering(SortOrderUtil.buildSortOrder(table)); + private static SparkAndIcebergOrdering ordering(Table table) { + return SparkAndIcebergOrdering.forTable(table); } private static Expression[] concat(Expression[] clustering, Expression... otherClustering) { @@ -256,4 +278,39 @@ private static SortOrder[] orderBy(Expression... 
exprs) { private static SortOrder sort(Expression expr) { return Expressions.sort(expr, SortDirection.ASCENDING); } + + private static class SparkAndIcebergOrdering { + private static final SparkAndIcebergOrdering UNSORTED = + new SparkAndIcebergOrdering(org.apache.iceberg.SortOrder.unsorted(), EMPTY_SPARK_ORDERING); + + private final org.apache.iceberg.SortOrder icebergSortOrder; + private final SortOrder[] sparkSortOrder; + + private SparkAndIcebergOrdering( + org.apache.iceberg.SortOrder icebergSortOrder, SortOrder[] sparkSortOrder) { + this.icebergSortOrder = icebergSortOrder; + this.sparkSortOrder = sparkSortOrder; + } + + public static SparkAndIcebergOrdering forTable(Table table) { + return new SparkAndIcebergOrdering( + table.sortOrder(), Spark3Util.toOrdering(SortOrderUtil.buildSortOrder(table))); + } + + public static SparkAndIcebergOrdering unsorted() { + return UNSORTED; + } + + public SparkAndIcebergOrdering prependOrder(SortOrder[] ordering) { + return new SparkAndIcebergOrdering(icebergSortOrder, concat(ordering, sparkSortOrder)); + } + + public org.apache.iceberg.SortOrder icebergOrder() { + return icebergSortOrder; + } + + public SortOrder[] sparkOrder() { + return sparkSortOrder; + } + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java index 569eb252cba5..1ba4c7e2fac2 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java @@ -47,10 +47,14 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RequiresDistributionAndOrdering; import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Option; abstract class SparkShufflingFileRewriteRunner extends SparkDataFileRewriteRunner { + private static final Logger LOG = LoggerFactory.getLogger(SparkShufflingFileRewriteRunner.class); + /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. 
Attempting to generate @@ -119,6 +123,17 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { spec(fileGroup.outputSpecId()), fileGroup.expectedOutputFiles())); + org.apache.iceberg.SortOrder sortOrderInJobSpec = sortOrder(); + + org.apache.iceberg.SortOrder maybeMatchingTableSortOrder = + SortOrderUtil.maybeFindTableSortOrder(table(), sortOrder()); + + if (sortOrderInJobSpec.isSorted() && maybeMatchingTableSortOrder.isUnsorted()) { + LOG.warn( + "Sort order specified for job {} doesn't match any table sort orders, so going to not mark rewritten files as sorted in the manifest files", + Spark3Util.describe(sortOrderInJobSpec)); + } + sortedDF .write() .format("iceberg") @@ -126,6 +141,7 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, fileGroup.maxOutputFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .option(SparkWriteOptions.OUTPUT_SPEC_ID, fileGroup.outputSpecId()) + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, maybeMatchingTableSortOrder.orderId()) .mode("append") .save(groupId); } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowComparator.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowComparator.java new file mode 100644 index 000000000000..d1ffcea10149 --- /dev/null +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowComparator.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.Comparator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.SortOrderComparators; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +/** + * A comparator for Spark {@link InternalRow} objects based on an Iceberg {@link SortOrder}. + * + *

<p>This comparator adapts Spark's InternalRow to Iceberg's StructLike interface and delegates to
 * Iceberg's existing {@link SortOrderComparators} infrastructure, which provides full support for
 * sort directions, null ordering, and the Iceberg types for which comparators are defined.
 *

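 * <p>A minimal usage sketch (mirrors the new TestInternalRowComparator test; the schema and the
 * {@code id} column are illustrative):
 *
 * <pre>{@code
 * Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get()));
 * SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build();
 * StructType sparkSchema = SparkSchemaUtil.convert(schema);
 *
 * // row1 and row2 are any two InternalRow values matching sparkSchema
 * Comparator<InternalRow> comparator = new InternalRowComparator(sortOrder, sparkSchema, schema);
 * int cmp = comparator.compare(row1, row2); // negative when row1 sorts before row2
 * }</pre>
 *
 * <p>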
This class is NOT thread-safe. + */ +class InternalRowComparator implements Comparator { + private final Comparator delegate; + private final InternalRowWrapper leftWrapper; + private final InternalRowWrapper rightWrapper; + + /** + * Creates a comparator for the given sort order and schemas. + * + * @param sortOrder the Iceberg sort order to use for comparison + * @param sparkSchema the Spark schema of the rows to compare + * @param icebergSchema the Iceberg schema of the rows to compare + */ + InternalRowComparator(SortOrder sortOrder, StructType sparkSchema, Schema icebergSchema) { + Preconditions.checkArgument( + sortOrder.isSorted(), "Cannot create comparator for unsorted order"); + Preconditions.checkNotNull(sparkSchema, "Spark schema cannot be null"); + Preconditions.checkNotNull(icebergSchema, "Iceberg schema cannot be null"); + + this.delegate = SortOrderComparators.forSchema(icebergSchema, sortOrder); + this.leftWrapper = new InternalRowWrapper(sparkSchema, icebergSchema.asStruct()); + this.rightWrapper = new InternalRowWrapper(sparkSchema, icebergSchema.asStruct()); + } + + @Override + public int compare(InternalRow row1, InternalRow row2) { + return delegate.compare(leftWrapper.wrap(row1), rightWrapper.wrap(row2)); + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingPartitionReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingPartitionReader.java new file mode 100644 index 000000000000..e4997c3eb94a --- /dev/null +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingPartitionReader.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.SortedMerge; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.connector.read.PartitionReader; +import org.apache.spark.sql.types.StructType; + +/** + * A {@link PartitionReader} that performs a k-way merge of multiple sorted readers. + * + *
<p>
This reader takes multiple {@link PartitionReader}s (one per file), each producing sorted data + * according to the same {@link SortOrder}, and merges them into a single sorted stream using + * Iceberg's {@link SortedMerge} utility. + * + *
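 * <p>In essence, the constructor below wires up something like this simplified sketch:
 *
 * <pre>{@code
 * Comparator<InternalRow> comparator =
 *     new InternalRowComparator(sortOrder, sparkSchema, icebergSchema);
 * List<CloseableIterable<InternalRow>> inputs = ...; // one sorted iterable per file reader
 * CloseableIterator<InternalRow> merged = new SortedMerge<>(comparator, inputs).iterator();
 * }</pre>
 *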
<p>
The merge is performed using a priority queue (heap) to efficiently select the next row from + * among all readers, maintaining the sort order with O(log k) comparisons per row, where k is the + * number of files being merged. + * + * @param the type of InternalRow being read + */ +class MergingPartitionReader implements PartitionReader { + private final List> readers; + private final CloseableIterator mergedIterator; + private T current = null; + private boolean closed = false; + + MergingPartitionReader( + List> readers, + SortOrder sortOrder, + StructType sparkSchema, + Schema icebergSchema) { + Preconditions.checkNotNull(readers, "Readers cannot be null"); + Preconditions.checkArgument(!readers.isEmpty(), "Readers cannot be empty"); + Preconditions.checkNotNull(sortOrder, "Sort order cannot be null"); + Preconditions.checkArgument(sortOrder.isSorted(), "Sort order must be sorted"); + + this.readers = readers; + + Comparator comparator = + (Comparator) new InternalRowComparator(sortOrder, sparkSchema, icebergSchema); + + List> iterables = + readers.stream().map(this::readerToIterable).collect(Collectors.toList()); + + SortedMerge sortedMerge = new SortedMerge<>(comparator, iterables); + this.mergedIterator = sortedMerge.iterator(); + } + + /** Converts a PartitionReader to a CloseableIterable for use with SortedMerge. */ + private CloseableIterable readerToIterable(PartitionReader reader) { + return new CloseableIterable() { + @Override + public CloseableIterator iterator() { + return new CloseableIterator() { + private boolean advanced = false; + private boolean hasNext = false; + + @Override + public boolean hasNext() { + if (!advanced) { + try { + hasNext = reader.next(); + advanced = true; + } catch (IOException e) { + throw new RuntimeException("Failed to advance reader", e); + } + } + return hasNext; + } + + @Override + public T next() { + if (!advanced) { + hasNext(); + } + advanced = false; + // Spark readers reuse InternalRow objects for performance (see + // SparkParquetReaders.java:547) + // Return a copy of the row to avoid corruption. + return (T) reader.get().copy(); + } + + @Override + public void close() throws IOException { + reader.close(); + } + }; + } + + @Override + public void close() throws IOException { + reader.close(); + } + }; + } + + @Override + public boolean next() throws IOException { + if (mergedIterator.hasNext()) { + this.current = mergedIterator.next(); + return true; + } + return false; + } + + @Override + public T get() { + return current; + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + + try { + mergedIterator.close(); + } finally { + closed = true; + } + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingSortedRowDataReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingSortedRowDataReader.java new file mode 100644 index 000000000000..180857b0f231 --- /dev/null +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/MergingSortedRowDataReader.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.iceberg.BaseScanTaskGroup; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ScanTaskGroup; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.connector.metric.CustomTaskMetric; +import org.apache.spark.sql.connector.read.PartitionReader; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A {@link PartitionReader} that reads multiple sorted files and merges them into a single sorted + * stream. + * + *
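 * <p>Ordering-preserving reads are opt-in through the Spark session configuration; the property
 * below is the one added in SparkSQLProperties:
 *
 * <pre>{@code
 * spark.conf().set("spark.sql.iceberg.planning.preserve-data-ordering", "true");
 * }</pre>
 *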
<p>
This reader is used when {@code preserve-data-ordering} is enabled and the task group contains + * multiple files that all have the same sort order. It creates one {@link RowDataReader} per file + * and uses {@link MergingPartitionReader} to perform a k-way merge. + */ +class MergingSortedRowDataReader implements PartitionReader { + private static final Logger LOG = LoggerFactory.getLogger(MergingSortedRowDataReader.class); + + private final MergingPartitionReader mergingReader; + private final List fileReaders; + + MergingSortedRowDataReader(SparkInputPartition partition, int reportableSortOrderId) { + Table table = partition.table(); + ScanTaskGroup taskGroup = partition.taskGroup(); + Schema tableSchema = SnapshotUtil.schemaFor(table, partition.branch()); + Schema expectedSchema = partition.expectedSchema(); + + Preconditions.checkArgument( + reportableSortOrderId > 0, "Invalid sort order ID: %s", reportableSortOrderId); + Preconditions.checkArgument( + taskGroup.tasks().size() > 1, + "Merging reader requires multiple files, got %s", + taskGroup.tasks().size()); + + LOG.info( + "Creating merging reader for {} files with sort order ID {} in table {}", + taskGroup.tasks().size(), + reportableSortOrderId, + table.name()); + + SortOrder sortOrder = table.sortOrders().get(reportableSortOrderId); + Preconditions.checkNotNull( + sortOrder, + "Cannot find sort order with ID %s in table %s", + reportableSortOrderId, + table.name()); + + this.fileReaders = + taskGroup.tasks().stream() + .map( + task -> { + ScanTaskGroup singleTaskGroup = + new BaseScanTaskGroup<>(java.util.Collections.singletonList(task)); + + return new RowDataReader( + table, + singleTaskGroup, + tableSchema, + expectedSchema, + partition.isCaseSensitive(), + partition.cacheDeleteFilesOnExecutors()); + }) + .collect(Collectors.toList()); + + List> readers = + fileReaders.stream() + .map(reader -> (PartitionReader) reader) + .collect(Collectors.toList()); + + StructType sparkSchema = SparkSchemaUtil.convert(expectedSchema); + this.mergingReader = + new MergingPartitionReader<>(readers, sortOrder, sparkSchema, expectedSchema); + } + + @Override + public boolean next() throws IOException { + return mergingReader.next(); + } + + @Override + public InternalRow get() { + return mergingReader.get(); + } + + @Override + public void close() throws IOException { + mergingReader.close(); + } + + public CustomTaskMetric[] currentMetricsValues() { + long totalSplits = fileReaders.size(); + + long totalDeletes = + fileReaders.stream() + .flatMap(reader -> Arrays.stream(reader.currentMetricsValues())) + .filter( + metric -> metric instanceof org.apache.iceberg.spark.source.metrics.TaskNumDeletes) + .mapToLong(CustomTaskMetric::value) + .sum(); + + return new CustomTaskMetric[] { + new org.apache.iceberg.spark.source.metrics.TaskNumSplits(totalSplits), + new org.apache.iceberg.spark.source.metrics.TaskNumDeletes(totalDeletes) + }; + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java index 0626d0b43985..eb812338e149 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java @@ -91,16 +91,24 @@ public InputPartition[] planInputPartitions() { InputPartition[] partitions = new InputPartition[taskGroups.size()]; for (int index = 0; index < taskGroups.size(); index++) { + ScanTaskGroup taskGroup = 
taskGroups.get(index); + + Integer reportableSortOrderId = null; + if (readConf.preserveDataOrdering()) { + reportableSortOrderId = table.sortOrder().orderId(); + } + partitions[index] = new SparkInputPartition( groupingKeyType, - taskGroups.get(index), + taskGroup, tableBroadcast, branch, expectedSchemaString, caseSensitive, locations != null ? locations[index] : SparkPlanningUtil.NO_LOCATION_PREFERENCE, - cacheDeleteFilesOnExecutors); + cacheDeleteFilesOnExecutors, + reportableSortOrderId); } return partitions; @@ -160,6 +168,12 @@ private boolean useParquetBatchReads() { private boolean supportsParquetBatchReads(ScanTask task) { if (task instanceof ScanTaskGroup) { ScanTaskGroup taskGroup = (ScanTaskGroup) task; + + // Vectorized readers cannot merge sorted data from multiple files + if (readConf.preserveDataOrdering() && taskGroup.tasks().size() > 1) { + return false; + } + return taskGroup.tasks().stream().allMatch(this::supportsParquetBatchReads); } else if (task.isFileScanTask() && !task.isDataTask()) { @@ -200,6 +214,12 @@ private boolean useOrcBatchReads() { private boolean supportsOrcBatchReads(ScanTask task) { if (task instanceof ScanTaskGroup) { ScanTaskGroup taskGroup = (ScanTaskGroup) task; + + // Vectorized readers cannot merge sorted data from multiple files + if (readConf.preserveDataOrdering() && taskGroup.tasks().size() > 1) { + return false; + } + return taskGroup.tasks().stream().allMatch(this::supportsOrcBatchReads); } else if (task.isFileScanTask() && !task.isDataTask()) { diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkInputPartition.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkInputPartition.java index 99b1d78a86b0..e2e4037aaac0 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkInputPartition.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkInputPartition.java @@ -39,6 +39,7 @@ class SparkInputPartition implements InputPartition, HasPartitionKey, Serializab private final boolean caseSensitive; private final transient String[] preferredLocations; private final boolean cacheDeleteFilesOnExecutors; + private final Integer reportableSortOrderId; private transient Schema expectedSchema = null; @@ -50,7 +51,8 @@ class SparkInputPartition implements InputPartition, HasPartitionKey, Serializab String expectedSchemaString, boolean caseSensitive, String[] preferredLocations, - boolean cacheDeleteFilesOnExecutors) { + boolean cacheDeleteFilesOnExecutors, + Integer reportableSortOrderId) { this.groupingKeyType = groupingKeyType; this.taskGroup = taskGroup; this.tableBroadcast = tableBroadcast; @@ -59,6 +61,7 @@ class SparkInputPartition implements InputPartition, HasPartitionKey, Serializab this.caseSensitive = caseSensitive; this.preferredLocations = preferredLocations; this.cacheDeleteFilesOnExecutors = cacheDeleteFilesOnExecutors; + this.reportableSortOrderId = reportableSortOrderId; } @Override @@ -103,4 +106,8 @@ public Schema expectedSchema() { return expectedSchema; } + + public Integer reportableSortOrderId() { + return reportableSortOrderId; + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index 60dd1f318ca5..b5d7e1b2473e 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ 
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -172,7 +172,8 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { expectedSchema, caseSensitive, locations != null ? locations[index] : SparkPlanningUtil.NO_LOCATION_PREFERENCE, - cacheDeleteFilesOnExecutors); + cacheDeleteFilesOnExecutors, + null); } return partitions; diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java index c9726518ee4e..9ac86d3c0730 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java @@ -48,7 +48,9 @@ import org.apache.iceberg.util.StructLikeSet; import org.apache.iceberg.util.TableScanUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.read.SupportsReportOrdering; import org.apache.spark.sql.connector.read.SupportsReportPartitioning; import org.apache.spark.sql.connector.read.partitioning.KeyGroupedPartitioning; import org.apache.spark.sql.connector.read.partitioning.Partitioning; @@ -57,12 +59,13 @@ import org.slf4j.LoggerFactory; abstract class SparkPartitioningAwareScan extends SparkScan - implements SupportsReportPartitioning { + implements SupportsReportPartitioning, SupportsReportOrdering { private static final Logger LOG = LoggerFactory.getLogger(SparkPartitioningAwareScan.class); private final Scan> scan; private final boolean preserveDataGrouping; + private final boolean preserveDataOrdering; private Set specs = null; // lazy cache of scanned specs private List tasks = null; // lazy cache of uncombined tasks @@ -82,6 +85,7 @@ abstract class SparkPartitioningAwareScan extends S this.scan = scan; this.preserveDataGrouping = readConf.preserveDataGrouping(); + this.preserveDataOrdering = readConf.preserveDataOrdering(); if (scan == null) { this.specs = Collections.emptySet(); @@ -114,6 +118,57 @@ public Partitioning outputPartitioning() { } } + @Override + public SortOrder[] outputOrdering() { + if (!preserveDataOrdering) { + return new SortOrder[0]; + } + + if (groupingKeyType().fields().isEmpty()) { + LOG.info("Not reporting ordering for unpartitioned table {}", table().name()); + return new SortOrder[0]; + } + + org.apache.iceberg.SortOrder currentSortOrder = table().sortOrder(); + if (currentSortOrder.isUnsorted()) { + return new SortOrder[0]; + } + + if (!allFilesHaveSortOrder(currentSortOrder.orderId())) { + LOG.info( + "Not all files have current table sort order {}, not reporting ordering", + currentSortOrder.orderId()); + return new SortOrder[0]; + } + + SortOrder[] ordering = Spark3Util.toOrdering(currentSortOrder); + LOG.info( + "Reporting sort order {} for table {}: {}", + currentSortOrder.orderId(), + table().name(), + ordering); + + return ordering; + } + + private boolean allFilesHaveSortOrder(int expectedSortOrderId) { + for (ScanTaskGroup taskGroup : taskGroups()) { + for (T task : taskGroup.tasks()) { + if (!(task instanceof org.apache.iceberg.FileScanTask)) { + continue; + } + + org.apache.iceberg.FileScanTask fileTask = (org.apache.iceberg.FileScanTask) task; + Integer fileSortOrderId = fileTask.file().sortOrderId(); + + if (fileSortOrderId == null || fileSortOrderId 
!= expectedSortOrderId) { + return false; + } + } + } + return true; + } + @Override protected StructType groupingKeyType() { if (groupingKeyType == null) { diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index d072397dc6a3..e0c842e9a6d7 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -182,7 +182,8 @@ public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { broadcastRewritableDeletes(), command, context, - writeProperties); + writeProperties, + writeRequirements.icebergOrdering()); } private Broadcast> broadcastRewritableDeletes() { @@ -392,18 +393,21 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { private final Command command; private final Context context; private final Map writeProperties; + private final org.apache.iceberg.SortOrder sortOrder; PositionDeltaWriteFactory( Broadcast tableBroadcast, Broadcast> rewritableDeletesBroadcast, Command command, Context context, - Map writeProperties) { + Map writeProperties, + org.apache.iceberg.SortOrder sortOrder) { this.tableBroadcast = tableBroadcast; this.rewritableDeletesBroadcast = rewritableDeletesBroadcast; this.command = command; this.context = context; this.writeProperties = writeProperties; + this.sortOrder = sortOrder; } @Override @@ -430,6 +434,7 @@ public DeltaWriter createWriter(int partitionId, long taskId) { .deleteFileFormat(context.deleteFileFormat()) .positionDeleteSparkType(context.deleteSparkType()) .writeProperties(writeProperties) + .dataSortOrder(sortOrder) .build(); if (command == DELETE) { diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowReaderFactory.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowReaderFactory.java index 23699aeb167c..9e05464ee05a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowReaderFactory.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowReaderFactory.java @@ -42,7 +42,12 @@ public PartitionReader createReader(InputPartition inputPartition) SparkInputPartition partition = (SparkInputPartition) inputPartition; if (partition.allTasksOfType(FileScanTask.class)) { - return new RowDataReader(partition); + Integer reportableSortOrderId = partition.reportableSortOrderId(); + if (reportableSortOrderId != null && partition.taskGroup().tasks().size() > 1) { + return new MergingSortedRowDataReader(partition, reportableSortOrderId); + } else { + return new RowDataReader(partition); + } } else if (partition.allTasksOfType(ChangelogScanTask.class)) { return new ChangelogRowReader(partition); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java index 106b296de098..c32cee882398 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java @@ -343,6 +343,13 @@ public CustomMetric[] supportedCustomMetrics() { } protected long adjustSplitSize(List tasks, long splitSize) { + if (readConf.preserveDataOrdering()) { + // Diable splitting tasks into multiple groups when we need to preserve ordering. 
+ // This prevents multiple InputPartitions with the same partitionKey, which would + // cause Spark to suppress outputOrdering. + return Long.MAX_VALUE; + } + if (readConf.splitSizeOption() == null && readConf.adaptiveSplitSizeEnabled()) { long scanSize = tasks.stream().mapToLong(ScanTask::sizeBytes).sum(); int parallelism = readConf.parallelism(); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index c9a94090ef89..521cb8a2287f 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -205,7 +205,8 @@ private WriterFactory createWriterFactory() { writeSchema, dsSchema, useFanoutWriter, - writeProperties); + writeProperties, + writeRequirements.icebergOrdering()); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -675,6 +676,7 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final boolean useFanoutWriter; private final String queryId; private final Map writeProperties; + private final org.apache.iceberg.SortOrder sortOrder; protected WriterFactory( Broadcast
tableBroadcast, @@ -685,7 +687,8 @@ protected WriterFactory( Schema writeSchema, StructType dsSchema, boolean useFanoutWriter, - Map writeProperties) { + Map writeProperties, + org.apache.iceberg.SortOrder sortOrder) { this.tableBroadcast = tableBroadcast; this.format = format; this.outputSpecId = outputSpecId; @@ -695,6 +698,7 @@ protected WriterFactory( this.useFanoutWriter = useFanoutWriter; this.queryId = queryId; this.writeProperties = writeProperties; + this.sortOrder = sortOrder; } @Override @@ -719,6 +723,7 @@ public DataWriter createWriter(int partitionId, long taskId, long e .dataSchema(writeSchema) .dataSparkType(dsSchema) .writeProperties(writeProperties) + .dataSortOrder(sortOrder) .build(); Function rowLineageExtractor = new ExtractRowLineage(writeSchema); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 89af7740d988..70230a91fc28 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -190,7 +190,9 @@ public StreamingWrite toStreaming() { } private SparkWriteRequirements writeRequirements() { - if (overwriteFiles) { + if (rewrittenFileSetId != null) { + return writeConf.rewriteFilesWriteRequirements(); + } else if (overwriteFiles) { return writeConf.copyOnWriteRequirements(copyOnWriteCommand); } else { return writeConf.writeRequirements(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..5f39a14a1a9e 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -53,13 +53,18 @@ import org.apache.spark.sql.execution.QueryExecution; import org.apache.spark.sql.execution.SparkPlan; import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec; +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.util.QueryExecutionListener; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import scala.PartialFunction; +import scala.collection.JavaConverters; public abstract class TestBase extends SparkTestHelperBase { + private static final AdaptiveSparkPlanHelper SPARK_HELPER = new AdaptiveSparkPlanHelper() {}; + protected static TestHiveMetastore metastore = null; protected static HiveConf hiveConf = null; protected static SparkSession spark = null; @@ -282,6 +287,25 @@ public void onFailure(String funcName, QueryExecution qe, Exception exception) { } } + /** Collect all nodes of a specific plan type from the plan tree. 
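For example (a hedged usage sketch; {@code AdaptiveSparkPlanExec} is simply one plan type that this
   * class already imports, and {@code plan} is any captured physical plan):
   *
   * <pre>{@code
   * List<AdaptiveSparkPlanExec> adaptivePlans = collectPlans(plan, AdaptiveSparkPlanExec.class);
   * }</pre>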
*/ + protected List collectPlans(SparkPlan plan, Class planClass) { + scala.collection.Seq seq = + SPARK_HELPER.collect( + plan, + new PartialFunction() { + @Override + public T apply(SparkPlan p) { + return planClass.cast(p); + } + + @Override + public boolean isDefinedAt(SparkPlan p) { + return planClass.isInstance(p); + } + }); + return JavaConverters.seqAsJavaListConverter(seq).asJava(); + } + @FunctionalInterface protected interface Action { void invoke(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index 61aacfa4589d..247119523756 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -45,6 +45,7 @@ import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.time.Duration; @@ -600,6 +601,27 @@ public void testDVWriteConf() { assertThat(writeConf.deleteFileFormat()).isEqualTo(FileFormat.PUFFIN); } + @TestTemplate + public void testSortOrderWriteConf() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConf = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "1")); + + assertThat(writeConf.outputSortOrder()).isEqualTo(table.sortOrder()); + + SparkWriteConf writeConfForUnknownSortOrder = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "999")); + + assertThatIllegalArgumentException() + .isThrownBy(writeConfForUnknownSortOrder::outputSortOrder) + .withMessage("Output sort order id 999 is not a valid sort order id for table"); + } + private void testWriteProperties(List> propertiesSuite) { withSQLConf( propertiesSuite.get(0), diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 6d965f3dcc62..5d8a6da51c4e 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -1587,6 +1587,7 @@ public void testSimpleSort() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate @@ -1623,6 +1624,7 @@ public void testSortAfterPartitionChange() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate @@ -1654,6 +1656,9 @@ public void testSortCustomSortOrder() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + // Since the SortOrder isn't in the table spec, these data files should report having the + // default table sort order e.g. 
unsorted + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -1694,6 +1699,58 @@ public void testSortCustomSortOrderRequiresRepartition() { shouldHaveMultipleFiles(table); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveLastCommitSorted(table, "c3"); + // Since the table ordering is on C2, but we rewrote explicitly with C3 which isn't a sort order + // spec, the table files should report unsorted + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); + } + + @TestTemplate + public void testSortPastTableSortOrderGetsAppliedToFiles() { + int partitions = 4; + Table table = createTable(); + writeRecords(20, SCALE, partitions); + shouldHaveLastCommitUnsorted(table, "c3"); + + // Add a partition column so this requires repartitioning + table.updateSpec().addField("c1").commit(); + + // Add the sort order we want to use during the rewrite job + table.replaceSortOrder().asc("c3").commit(); + SortOrder c3SortOrder = table.sortOrder(); + + // Replace that sort order with a newer one that we aren't going to use, but is the current + // table ordering + table.replaceSortOrder().asc("c2").commit(); + shouldHaveFiles(table, 20); + + List originalData = currentData(); + long dataSizeBefore = testDataSize(table); + + RewriteDataFiles.Result result = + basicRewrite(table) + .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / partitions)) + .execute(); + + assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); + assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); + + table.refresh(); + + List postRewriteData = currentData(); + assertEquals("We shouldn't have changed the data", originalData, postRewriteData); + + shouldHaveSnapshots(table, 2); + shouldHaveACleanCache(table); + shouldHaveMultipleFiles(table); + shouldHaveLastCommitUnsorted(table, "c2"); + shouldHaveLastCommitSorted(table, "c3"); + // Since the table ordering is on C2, but we rewrote explicitly with C3 which is in the table + // sort order spec, the table files should report C3 sort order + dataFilesShouldHaveSortOrderIdMatching(table, c3SortOrder); } @TestTemplate @@ -1734,6 +1791,9 @@ public void testAutoSortShuffleOutput() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + // Since the sort order being applied here isn't anywhere on the table spec, all files despite + // being physically sorted should report unsorted in the manifest entry + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -2657,4 +2717,18 @@ public boolean matches(RewriteFileGroup argument) { return groupIDs.contains(argument.info().globalIndex()); } } + + private void dataFilesSortOrderShouldMatchTableSortOrder(Table table) { + dataFilesShouldHaveSortOrderIdMatching(table, table.sortOrder()); + } + + private void dataFilesShouldHaveSortOrderIdMatching(Table table, SortOrder sortOrder) { + try (CloseableIterable files = table.newScan().planFiles()) { + assertThat(files) + .extracting(fileScanTask -> fileScanTask.file().sortOrderId()) + .containsOnly(sortOrder.orderId()); + } catch (IOException e) { + throw new RuntimeException("Failed to close file scan tasks", e); + } + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowComparator.java 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowComparator.java new file mode 100644 index 000000000000..586a292e2f7d --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowComparator.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Comparator; +import org.apache.iceberg.NullOrder; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.types.Types; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; +import org.junit.jupiter.api.Test; + +public class TestInternalRowComparator { + + @Test + public void testAscendingInteger() { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(1); + InternalRow row2 = createRow(2); + InternalRow row3 = createRow(1); + + assertThat(comparator.compare(row1, row2)).isLessThan(0); + assertThat(comparator.compare(row2, row1)).isGreaterThan(0); + assertThat(comparator.compare(row1, row3)).isEqualTo(0); + } + + @Test + public void testDescendingInteger() { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).desc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(1); + InternalRow row2 = createRow(2); + + // DESC: higher values come first + assertThat(comparator.compare(row1, row2)).isGreaterThan(0); + assertThat(comparator.compare(row2, row1)).isLessThan(0); + } + + @Test + public void testNullsFirst() { + Schema schema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id", NullOrder.NULLS_FIRST).build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow nullRow = createRow(new Object[] {null}); + InternalRow row1 = createRow(1); + + // NULLS_FIRST: null < 
1 + assertThat(comparator.compare(nullRow, row1)).isLessThan(0); + assertThat(comparator.compare(row1, nullRow)).isGreaterThan(0); + } + + @Test + public void testNullsLast() { + Schema schema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id", NullOrder.NULLS_LAST).build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow nullRow = createRow(new Object[] {null}); + InternalRow row1 = createRow(1); + + // NULLS_LAST: null > 1 + assertThat(comparator.compare(nullRow, row1)).isGreaterThan(0); + assertThat(comparator.compare(row1, nullRow)).isLessThan(0); + } + + @Test + public void testMultipleFields() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").asc("data").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(1, "a"); + InternalRow row2 = createRow(1, "b"); + InternalRow row3 = createRow(2, "a"); + + // Same id, compare by data + assertThat(comparator.compare(row1, row2)).isLessThan(0); + assertThat(comparator.compare(row2, row1)).isGreaterThan(0); + + // Different id, id takes precedence + assertThat(comparator.compare(row1, row3)).isLessThan(0); + assertThat(comparator.compare(row3, row1)).isGreaterThan(0); + } + + @Test + public void testMixedSortDirections() { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").desc("data").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(1, "a"); + InternalRow row2 = createRow(1, "b"); + + // Same id, data is DESC: "b" < "a" + assertThat(comparator.compare(row1, row2)).isGreaterThan(0); + assertThat(comparator.compare(row2, row1)).isLessThan(0); + } + + @Test + public void testLongType() { + Schema schema = new Schema(Types.NestedField.required(1, "value", Types.LongType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("value").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(100L); + InternalRow row2 = createRow(200L); + + assertThat(comparator.compare(row1, row2)).isLessThan(0); + assertThat(comparator.compare(row2, row1)).isGreaterThan(0); + } + + @Test + public void testDoubleType() { + Schema schema = new Schema(Types.NestedField.required(1, "value", Types.DoubleType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("value").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow(1.5); + InternalRow row2 = createRow(2.5); + + assertThat(comparator.compare(row1, row2)).isLessThan(0); + assertThat(comparator.compare(row2, row1)).isGreaterThan(0); + } + + @Test + public void testStringType() { + Schema schema = new 
Schema(Types.NestedField.required(1, "name", Types.StringType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("name").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow row1 = createRow("alice"); + InternalRow row2 = createRow("bob"); + + assertThat(comparator.compare(row1, row2)).isLessThan(0); + assertThat(comparator.compare(row2, row1)).isGreaterThan(0); + } + + @Test + public void testBothNulls() { + Schema schema = new Schema(Types.NestedField.optional(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + Comparator comparator = new InternalRowComparator(sortOrder, sparkSchema, schema); + + InternalRow nullRow1 = createRow(new Object[] {null}); + InternalRow nullRow2 = createRow(new Object[] {null}); + + // Both null, should be equal + assertThat(comparator.compare(nullRow1, nullRow2)).isEqualTo(0); + } + + @Test + public void testUnsortedOrderThrows() { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder unsortedOrder = SortOrder.unsorted(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + assertThatThrownBy(() -> new InternalRowComparator(unsortedOrder, sparkSchema, schema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot create comparator for unsorted order"); + } + + // Helper methods to create test rows + private InternalRow createRow(Object... values) { + GenericInternalRow row = new GenericInternalRow(values.length); + for (int i = 0; i < values.length; i++) { + if (values[i] == null) { + row.setNullAt(i); + } else if (values[i] instanceof Integer) { + row.setInt(i, (Integer) values[i]); + } else if (values[i] instanceof Long) { + row.setLong(i, (Long) values[i]); + } else if (values[i] instanceof Double) { + row.setDouble(i, (Double) values[i]); + } else if (values[i] instanceof String) { + row.update(i, UTF8String.fromString((String) values[i])); + } else { + throw new IllegalArgumentException("Unsupported type: " + values[i].getClass()); + } + } + return row; + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMergingPartitionReader.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMergingPartitionReader.java new file mode 100644 index 000000000000..ff4ea5dcd141 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMergingPartitionReader.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.types.Types; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; +import org.apache.spark.sql.connector.read.PartitionReader; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; +import org.junit.jupiter.api.Test; + +public class TestMergingPartitionReader { + + @Test + public void testTwoSortedReaders() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [1, 3, 5] + PartitionReader reader1 = createMockReader(ImmutableList.of(1, 3, 5)); + + // Reader 2: [2, 4, 6] + PartitionReader reader2 = createMockReader(ImmutableList.of(2, 4, 6)); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(1, 2, 3, 4, 5, 6); + + mergingReader.close(); + } + + @Test + public void testThreeSortedReaders() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [1, 4, 7] + PartitionReader reader1 = createMockReader(ImmutableList.of(1, 4, 7)); + + // Reader 2: [2, 5, 8] + PartitionReader reader2 = createMockReader(ImmutableList.of(2, 5, 8)); + + // Reader 3: [3, 6, 9] + PartitionReader reader3 = createMockReader(ImmutableList.of(3, 6, 9)); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2, reader3), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(1, 2, 3, 4, 5, 6, 7, 8, 9); + + mergingReader.close(); + } + + @Test + public void testDescendingOrder() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).desc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [5, 3, 1] (descending) + PartitionReader reader1 = createMockReader(ImmutableList.of(5, 3, 1)); + + // Reader 2: [6, 4, 2] (descending) + PartitionReader reader2 = createMockReader(ImmutableList.of(6, 4, 2)); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(6, 5, 4, 3, 2, 1); + + mergingReader.close(); + } + + @Test + public void testDuplicateValues() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = 
SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [1, 2, 3] + PartitionReader reader1 = createMockReader(ImmutableList.of(1, 2, 3)); + + // Reader 2: [2, 3, 4] (has duplicates with reader1) + PartitionReader reader2 = createMockReader(ImmutableList.of(2, 3, 4)); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(1, 2, 2, 3, 3, 4); + + mergingReader.close(); + } + + @Test + public void testEmptyReader() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [1, 2, 3] + PartitionReader reader1 = createMockReader(ImmutableList.of(1, 2, 3)); + + // Reader 2: [] (empty) + PartitionReader reader2 = createMockReader(ImmutableList.of()); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(1, 2, 3); + + mergingReader.close(); + } + + @Test + public void testSingleReader() throws IOException { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + PartitionReader reader = createMockReader(ImmutableList.of(1, 2, 3)); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>(ImmutableList.of(reader), sortOrder, sparkSchema, schema); + + List results = readAll(mergingReader); + assertThat(results).containsExactly(1, 2, 3); + + mergingReader.close(); + } + + @Test + public void testMultipleFieldsSorting() throws IOException { + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").asc("data").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + // Reader 1: [(1, "a"), (2, "a")] + PartitionReader reader1 = + createMockReaderWithStrings(ImmutableList.of(row(1, "a"), row(2, "a"))); + + // Reader 2: [(1, "b"), (2, "b")] + PartitionReader reader2 = + createMockReaderWithStrings(ImmutableList.of(row(1, "b"), row(2, "b"))); + + MergingPartitionReader mergingReader = + new MergingPartitionReader<>( + ImmutableList.of(reader1, reader2), sortOrder, sparkSchema, schema); + + List results = readAllRows(mergingReader); + assertThat(results).hasSize(4); + assertThat(results.get(0).getInt(0)).isEqualTo(1); + assertThat(results.get(0).getUTF8String(1).toString()).isEqualTo("a"); + assertThat(results.get(1).getInt(0)).isEqualTo(1); + assertThat(results.get(1).getUTF8String(1).toString()).isEqualTo("b"); + assertThat(results.get(2).getInt(0)).isEqualTo(2); + assertThat(results.get(2).getUTF8String(1).toString()).isEqualTo("a"); + assertThat(results.get(3).getInt(0)).isEqualTo(2); + assertThat(results.get(3).getUTF8String(1).toString()).isEqualTo("b"); + + mergingReader.close(); + } + + @Test + public void testEmptyReadersListThrows() { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + 
SortOrder sortOrder = SortOrder.builderFor(schema).asc("id").build(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + assertThatThrownBy( + () -> new MergingPartitionReader<>(ImmutableList.of(), sortOrder, sparkSchema, schema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Readers cannot be empty"); + } + + @Test + public void testUnsortedOrderThrows() { + Schema schema = new Schema(Types.NestedField.required(1, "id", Types.IntegerType.get())); + SortOrder unsortedOrder = SortOrder.unsorted(); + StructType sparkSchema = SparkSchemaUtil.convert(schema); + + PartitionReader reader = createMockReader(ImmutableList.of(1, 2, 3)); + + assertThatThrownBy( + () -> + new MergingPartitionReader<>( + ImmutableList.of(reader), unsortedOrder, sparkSchema, schema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Sort order must be sorted"); + } + + private PartitionReader createMockReader(List values) { + return new MockPartitionReader( + values.stream().map(this::createSingleIntRow).collect(ImmutableList.toImmutableList())); + } + + private PartitionReader createMockReaderWithStrings(List rows) { + return new MockPartitionReader(rows); + } + + private InternalRow createSingleIntRow(int value) { + GenericInternalRow row = new GenericInternalRow(1); + row.setInt(0, value); + return row; + } + + private InternalRow row(int id, String data) { + GenericInternalRow row = new GenericInternalRow(2); + row.setInt(0, id); + row.update(1, UTF8String.fromString(data)); + return row; + } + + private List readAll(MergingPartitionReader reader) throws IOException { + List results = Lists.newArrayList(); + while (reader.next()) { + results.add(reader.get().getInt(0)); + } + return results; + } + + private List readAllRows(MergingPartitionReader reader) + throws IOException { + List results = Lists.newArrayList(); + while (reader.next()) { + // Copy the row because the reader may reuse the object + InternalRow current = reader.get(); + GenericInternalRow copy = new GenericInternalRow(current.numFields()); + for (int i = 0; i < current.numFields(); i++) { + if (!current.isNullAt(i)) { + if (i == 0) { + copy.setInt(i, current.getInt(i)); + } else { + copy.update(i, current.getUTF8String(i).clone()); + } + } + } + results.add(copy); + } + return results; + } + + private static class MockPartitionReader implements PartitionReader { + private final List rows; + private int index = -1; + + MockPartitionReader(List rows) { + this.rows = Lists.newArrayList(rows); + } + + @Override + public boolean next() { + index++; + return index < rows.size(); + } + + @Override + public InternalRow get() { + return rows.get(index); + } + + @Override + public void close() { + // No-op for mock + } + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 94547c2cf8fb..1c9fdd54b633 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -44,6 +44,7 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.exceptions.CommitStateUnknownException; @@ -154,6 +155,7 @@ public void testBasicWrite() { 
assertThat(file.splitOffsets()).as("Split offsets not present").isNotNull(); } assertThat(file.recordCount()).as("Should have reported record count as 1").isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(SortOrder.unsorted().orderId()); // TODO: append more metric info if (format.equals(FileFormat.PARQUET)) { assertThat(file.columnSizes()).as("Column sizes metric not present").isNotNull(); @@ -555,6 +557,55 @@ public void testViewsReturnRecentResults() { assertThat(actual2).hasSameSizeAs(expected2).isEqualTo(expected2); } + @TestTemplate + public void testWriteDataFilesInTableSortOrder() { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + String targetLocation = locationWithBranch(location); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + table.replaceSortOrder().asc("id").commit(); + + List expected = Lists.newArrayListWithCapacity(4000); + for (int i = 0; i < 4000; i++) { + expected.add(new SimpleRecord(i, "a")); + } + + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + createBranch(table); + table.refresh(); + + Dataset result = spark.read().format("iceberg").load(targetLocation); + + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + assertThat(actual).hasSameSizeAs(expected).isEqualTo(expected); + + List files = Lists.newArrayList(); + for (ManifestFile manifest : + SnapshotUtil.latestSnapshot(table, branch).allManifests(table.io())) { + for (DataFile file : ManifestFiles.read(manifest, table.io())) { + files.add(file); + } + } + + assertThat(files) + .extracting(DataFile::sortOrderId) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) { File parent = temp.resolve(format.toString()).toFile(); File location = new File(parent, "test"); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSupportsReportOrdering.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSupportsReportOrdering.java new file mode 100644 index 000000000000..2f098e5c512e --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSupportsReportOrdering.java @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Consumer; +import org.apache.commons.lang3.StringUtils; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.ReplaceSortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.spark.SparkSQLProperties; +import org.apache.iceberg.spark.TestBaseWithCatalog; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.execution.SortExec; +import org.apache.spark.sql.execution.SparkPlan; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestSupportsReportOrdering extends TestBaseWithCatalog { + + private static final Map ENABLED_ORDERING_SQL_CONF = orderingConfig(true); + private static final Map DISABLED_ORDERING_SQL_CONF = orderingConfig(false); + + private static Map orderingConfig(boolean preserveOrdering) { + return ImmutableMap.builder() + .put(SparkSQLProperties.PRESERVE_DATA_ORDERING, String.valueOf(preserveOrdering)) + .put(SparkSQLProperties.PRESERVE_DATA_GROUPING, "true") + .put("spark.sql.autoBroadcastJoinThreshold", "-1") + .put("spark.sql.adaptive.enabled", "false") + .put("spark.sql.sources.v2.bucketing.enabled", "true") + .put("spark.sql.sources.v2.bucketing.pushPartValues.enabled", "true") + .put("spark.sql.requireAllClusterKeysForCoPartition", "false") + .buildOrThrow(); + } + + @BeforeEach + public void useCatalog() { + sql("USE %s", catalogName); + } + + @AfterEach + public void removeTables() { + sql("DROP TABLE IF EXISTS %s", tableName); + sql("DROP TABLE IF EXISTS %s", tableName("table_source")); + spark.conf().unset(SparkSQLProperties.PRESERVE_DATA_ORDERING); + } + + @TestTemplate + public void testMergingMultipleSortedFiles() throws NoSuchTableException { + Table table = createSimpleTable(tableName); + setSortOrder(table, "id"); + + writeBatches( + tableName, + SimpleRecord.class, + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")), + ImmutableList.of(new SimpleRecord(3, "c"), new SimpleRecord(4, "d")), + ImmutableList.of(new SimpleRecord(5, "e"), new SimpleRecord(6, "f")), + ImmutableList.of(new SimpleRecord(7, "g"), new SimpleRecord(8, "h"))); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset result = + spark.sql(String.format("SELECT id, data FROM %s ORDER BY id", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows) + .hasSize(8) + .containsExactly( + row(1, "a"), + row(2, "b"), + row(3, "c"), + row(4, "d"), + row(5, "e"), + row(6, "f"), + row(7, "g"), + row(8, "h")); + } + + @TestTemplate + public void testDescendingSortOrder() throws NoSuchTableException { + Table table = createSimpleTable(tableName); + table.replaceSortOrder().desc("id").commit(); 
+ + writeBatches( + tableName, + SimpleRecord.class, + ImmutableList.of(new SimpleRecord(10, "j"), new SimpleRecord(9, "i")), + ImmutableList.of(new SimpleRecord(8, "h"), new SimpleRecord(7, "g")), + ImmutableList.of(new SimpleRecord(6, "f"), new SimpleRecord(4, "d"))); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset result = spark.sql(String.format("SELECT id FROM %s ORDER BY id DESC", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows).hasSize(6).containsExactly(row(10), row(9), row(8), row(7), row(6), row(4)); + } + + @TestTemplate + public void testMultiColumnSortOrder() throws NoSuchTableException { + Table table = createThreeColumnTable(tableName); + setSortOrder(table, "c3", "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "a", "A"), new ThreeColumnRecord(3, "c", "A")), + ImmutableList.of(new ThreeColumnRecord(2, "b", "A"), new ThreeColumnRecord(1, "a", "B")), + ImmutableList.of(new ThreeColumnRecord(2, "b", "B"), new ThreeColumnRecord(3, "c", "B"))); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset result = + spark.sql(String.format("SELECT c3, c1, c2 FROM %s ORDER BY c3, c1", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows) + .hasSize(6) + .containsExactly( + row("A", 1, "a"), + row("A", 2, "b"), + row("A", 3, "c"), + row("B", 1, "a"), + row("B", 2, "b"), + row("B", 3, "c")); + } + + @TestTemplate + public void testSingleFileDoesNotRequireMerging() throws NoSuchTableException { + Table table = createSimpleTable(tableName); + setSortOrder(table, "id"); + + List batch = ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); + spark.createDataFrame(batch, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset result = spark.sql(String.format("SELECT * FROM %s", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows).hasSize(2); + } + + @TestTemplate + public void testPartitionedTableWithMultipleFilesPerPartition() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + Table table = validationCatalog.loadTable(tableIdent); + setSortOrder(table, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "a", "P1"), new ThreeColumnRecord(3, "c", "P1")), + ImmutableList.of(new ThreeColumnRecord(2, "b", "P1"), new ThreeColumnRecord(4, "d", "P1")), + ImmutableList.of(new ThreeColumnRecord(5, "e", "P2"), new ThreeColumnRecord(7, "g", "P2")), + ImmutableList.of(new ThreeColumnRecord(6, "f", "P2"), new ThreeColumnRecord(8, "h", "P2"))); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset p1Result = + spark.sql(String.format("SELECT c1, c2 FROM %s WHERE c3 = 'P1' ORDER BY c1", tableName)); + List p1Rows = rowsToJava(p1Result.collectAsList()); + + assertThat(p1Rows) + .hasSize(4) + .containsExactly(row(1, "a"), row(2, "b"), row(3, "c"), row(4, "d")); + + Dataset p2Result = + spark.sql(String.format("SELECT c1, c2 FROM %s WHERE c3 = 'P2' ORDER BY c1", tableName)); + List p2Rows = rowsToJava(p2Result.collectAsList()); + + assertThat(p2Rows) + .hasSize(4) + .containsExactly(row(5, "e"), row(6, "f"), row(7, "g"), row(8, "h")); + } + + @TestTemplate + public void 
testOrderingNotReportedWhenDisabled() throws NoSuchTableException { + Table table = createSimpleTable(tableName); + setSortOrder(table, "id"); + + List batch = ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); + spark.createDataFrame(batch, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); + + spark.conf().unset(SparkSQLProperties.PRESERVE_DATA_ORDERING); + + Dataset result = spark.sql(String.format("SELECT * FROM %s", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows).hasSize(2); + } + + @TestTemplate + public void testOrderingNotReportedForUnsortedTable() throws NoSuchTableException { + createSimpleTable(tableName); + + List batch = ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); + spark.createDataFrame(batch, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); + + spark.conf().set(SparkSQLProperties.PRESERVE_DATA_ORDERING, "true"); + + Dataset result = spark.sql(String.format("SELECT * FROM %s", tableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows).hasSize(2); + } + + @TestTemplate + public void testSortRequiredWhenOrderingNotReported() throws NoSuchTableException { + Table table = createSimpleTable(tableName); + setSortOrder(table, "id"); + + writeBatches( + tableName, + SimpleRecord.class, + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")), + ImmutableList.of(new SimpleRecord(3, "c"), new SimpleRecord(4, "d"))); + + spark.conf().unset(SparkSQLProperties.PRESERVE_DATA_ORDERING); + + SparkPlan plan = + executeAndKeepPlan(String.format("SELECT id, data FROM %s ORDER BY id", tableName)); + + List sorts = collectPlans(plan, SortExec.class); + + assertThat(sorts).isNotEmpty(); + } + + @TestTemplate + public void testSortMergeJoinWithSortedTables() throws NoSuchTableException { + createBucketedTable(tableName, "c1"); + createBucketedTable(tableName("table_source"), "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "a", "X"), new ThreeColumnRecord(2, "b", "X")), + ImmutableList.of(new ThreeColumnRecord(3, "c", "X"), new ThreeColumnRecord(4, "d", "X"))); + + writeBatches( + tableName("table_source"), + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "A", "Y"), new ThreeColumnRecord(2, "B", "Y")), + ImmutableList.of(new ThreeColumnRecord(3, "C", "Y"), new ThreeColumnRecord(4, "D", "Y"))); + + assertPlanWithoutSort( + 0, + 2, + null, + "SELECT t1.c1, t1.c2, t2.c2 FROM %s t1 JOIN %s t2 ON t1.c1 = t2.c1", + tableName, + tableName("table_source")); + } + + @TestTemplate + public void testMergeWithSortedBucketedTables() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('write.merge.mode' = 'merge-on-read', '%s' = '%d')", + tableName, TableProperties.SPLIT_SIZE, 1024); + + Table targetTable = validationCatalog.loadTable(tableIdent); + setSortOrder(targetTable, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(1, "old1", "data1"), new ThreeColumnRecord(2, "old2", "data2")), + ImmutableList.of( + new ThreeColumnRecord(3, "old3", "data3"), new ThreeColumnRecord(4, "old4", "data4"))); + + String sourceTableName = tableName("table_source"); + TableIdentifier sourceTableIdent = TableIdentifier.of(Namespace.of("default"), "table_source"); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING 
iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + sourceTableName, TableProperties.SPLIT_SIZE, 1024); + + Table sourceTable = validationCatalog.loadTable(sourceTableIdent); + setSortOrder(sourceTable, "c1"); + + writeBatches( + sourceTableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(2, "new2", "data2"), new ThreeColumnRecord(3, "new3", "data3")), + ImmutableList.of( + new ThreeColumnRecord(5, "new5", "data5"), new ThreeColumnRecord(6, "new6", "data6"))); + + refreshTables(tableName, sourceTableName); + + validationCatalog.loadTable(tableIdent).refresh(); + validationCatalog.loadTable(sourceTableIdent).refresh(); + + assertPlanWithoutSort( + 1, + 3, + this::verifyMergeResults, + "MERGE INTO %s t USING %s s ON t.c1 = s.c1 " + + "WHEN MATCHED THEN UPDATE SET t.c2 = s.c2, t.c3 = s.c3 " + + "WHEN NOT MATCHED THEN INSERT *", + tableName, + sourceTableName); + } + + @TestTemplate + public void testHistoricalSortOrderInJoin() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + tableName, TableProperties.SPLIT_SIZE, 1024); + + Table table1 = validationCatalog.loadTable(tableIdent); + setSortOrder(table1, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "a", "X"), new ThreeColumnRecord(2, "b", "X")), + ImmutableList.of(new ThreeColumnRecord(3, "c", "X"), new ThreeColumnRecord(4, "d", "X"))); + + table1.replaceSortOrder().asc("c2").asc("c1").commit(); + + String table2Name = tableName("table_source"); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + table2Name, TableProperties.SPLIT_SIZE, 1024); + + TableIdentifier table2Ident = TableIdentifier.of(Namespace.of("default"), "table_source"); + Table table2 = validationCatalog.loadTable(table2Ident); + setSortOrder(table2, "c1"); + + writeBatches( + table2Name, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "A", "Y"), new ThreeColumnRecord(2, "B", "Y")), + ImmutableList.of(new ThreeColumnRecord(3, "C", "Y"), new ThreeColumnRecord(4, "D", "Y"))); + + table2.replaceSortOrder().asc("c2").asc("c1").commit(); + + // Both tables have files with historical sort order [c1 ASC] + // but current table sort order is [c2 ASC, c1 ASC] + assertPlanWithoutSort( + 2, + 2, + null, + "SELECT t1.c1, t1.c2, t2.c2 FROM %s t1 JOIN %s t2 ON t1.c1 = t2.c1", + tableName, + table2Name); + } + + @TestTemplate + public void testMixedSortOrdersNoReporting() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + tableName, TableProperties.SPLIT_SIZE, 1024); + + Table table1 = validationCatalog.loadTable(tableIdent); + setSortOrder(table1, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "a", "X"), new ThreeColumnRecord(2, "b", "X"))); + + table1.replaceSortOrder().asc("c2").asc("c1").commit(); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(3, "c", "X"), new ThreeColumnRecord(4, "d", "X"))); + + String table2Name = tableName("table_source"); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + 
"TBLPROPERTIES('%s' = '%d')", + table2Name, TableProperties.SPLIT_SIZE, 1024); + + TableIdentifier table2Ident = TableIdentifier.of(Namespace.of("default"), "table_source"); + Table table2 = validationCatalog.loadTable(table2Ident); + setSortOrder(table2, "c1"); + + writeBatches( + table2Name, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(1, "A", "Y"), new ThreeColumnRecord(2, "B", "Y"))); + + table2.replaceSortOrder().asc("c2").asc("c1").commit(); + + writeBatches( + table2Name, + ThreeColumnRecord.class, + ImmutableList.of(new ThreeColumnRecord(3, "C", "Y"), new ThreeColumnRecord(4, "D", "Y"))); + + assertPlanWithoutSort( + 2, + 2, + null, + "SELECT t1.c1, t1.c2, t2.c2 FROM %s t1 JOIN %s t2 ON t1.c1 = t2.c1", + tableName, + table2Name); + } + + @TestTemplate + public void testSPJWithDifferentPartitionAndSortKeys() throws NoSuchTableException { + createBucketedTable(tableName, "c3", "c1"); + createBucketedTable(tableName("table_source"), "c3", "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(1, "a", "2024-01-01"), + new ThreeColumnRecord(2, "b", "2024-01-02")), + ImmutableList.of( + new ThreeColumnRecord(1, "c", "2024-01-03"), + new ThreeColumnRecord(2, "d", "2024-01-04"))); + + writeBatches( + tableName("table_source"), + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(1, "A", "2024-01-01"), + new ThreeColumnRecord(2, "B", "2024-01-02")), + ImmutableList.of( + new ThreeColumnRecord(1, "C", "2024-01-03"), + new ThreeColumnRecord(2, "D", "2024-01-04"))); + + refreshTables(tableName, tableName("table_source")); + + assertPlanWithoutSort( + 0, + 2, + null, + "SELECT t1.c1, t1.c2, t2.c2 FROM %s t1 JOIN %s t2 ON t1.c3 = t2.c3 AND t1.c1 = t2.c1", + tableName, + tableName("table_source")); + } + + @TestTemplate + public void testHistoricalSortOrderInMerge() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('write.merge.mode' = 'merge-on-read', '%s' = '%d')", + tableName, TableProperties.SPLIT_SIZE, 1024); + + Table targetTable = validationCatalog.loadTable(tableIdent); + setSortOrder(targetTable, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(1, "old1", "data1"), new ThreeColumnRecord(2, "old2", "data2")), + ImmutableList.of( + new ThreeColumnRecord(3, "old3", "data3"), new ThreeColumnRecord(4, "old4", "data4"))); + + targetTable.replaceSortOrder().asc("c2").asc("c1").commit(); + + String sourceTableName = tableName("table_source"); + TableIdentifier sourceTableIdent = TableIdentifier.of(Namespace.of("default"), "table_source"); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + sourceTableName, TableProperties.SPLIT_SIZE, 1024); + + Table sourceTable = validationCatalog.loadTable(sourceTableIdent); + setSortOrder(sourceTable, "c1"); + + writeBatches( + sourceTableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(2, "new2", "data2"), new ThreeColumnRecord(3, "new3", "data3")), + ImmutableList.of( + new ThreeColumnRecord(5, "new5", "data5"), new ThreeColumnRecord(6, "new6", "data6"))); + + sourceTable.replaceSortOrder().asc("c2").asc("c1").commit(); + + refreshTables(tableName, sourceTableName); + + validationCatalog.loadTable(tableIdent).refresh(); + 
validationCatalog.loadTable(sourceTableIdent).refresh(); + + // Files have historical sort order [c1 ASC] but tables have [c2 ASC, c1 ASC] + assertPlanWithoutSort( + 3, + 3, + this::verifyMergeResults, + "MERGE INTO %s t USING %s s ON t.c1 = s.c1 " + + "WHEN MATCHED THEN UPDATE SET t.c2 = s.c2, t.c3 = s.c3 " + + "WHEN NOT MATCHED THEN INSERT *", + tableName, + sourceTableName); + } + + @TestTemplate + public void testMergeOnReadWithDeleteFiles() throws NoSuchTableException { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES(" + + " 'write.delete.mode' = 'merge-on-read', " + + " 'write.merge.mode' = 'merge-on-read', " + + " 'format-version' = '2', " + + " '%s' = '%d'" + + ")", + tableName, TableProperties.SPLIT_SIZE, 1024); + + Table targetTable = validationCatalog.loadTable(tableIdent); + setSortOrder(targetTable, "c1"); + + writeBatches( + tableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(1, "old1", "data1"), new ThreeColumnRecord(2, "old2", "data2")), + ImmutableList.of( + new ThreeColumnRecord(3, "old3", "data3"), new ThreeColumnRecord(4, "old4", "data4"))); + + sql("DELETE FROM %s WHERE c1 = 2", tableName); + + targetTable.refresh(); + long deleteFileCount = + Iterables.size(targetTable.currentSnapshot().addedDeleteFiles(targetTable.io())); + assertThat(deleteFileCount).isGreaterThan(0); + + String sourceTableName = tableName("table_source"); + TableIdentifier sourceTableIdent = TableIdentifier.of(Namespace.of("default"), "table_source"); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + sourceTableName, TableProperties.SPLIT_SIZE, 1024); + + Table sourceTable = validationCatalog.loadTable(sourceTableIdent); + setSortOrder(sourceTable, "c1"); + + writeBatches( + sourceTableName, + ThreeColumnRecord.class, + ImmutableList.of( + new ThreeColumnRecord(2, "new2", "data2"), new ThreeColumnRecord(3, "new3", "data3")), + ImmutableList.of( + new ThreeColumnRecord(5, "new5", "data5"), new ThreeColumnRecord(6, "new6", "data6"))); + + refreshTables(tableName, sourceTableName); + + validationCatalog.loadTable(tableIdent).refresh(); + validationCatalog.loadTable(sourceTableIdent).refresh(); + + assertPlanWithoutSort( + 1, + 3, + this::verifyMergeResults, + "MERGE INTO %s t USING %s s ON t.c1 = s.c1 " + + "WHEN MATCHED THEN UPDATE SET t.c2 = s.c2, t.c3 = s.c3 " + + "WHEN NOT MATCHED THEN INSERT *", + tableName, + sourceTableName); + } + + private Table createSimpleTable(String name) { + sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", name); + return validationCatalog.loadTable(tableIdent); + } + + private Table createThreeColumnTable(String name) { + sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) USING iceberg", name); + return validationCatalog.loadTable(tableIdent); + } + + private void createBucketedTable(String name, String... sortCols) { + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(4, c1)) " + + "TBLPROPERTIES('%s' = '%d')", + name, TableProperties.SPLIT_SIZE, 1024); + + TableIdentifier ident = + name.equals(tableName) + ? 
tableIdent + : TableIdentifier.of(Namespace.of("default"), "table_source"); + Table table = validationCatalog.loadTable(ident); + + if (sortCols.length > 0) { + ReplaceSortOrder sortOrder = table.replaceSortOrder(); + for (String col : sortCols) { + sortOrder = sortOrder.asc(col); + } + sortOrder.commit(); + } + } + + @SafeVarargs + private void writeBatches(String tableName, Class recordClass, List... batches) + throws NoSuchTableException { + for (List batch : batches) { + spark.createDataFrame(batch, recordClass).coalesce(1).writeTo(tableName).append(); + } + } + + private void setSortOrder(Table table, String... columns) { + ReplaceSortOrder sortOrder = table.replaceSortOrder(); + for (String col : columns) { + sortOrder = sortOrder.asc(col); + } + sortOrder.commit(); + } + + private void refreshTable(String table) { + sql("REFRESH TABLE %s", table); + } + + private void refreshTables(String... tables) { + for (String table : tables) { + refreshTable(table); + } + } + + private void verifyMergeResults(String targetTableName) { + Dataset result = + spark.sql(String.format("SELECT c1, c2, c3 FROM %s ORDER BY c1", targetTableName)); + List rows = rowsToJava(result.collectAsList()); + + assertThat(rows) + .hasSize(6) + .containsExactly( + row(1, "old1", "data1"), // unchanged + row(2, "new2", "data2"), // updated from source + row(3, "new3", "data3"), // updated from source + row(4, "old4", "data4"), // unchanged + row(5, "new5", "data5"), // inserted from source + row(6, "new6", "data6")); // inserted from source + } + + private void assertPlanWithoutSort( + int expectedNumSortsWithOrdering, + int expectedNumSortsWithoutOrdering, + Consumer dataVerification, + String query, + Object... args) { + + AtomicReference> rowsWithOrdering = new AtomicReference<>(); + AtomicReference> rowsWithoutOrdering = new AtomicReference<>(); + + Table targetTable = validationCatalog.loadTable(tableIdent); + long snapshotBeforeExecution = targetTable.currentSnapshot().snapshotId(); + + withSQLConf( + ENABLED_ORDERING_SQL_CONF, + () -> { + String plan = executeAndKeepPlan(query, args).toString(); + int actualNumSorts = StringUtils.countMatches(plan, "Sort ["); + assertThat(actualNumSorts) + .as("Number of sorts with enabled ordering must match") + .isEqualTo(expectedNumSortsWithOrdering); + + sql("REFRESH TABLE %s", tableName); + validationCatalog.loadTable(tableIdent).refresh(); + + if (dataVerification != null) { + dataVerification.accept(tableName); + } else { + rowsWithOrdering.set(sql(query, args)); + } + }); + + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", + catalogName, tableName, snapshotBeforeExecution); + + sql("REFRESH TABLE %s", tableName); + validationCatalog.loadTable(tableIdent).refresh(); + + withSQLConf( + DISABLED_ORDERING_SQL_CONF, + () -> { + String plan = executeAndKeepPlan(query, args).toString(); + int actualNumSorts = StringUtils.countMatches(plan, "Sort ["); + assertThat(actualNumSorts) + .as("Number of sorts with disabled ordering must match") + .isEqualTo(expectedNumSortsWithoutOrdering); + + sql("REFRESH TABLE %s", tableName); + validationCatalog.loadTable(tableIdent).refresh(); + if (dataVerification != null) { + dataVerification.accept(tableName); + } else { + rowsWithoutOrdering.set(sql(query, args)); + } + }); + + if (dataVerification == null) { + assertEquals( + "Sort elimination should not change query output", + rowsWithoutOrdering.get(), + rowsWithOrdering.get()); + } + } +}
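The MergingPartitionReader tests above (testTwoSortedReaders, testThreeSortedReaders, testDuplicateValues) all assume the same mechanism: each input reader is already sorted by the table sort order, and the merging reader repeatedly emits the smallest remaining head element across all inputs. The following is a minimal, self-contained sketch of that k-way merge over plain iterators, included only to illustrate the expected behaviour; it is not the MergingPartitionReader implementation under test, and the names KWayMerge, Entry, and merge are hypothetical.

import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.PriorityQueue;

final class KWayMerge {
  private KWayMerge() {}

  // Merges N inputs that are each individually sorted by the given comparator into one
  // globally sorted sequence. Duplicates coming from different inputs are all emitted.
  static <T> Iterator<T> merge(List<Iterator<T>> inputs, Comparator<T> comparator) {
    // Order queue entries by the head element currently buffered for each input.
    Comparator<Entry<T>> byHead =
        Comparator.comparing((Entry<T> entry) -> entry.head, comparator);
    PriorityQueue<Entry<T>> queue = new PriorityQueue<>(byHead);
    for (Iterator<T> input : inputs) {
      if (input.hasNext()) {
        queue.add(new Entry<>(input.next(), input));
      }
    }

    return new Iterator<T>() {
      @Override
      public boolean hasNext() {
        return !queue.isEmpty();
      }

      @Override
      public T next() {
        if (queue.isEmpty()) {
          throw new NoSuchElementException();
        }
        Entry<T> smallest = queue.poll();
        // Refill the queue from whichever input produced the element just emitted.
        if (smallest.rest.hasNext()) {
          queue.add(new Entry<>(smallest.rest.next(), smallest.rest));
        }
        return smallest.head;
      }
    };
  }

  private static final class Entry<T> {
    private final T head;
    private final Iterator<T> rest;

    private Entry(T head, Iterator<T> rest) {
      this.head = head;
      this.rest = rest;
    }
  }
}

Under an ascending comparator this merges [1, 3, 5] and [2, 4, 6] into [1, 2, 3, 4, 5, 6], keeps both copies of keys that appear in more than one input (matching testDuplicateValues), and degenerates to a pass-through when only one non-empty input is supplied (matching testSingleReader and testEmptyReader). An empty input list simply yields an empty iterator in this sketch, whereas the actual reader rejects it up front, as testEmptyReadersListThrows asserts.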