diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index 283697418b6b..509078c0fc4b 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -874,6 +874,15 @@ acceptedBreaks: justification: "Static utility class - should not have public constructor" "1.4.0": org.apache.iceberg:iceberg-core: + - code: "java.class.removed" + old: "class org.apache.iceberg.actions.BinPackStrategy" + justification: "Removed Deprecated Classes" + - code: "java.class.removed" + old: "class org.apache.iceberg.actions.SortStrategy" + justification: "Removed Deprecated Classes" + - code: "java.class.removed" + old: "interface org.apache.iceberg.actions.RewriteStrategy" + justification: "Removed Deprecated Classes" - code: "java.field.serialVersionUIDChanged" new: "field org.apache.iceberg.util.SerializableMap.serialVersionUID" justification: "Serialization is not be used" diff --git a/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java b/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java deleted file mode 100644 index 0da1f6063bd4..000000000000 --- a/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.math.RoundingMode; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.math.LongMath; -import org.apache.iceberg.util.BinPacking; -import org.apache.iceberg.util.BinPacking.ListPacker; -import org.apache.iceberg.util.PropertyUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A rewrite strategy for data files which determines which files to rewrite based on their size. If - * files are either smaller than the {@link #MIN_FILE_SIZE_BYTES} threshold or larger than the - * {@link #MAX_FILE_SIZE_BYTES} threshold, they are considered targets for being rewritten. - * - *

Once selected files are grouped based on a {@link BinPacking} into groups defined by {@link - * RewriteDataFiles#MAX_FILE_GROUP_SIZE_BYTES}. Groups will be considered for rewriting if they - * contain more files than {@link #MIN_INPUT_FILES} or would produce at least one file of {@link - * RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. - * - * @deprecated since 1.3.0, will be removed in 1.4.0; use {@link SizeBasedFileRewriter} instead. - * Note: This can only be removed once Spark 3.2 isn't using this API anymore. - */ -@Deprecated -public abstract class BinPackStrategy implements RewriteStrategy { - - private static final Logger LOG = LoggerFactory.getLogger(BinPackStrategy.class); - - /** - * The minimum number of files that need to be in a file group for it to be considered for - * compaction if the total size of that group is less than the {@link - * RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. This can also be thought of as the maximum number of - * non-target-size files that could remain in a file group (partition) after rewriting. - */ - public static final String MIN_INPUT_FILES = "min-input-files"; - - public static final int MIN_INPUT_FILES_DEFAULT = 5; - - /** - * Adjusts files which will be considered for rewriting. Files smaller than {@link - * #MIN_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently of {@link - * #MAX_FILE_SIZE_BYTES}. - * - *

Defaults to 75% of the target file size - */ - public static final String MIN_FILE_SIZE_BYTES = "min-file-size-bytes"; - - public static final double MIN_FILE_SIZE_DEFAULT_RATIO = 0.75d; - - /** - * Adjusts files which will be considered for rewriting. Files larger than {@link - * #MAX_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently of {@link - * #MIN_FILE_SIZE_BYTES}. - * - *

Defaults to 180% of the target file size - */ - public static final String MAX_FILE_SIZE_BYTES = "max-file-size-bytes"; - - public static final double MAX_FILE_SIZE_DEFAULT_RATIO = 1.80d; - - /** - * The minimum number of deletes that needs to be associated with a data file for it to be - * considered for rewriting. If a data file has this number of deletes or more, it will be - * rewritten regardless of its file size determined by {@link #MIN_FILE_SIZE_BYTES} and {@link - * #MAX_FILE_SIZE_BYTES}. If a file group contains a file that satisfies this condition, the file - * group will be rewritten regardless of the number of files in the file group determined by - * {@link #MIN_INPUT_FILES} - * - *

Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. - */ - public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; - - public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; - - static final long SPLIT_OVERHEAD = 1024 * 5; - - /** - * Rewrites all files, regardless of their size. Defaults to false, rewriting only mis-sized - * files; - */ - public static final String REWRITE_ALL = "rewrite-all"; - - public static final boolean REWRITE_ALL_DEFAULT = false; - - private int minInputFiles; - private int deleteFileThreshold; - private long minFileSize; - private long maxFileSize; - private long targetFileSize; - private long maxGroupSize; - private boolean rewriteAll; - - @Override - public String name() { - return "BINPACK"; - } - - @Override - public Set validOptions() { - return ImmutableSet.of( - MIN_INPUT_FILES, - DELETE_FILE_THRESHOLD, - MIN_FILE_SIZE_BYTES, - MAX_FILE_SIZE_BYTES, - REWRITE_ALL); - } - - @Override - public RewriteStrategy options(Map options) { - targetFileSize = - PropertyUtil.propertyAsLong( - options, - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, - PropertyUtil.propertyAsLong( - table().properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT)); - - minFileSize = - PropertyUtil.propertyAsLong( - options, MIN_FILE_SIZE_BYTES, (long) (targetFileSize * MIN_FILE_SIZE_DEFAULT_RATIO)); - - maxFileSize = - PropertyUtil.propertyAsLong( - options, MAX_FILE_SIZE_BYTES, (long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); - - maxGroupSize = - PropertyUtil.propertyAsLong( - options, - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES_DEFAULT); - - minInputFiles = PropertyUtil.propertyAsInt(options, MIN_INPUT_FILES, MIN_INPUT_FILES_DEFAULT); - - deleteFileThreshold = - PropertyUtil.propertyAsInt(options, DELETE_FILE_THRESHOLD, DELETE_FILE_THRESHOLD_DEFAULT); - - rewriteAll = PropertyUtil.propertyAsBoolean(options, REWRITE_ALL, REWRITE_ALL_DEFAULT); - - validateOptions(); - return this; - } - - @Override - public Iterable selectFilesToRewrite(Iterable dataFiles) { - if (rewriteAll) { - LOG.info("Table {} set to rewrite all data files", table().name()); - return dataFiles; - } else { - return FluentIterable.from(dataFiles) - .filter( - scanTask -> - scanTask.length() < minFileSize - || scanTask.length() > maxFileSize - || taskHasTooManyDeletes(scanTask)); - } - } - - @Override - public Iterable> planFileGroups(Iterable dataFiles) { - ListPacker packer = new BinPacking.ListPacker<>(maxGroupSize, 1, false); - List> potentialGroups = packer.pack(dataFiles, FileScanTask::length); - if (rewriteAll) { - return potentialGroups; - } else { - return potentialGroups.stream() - .filter( - group -> - (group.size() >= minInputFiles && group.size() > 1) - || (sizeOfInputFiles(group) > targetFileSize && group.size() > 1) - || sizeOfInputFiles(group) > maxFileSize - || group.stream().anyMatch(this::taskHasTooManyDeletes)) - .collect(Collectors.toList()); - } - } - - protected long targetFileSize() { - return this.targetFileSize; - } - - /** - * Determine how many output files to create when rewriting. We use this to determine the - * split-size we want to use when actually writing files to avoid the following situation. - * - *

If we are writing 10.1 G of data with a target file size of 1G we would end up with 11 - * files, one of which would only have 0.1g. This would most likely be less preferable to 10 files - * each of which was 1.01g. So here we decide whether to round up or round down based on what the - * estimated average file size will be if we ignore the remainder (0.1g). If the new file size is - * less than 10% greater than the target file size then we will round down when determining the - * number of output files. - * - * @param totalSizeInBytes total data size for a file group - * @return the number of files this strategy should create - */ - protected long numOutputFiles(long totalSizeInBytes) { - if (totalSizeInBytes < targetFileSize) { - return 1; - } - - long fileCountWithRemainder = - LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.CEILING); - if (LongMath.mod(totalSizeInBytes, targetFileSize) > minFileSize) { - // Our Remainder file is of valid size for this compaction so keep it - return fileCountWithRemainder; - } - - long fileCountWithoutRemainder = - LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.FLOOR); - long avgFileSizeWithoutRemainder = totalSizeInBytes / fileCountWithoutRemainder; - if (avgFileSizeWithoutRemainder < Math.min(1.1 * targetFileSize, writeMaxFileSize())) { - // Round down and distribute remainder amongst other files - return fileCountWithoutRemainder; - } else { - // Keep the remainder file - return fileCountWithRemainder; - } - } - - /** - * Returns the smallest of our max write file threshold, and our estimated split size based on the - * number of output files we want to generate. Add a overhead onto the estimated splitSize to try - * to avoid small errors in size creating brand-new files. - */ - protected long splitSize(long totalSizeInBytes) { - long estimatedSplitSize = - (totalSizeInBytes / numOutputFiles(totalSizeInBytes)) + SPLIT_OVERHEAD; - return Math.min(estimatedSplitSize, writeMaxFileSize()); - } - - protected long inputFileSize(List fileToRewrite) { - return fileToRewrite.stream().mapToLong(FileScanTask::length).sum(); - } - - /** - * Estimates a larger max target file size than our target size used in task creation to avoid - * tasks which are predicted to have a certain size, but exceed that target size when serde is - * complete creating tiny remainder files. - * - *

While we create tasks that should all be smaller than our target size there is a chance that - * the actual data will end up being larger than our target size due to various factors of - * compression, serialization and other factors outside our control. If this occurs, instead of - * making a single file that is close in size to our target we would end up producing one file of - * the target size, and then a small extra file with the remaining data. For example, if our - * target is 512 MB we may generate a rewrite task that should be 500 MB. When we write the data - * we may find we actually have to write out 530 MB. If we use the target size while writing we - * would produced a 512 MB file and a 18 MB file. If instead we use a larger size estimated by - * this method, then we end up writing a single file. - * - * @return the target size plus one half of the distance between max and target - */ - protected long writeMaxFileSize() { - return (long) (targetFileSize + ((maxFileSize - targetFileSize) * 0.5)); - } - - private long sizeOfInputFiles(List group) { - return group.stream().mapToLong(FileScanTask::length).sum(); - } - - private boolean taskHasTooManyDeletes(FileScanTask task) { - return task.deletes() != null && task.deletes().size() >= deleteFileThreshold; - } - - private void validateOptions() { - Preconditions.checkArgument( - minFileSize >= 0, - "Cannot set %s to a negative number, %s < 0", - MIN_FILE_SIZE_BYTES, - minFileSize); - - Preconditions.checkArgument( - maxFileSize > minFileSize, - "Cannot set %s greater than or equal to %s, %s >= %s", - MIN_FILE_SIZE_BYTES, - MAX_FILE_SIZE_BYTES, - minFileSize, - maxFileSize); - - Preconditions.checkArgument( - targetFileSize > minFileSize, - "Cannot set %s greater than or equal to %s, all files written will be smaller than the threshold, %s >= %s", - MIN_FILE_SIZE_BYTES, - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, - minFileSize, - targetFileSize); - - Preconditions.checkArgument( - targetFileSize < maxFileSize, - "Cannot set %s is greater than or equal to %s, all files written will be larger than the threshold, %s >= %s", - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, - MAX_FILE_SIZE_BYTES, - targetFileSize, - maxFileSize); - - Preconditions.checkArgument( - minInputFiles > 0, - "Cannot set %s is less than 1. All values less than 1 have the same effect as 1. %s < 1", - MIN_INPUT_FILES, - minInputFiles); - - Preconditions.checkArgument( - deleteFileThreshold > 0, - "Cannot set %s is less than 1. All values less than 1 have the same effect as 1. %s < 1", - DELETE_FILE_THRESHOLD, - deleteFileThreshold); - } -} diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java b/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java deleted file mode 100644 index ed0f7995789e..000000000000 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.io.Serializable; -import java.util.List; -import java.util.Map; -import java.util.Set; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Table; - -/** - * A strategy for rewriting files. - * - * @deprecated since 1.3.0, will be removed in 1.4.0; use {@link FileRewriter} instead. Note: This - * can only be removed once Spark 3.2 isn't using this API anymore. - */ -@Deprecated -public interface RewriteStrategy extends Serializable { - /** Returns the name of this rewrite strategy */ - String name(); - - /** Returns the table being modified by this rewrite strategy */ - Table table(); - - /** - * Returns a set of options which this rewrite strategy can use. This is an allowed-list and any - * options not specified here will be rejected at runtime. - */ - Set validOptions(); - - /** Sets options to be used with this strategy */ - RewriteStrategy options(Map options); - - /** - * Selects files which this strategy believes are valid targets to be rewritten. - * - * @param dataFiles iterable of FileScanTasks for files in a given partition - * @return iterable containing only FileScanTasks to be rewritten - */ - Iterable selectFilesToRewrite(Iterable dataFiles); - - /** - * Groups file scans into lists which will be processed in a single executable unit. Each group - * will end up being committed as an independent set of changes. This creates the jobs which will - * eventually be run as by the underlying Action. - * - * @param dataFiles iterable of FileScanTasks to be rewritten - * @return iterable of lists of FileScanTasks which will be processed together - */ - Iterable> planFileGroups(Iterable dataFiles); - - /** - * Method which will rewrite files based on this particular RewriteStrategy's algorithm. This will - * most likely be Action framework specific (Spark/Presto/Flink ....). - * - * @param filesToRewrite a group of files to be rewritten together - * @return a set of newly written files - */ - Set rewriteFiles(List filesToRewrite); -} diff --git a/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java b/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java deleted file mode 100644 index 00a0704eda76..000000000000 --- a/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.util.Map; -import java.util.Set; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.util.SortOrderUtil; - -/** - * A rewrite strategy for data files which aims to reorder data with data files to optimally lay - * them out in relation to a column. For example, if the Sort strategy is used on a set of files - * which is ordered by column x and original has files File A (x: 0 - 50), File B ( x: 10 - 40) and - * File C ( x: 30 - 60), this Strategy will attempt to rewrite those files into File A' (x: 0-20), - * File B' (x: 21 - 40), File C' (x: 41 - 60). - * - *

Currently the there is no file overlap detection and we will rewrite all files if {@link - * SortStrategy#REWRITE_ALL} is true (default: false). If this property is disabled any files that - * would be chosen by {@link BinPackStrategy} will be rewrite candidates. - * - *

In the future other algorithms for determining files to rewrite will be provided. - * - * @deprecated since 1.3.0, will be removed in 1.4.0; use {@link SizeBasedFileRewriter} instead. - * Note: This can only be removed once Spark 3.2 isn't using this API anymore. - */ -@Deprecated -public abstract class SortStrategy extends BinPackStrategy { - - private SortOrder sortOrder; - - /** - * Sets the sort order to be used in this strategy when rewriting files - * - * @param order the order to use - * @return this for method chaining - */ - public SortStrategy sortOrder(SortOrder order) { - Preconditions.checkArgument(!order.isUnsorted(), "Cannot set strategy sort order: unsorted"); - this.sortOrder = SortOrderUtil.buildSortOrder(table(), order); - return this; - } - - protected SortOrder sortOrder() { - return sortOrder; - } - - @Override - public String name() { - return "SORT"; - } - - @Override - public Set validOptions() { - return ImmutableSet.builder().addAll(super.validOptions()).build(); - } - - @Override - public RewriteStrategy options(Map options) { - super.options(options); // Also checks validity of BinPack options - - if (sortOrder == null) { - sortOrder = table().sortOrder(); - } - - validateOptions(); - return this; - } - - protected void validateOptions() { - Preconditions.checkArgument( - !sortOrder.isUnsorted(), - "Can't use %s when there is no sort order, either define table %s's sort order or set sort" - + "order in the action", - name(), - table().name()); - - SortOrder.checkCompatibility(sortOrder, table().schema()); - } -} diff --git a/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java b/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java deleted file mode 100644 index 07f0b84439c5..000000000000 --- a/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MockFileScanTask; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestBinPackStrategy extends TableTestBase { - - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {2}; // We don't actually use the format version since everything is mock - } - - private static final long MB = 1024 * 1024; - - public TestBinPackStrategy(int formatVersion) { - super(formatVersion); - } - - class TestBinPackStrategyImpl extends BinPackStrategy { - - @Override - public Table table() { - return table; - } - - @Override - public Set rewriteFiles(List filesToRewrite) { - throw new UnsupportedOperationException(); - } - } - - private List filesOfSize(long... sizes) { - return Arrays.stream(sizes) - .mapToObj(size -> new MockFileScanTask(size * MB)) - .collect(Collectors.toList()); - } - - private RewriteStrategy defaultBinPack() { - return new TestBinPackStrategyImpl().options(Collections.emptyMap()); - } - - @Test - public void testFilteringAllValid() { - RewriteStrategy strategy = defaultBinPack(); - - Iterable testFiles = filesOfSize(100, 100, 100, 100, 1000); - Iterable filtered = - ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - - Assert.assertEquals("No files should be removed from the set", testFiles, filtered); - } - - @Test - public void testFilteringRemoveInvalid() { - RewriteStrategy strategy = defaultBinPack(); - - Iterable testFiles = filesOfSize(500, 500, 500, 600, 600); - Iterable filtered = - ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - - Assert.assertEquals( - "All files should be removed from the set", Collections.emptyList(), filtered); - } - - @Test - public void testFilteringCustomMinMaxFileSize() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB))); - - Iterable testFiles = filesOfSize(500, 500, 480, 480, 560, 520); - Iterable expectedFiles = filesOfSize(480, 480, 560); - Iterable filtered = - ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - - Assert.assertEquals( - "Should remove files that exceed or are smaller than new bounds", expectedFiles, filtered); - } - - @Test - public void testFilteringWithDeletes() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); - - List testFiles = filesOfSize(500, 500, 480, 480, 560, 520); - testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); - Iterable expectedFiles = filesOfSize(480, 480, 560, 500); - Iterable filtered = - ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - - Assert.assertEquals("Should include file with deletes", expectedFiles, filtered); - } - - @Test - public void testGroupingMinInputFilesInvalid() { - RewriteStrategy strategy = - defaultBinPack() - .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); - - Iterable testFiles = filesOfSize(1, 1, 1, 1); - - Iterable> grouped = strategy.planFileGroups(testFiles); - - Assert.assertEquals("Should plan 0 groups, not enough input files", 0, Iterables.size(grouped)); - } - - @Test - public void testGroupingMinInputFilesAsOne() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(1), - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(3 * MB), - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(2 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); - - Iterable testFiles1 = filesOfSize(1); - Iterable> grouped1 = strategy.planFileGroups(testFiles1); - - Assert.assertEquals( - "Should plan 0 groups, 1 file is too small but no deletes are present so rewriting is " - + "a NOOP", - 0, - Iterables.size(grouped1)); - - Iterable testFiles2 = filesOfSize(4); - Iterable> grouped2 = strategy.planFileGroups(testFiles2); - - Assert.assertEquals( - "Should plan 1 group because the file present is larger than maxFileSize and can be " - + "split", - 1, - Iterables.size(grouped2)); - - List testFiles3 = Lists.newArrayList(); - testFiles3.add(MockFileScanTask.mockTaskWithDeletes(MB, 2)); - Iterable> grouped3 = strategy.planFileGroups(testFiles3); - Assert.assertEquals( - "Should plan 1 group, the data file has delete files and can be re-written without " - + "deleted row", - 1, - Iterables.size(grouped3)); - } - - @Test - public void testGroupWithLargeFileMinInputFiles() { - RewriteStrategy strategy = - defaultBinPack() - .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); - - Iterable testFiles1 = filesOfSize(2000); - Iterable> grouped1 = strategy.planFileGroups(testFiles1); - - Assert.assertEquals( - "Should plan 1 group, not enough input files but the input file exceeds our max" - + "and can be written into at least one new target-file-size files", - ImmutableList.of(testFiles1), - grouped1); - - Iterable testFiles2 = filesOfSize(500, 500, 500); - Iterable> grouped2 = strategy.planFileGroups(testFiles2); - - Assert.assertEquals( - "Should plan 1 group, not enough input files but the sum of file sizes exceeds " - + "target-file-size and files within the group is greater than 1", - ImmutableList.of(testFiles2), - grouped2); - - Iterable testFiles3 = filesOfSize(10, 10, 10); - Iterable> grouped3 = strategy.planFileGroups(testFiles3); - - Assert.assertEquals( - "Should plan 0 groups, not enough input files and the sum of file sizes does not " - + "exceeds target-file-size and files within the group is greater than 1", - ImmutableList.of(), - grouped3); - } - - @Test - public void testGroupingMinInputFilesValid() { - RewriteStrategy strategy = - defaultBinPack() - .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); - - Iterable testFiles = filesOfSize(1, 1, 1, 1, 1); - - Iterable> grouped = strategy.planFileGroups(testFiles); - - Assert.assertEquals( - "Should plan 1 groups since there are enough input files", - ImmutableList.of(testFiles), - grouped); - } - - @Test - public void testGroupingWithDeletes() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5), - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); - - List testFiles = Lists.newArrayList(); - testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); - Iterable> grouped = strategy.planFileGroups(testFiles); - - Assert.assertEquals( - "Should plan 1 groups since there are enough input files", - ImmutableList.of(testFiles), - grouped); - } - - @Test - public void testMaxGroupSize() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Long.toString(1000 * MB))); - - Iterable testFiles = filesOfSize(300, 300, 300, 300, 300, 300); - - Iterable> grouped = strategy.planFileGroups(testFiles); - - Assert.assertEquals( - "Should plan 2 groups since there is enough data for two groups", - 2, - Iterables.size(grouped)); - } - - @Test - public void testNumOuputFiles() { - BinPackStrategy strategy = (BinPackStrategy) defaultBinPack(); - long targetFileSize = strategy.targetFileSize(); - Assert.assertEquals( - "Should keep remainder if the remainder is a valid size", - 2, - strategy.numOutputFiles(targetFileSize + 450 * MB)); - Assert.assertEquals( - "Should discard remainder file if the remainder is very small", - 1, - strategy.numOutputFiles(targetFileSize + 40 * MB)); - Assert.assertEquals( - "Should keep remainder file if it would change average file size greatly", - 2, - strategy.numOutputFiles((long) (targetFileSize + 0.40 * targetFileSize))); - Assert.assertEquals( - "Should discard remainder if file is small and wouldn't change average that much", - 200, - strategy.numOutputFiles(200 * targetFileSize + 13 * MB)); - Assert.assertEquals( - "Should keep remainder if it's a valid size", - 201, - strategy.numOutputFiles(200 * targetFileSize + 499 * MB)); - Assert.assertEquals( - "Should not return 0 even for very small files", 1, strategy.numOutputFiles(1)); - } - - @Test - public void testInvalidOptions() { - Assertions.assertThatThrownBy( - () -> - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(1 * MB)))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot set min-file-size-bytes greater than or equal to max-file-size-bytes"); - - Assertions.assertThatThrownBy( - () -> - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(1000 * MB)))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot set min-file-size-bytes greater than or equal to max-file-size-bytes"); - - Assertions.assertThatThrownBy( - () -> - defaultBinPack() - .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Long.toString(-5)))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot set min-input-files is less than 1. All values less than 1 have the same effect as 1"); - - Assertions.assertThatThrownBy( - () -> - defaultBinPack() - .options( - ImmutableMap.of(BinPackStrategy.DELETE_FILE_THRESHOLD, Long.toString(-5)))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith( - "Cannot set delete-file-threshold is less than 1. All values less than 1 have the same effect as 1"); - - Assertions.assertThatThrownBy( - () -> - defaultBinPack() - .options( - ImmutableMap.of( - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(-5)))) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageStartingWith("Cannot set min-file-size-bytes to a negative number"); - } - - @Test - public void testRewriteAllSelectFilesToRewrite() { - RewriteStrategy strategy = - defaultBinPack().options(ImmutableMap.of(BinPackStrategy.REWRITE_ALL, "true")); - - Iterable testFiles = filesOfSize(500, 500, 480, 480, 560, 520); - Iterable expectedFiles = filesOfSize(500, 500, 480, 480, 560, 520); - Iterable filtered = - ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - Assert.assertEquals("Should rewrite all files", expectedFiles, filtered); - } - - @Test - public void testRewriteAllPlanFileGroups() { - RewriteStrategy strategy = - defaultBinPack() - .options( - ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, - Integer.toString(5), - BinPackStrategy.REWRITE_ALL, - "true")); - - Iterable testFiles = filesOfSize(1, 1, 1, 1); - Iterable> grouped = strategy.planFileGroups(testFiles); - - Assert.assertEquals("Should plan 1 group to rewrite all files", 1, Iterables.size(grouped)); - } -} diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java b/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java deleted file mode 100644 index 57c1e066b1f3..000000000000 --- a/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.stream.IntStream; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MockFileScanTask; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableTestBase; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -@RunWith(Parameterized.class) -public class TestSortStrategy extends TableTestBase { - - @Parameterized.Parameters(name = "formatVersion = {0}") - public static Object[] parameters() { - return new Object[] {2}; // We don't actually use the format version since everything is mock - } - - @Override - public void setupTable() throws Exception { - super.setupTable(); - table.replaceSortOrder().asc("data").commit(); - } - - private static final long MB = 1024 * 1024; - - public TestSortStrategy(int formatVersion) { - super(formatVersion); - } - - class TestSortStrategyImpl extends SortStrategy { - - @Override - public Table table() { - return table; - } - - @Override - public Set rewriteFiles(List filesToRewrite) { - throw new UnsupportedOperationException(); - } - } - - private SortStrategy defaultSort() { - return (SortStrategy) new TestSortStrategyImpl().options(Collections.emptyMap()); - } - - private List tasksForSortOrder(int sortOrderId, int... fileSizesMB) { - ImmutableList.Builder files = ImmutableList.builder(); - IntStream.of(fileSizesMB) - .forEach(length -> files.add(MockFileScanTask.mockTask(length * MB, sortOrderId))); - return files.build(); - } - - @Test - public void testInvalidSortOrder() { - Assertions.assertThatThrownBy( - () -> defaultSort().sortOrder(SortOrder.unsorted()).options(Collections.emptyMap())) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot set strategy sort order: unsorted"); - - Assertions.assertThatThrownBy( - () -> { - Schema badSchema = - new Schema( - ImmutableList.of( - Types.NestedField.required(0, "nonexistant", Types.IntegerType.get()))); - - defaultSort() - .sortOrder(SortOrder.builderFor(badSchema).asc("nonexistant").build()) - .options(Collections.emptyMap()); - }) - .isInstanceOf(ValidationException.class) - .hasMessageStartingWith("Cannot find field 'nonexistant' in struct"); - } - - @Test - public void testSelectAll() { - List invalid = - ImmutableList.builder() - .addAll(tasksForSortOrder(-1, 500, 500, 500, 500)) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 10, 10, 2000, 10)) - .build(); - - List expected = - ImmutableList.builder() - .addAll(invalid) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 490, 520)) - .build(); - - RewriteStrategy strategy = - defaultSort().options(ImmutableMap.of(SortStrategy.REWRITE_ALL, "true")); - List actual = ImmutableList.copyOf(strategy.selectFilesToRewrite(expected)); - - Assert.assertEquals("Should mark all files for rewrite", expected, actual); - } - - @Test - public void testUseSizeOptions() { - List expected = - ImmutableList.builder() - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 498, 551)) - .build(); - - List fileScanTasks = - ImmutableList.builder() - .addAll(expected) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 500)) - .build(); - - RewriteStrategy strategy = - defaultSort() - .options( - ImmutableMap.of( - SortStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - SortStrategy.MIN_FILE_SIZE_BYTES, Long.toString(499 * MB))); - - List actual = ImmutableList.copyOf(strategy.selectFilesToRewrite(fileScanTasks)); - - Assert.assertEquals( - "Should mark files for rewrite with adjusted min and max size", expected, actual); - } -} diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index eaef8e0bccaa..b08c35281905 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -35,7 +35,7 @@ import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.BinPackStrategy; +import org.apache.iceberg.actions.SizeBasedFileRewriter; import org.apache.iceberg.relocated.com.google.common.io.Files; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -105,7 +105,7 @@ public void cleanUpIteration() throws IOException { public void sortInt() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -118,7 +118,7 @@ public void sortInt() { public void sortInt2() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -132,7 +132,7 @@ public void sortInt2() { public void sortInt3() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -148,7 +148,7 @@ public void sortInt3() { public void sortInt4() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -164,7 +164,7 @@ public void sortInt4() { public void sortString() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -177,7 +177,7 @@ public void sortString() { public void sortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -193,7 +193,7 @@ public void sortFourColumns() { public void sortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -211,7 +211,7 @@ public void sortSixColumns() { public void zSortInt() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("intCol") .execute(); } @@ -221,7 +221,7 @@ public void zSortInt() { public void zSortInt2() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("intCol", "intCol2") .execute(); } @@ -231,7 +231,7 @@ public void zSortInt2() { public void zSortInt3() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("intCol", "intCol2", "intCol3") .execute(); } @@ -241,7 +241,7 @@ public void zSortInt3() { public void zSortInt4() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("intCol", "intCol2", "intCol3", "intCol4") .execute(); } @@ -251,7 +251,7 @@ public void zSortInt4() { public void zSortString() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("stringCol") .execute(); } @@ -261,7 +261,7 @@ public void zSortString() { public void zSortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("stringCol", "intCol", "dateCol", "doubleCol") .execute(); } @@ -271,7 +271,7 @@ public void zSortFourColumns() { public void zSortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(BinPackStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .zOrder("stringCol", "intCol", "dateCol", "timestampCol", "doubleCol", "longCol") .execute(); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index c5bf7548c684..f17e8c6ed89f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -63,12 +63,12 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; -import org.apache.iceberg.actions.BinPackStrategy; import org.apache.iceberg.actions.RewriteDataFiles; import org.apache.iceberg.actions.RewriteDataFiles.Result; import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.actions.SortStrategy; +import org.apache.iceberg.actions.SizeBasedDataRewriter; +import org.apache.iceberg.actions.SizeBasedFileRewriter; import org.apache.iceberg.data.GenericAppenderFactory; import org.apache.iceberg.data.Record; import org.apache.iceberg.deletes.PositionDelete; @@ -139,7 +139,7 @@ public void setupTableLocation() throws Exception { private RewriteDataFilesSparkAction basicRewrite(Table table) { // Always compact regardless of input files table.refresh(); - return actions().rewriteDataFiles(table).option(BinPackStrategy.MIN_INPUT_FILES, "1"); + return actions().rewriteDataFiles(table).option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1"); } @Test @@ -227,9 +227,10 @@ public void testBinPackAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) - .option(SortStrategy.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") .option( - SortStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) + SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) + 1000)) .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1001)) @@ -279,10 +280,10 @@ public void testBinPackWithDeletes() throws Exception { actions() .rewriteDataFiles(table) // do not include any file based on bin pack file size configs - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") + .option(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, "0") .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") + .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "2") .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); assertThat(result.rewrittenBytesCount()).isGreaterThan(0L).isLessThan(dataSizeBefore); @@ -315,7 +316,7 @@ public void testBinPackWithDeleteAllData() { Result result = actions() .rewriteDataFiles(table) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") + .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "1") .execute(); Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount()); assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); @@ -454,7 +455,7 @@ public void testBinPackSplitLargeFile() { Result result = basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) .execute(); Assert.assertEquals("Action should delete 1 data files", 1, result.rewrittenDataFilesCount()); @@ -485,8 +486,8 @@ public void testBinPackCombineMixedFiles() { Result result = basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) + .option(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount()); @@ -514,9 +515,11 @@ public void testBinPackCombineMediumFiles() { Result result = basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) .option( - BinPackStrategy.MIN_FILE_SIZE_BYTES, + SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, + Integer.toString((int) (targetSize * 1.8))) + .option( + SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small .execute(); @@ -574,7 +577,7 @@ public void testMultipleGroups() { basicRewrite(table) .option( RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(BinPackStrategy.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 10 fileGroups", result.rewriteResults().size(), 10); @@ -897,7 +900,7 @@ public void testSortMultipleGroups() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .option( RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); @@ -927,8 +930,8 @@ public void testSimpleSort() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SortStrategy.MIN_INPUT_FILES, "1") - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -961,8 +964,8 @@ public void testSortAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SortStrategy.MIN_INPUT_FILES, "1") - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -996,7 +999,7 @@ public void testSortCustomSortOrder() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -1034,7 +1037,7 @@ public void testSortCustomSortOrderRequiresRepartition() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) @@ -1068,13 +1071,13 @@ public void testAutoSortShuffleOutput() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option( - SortStrategy.MAX_FILE_SIZE_BYTES, + SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) - .option(SortStrategy.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -1147,13 +1150,13 @@ public void testZOrderSort() { basicRewrite(table) .zOrder("c2", "c3") .option( - SortStrategy.MAX_FILE_SIZE_BYTES, + SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) - .option(SortStrategy.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", 1, result.rewriteResults().size()); @@ -1208,8 +1211,8 @@ public void testZOrderAllTypesSort() { "stringCol", "binaryCol", "booleanCol") - .option(SortStrategy.MIN_INPUT_FILES, "1") - .option(SortStrategy.REWRITE_ALL, "true") + .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewriter.REWRITE_ALL, "true") .execute(); Assert.assertEquals("Should have 1 fileGroups", 1, result.rewriteResults().size());