Skip to content

Commit a044d16

Browse files
committed
Allow removing unused deletes when planning
1 parent 6b28b0e commit a044d16

File tree

6 files changed

+251
-3
lines changed

6 files changed

+251
-3
lines changed

api/src/main/java/org/apache/iceberg/Scan.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ default ThisT select(String... columns) {
120120
*/
121121
ThisT filter(Expression expr);
122122

123+
default ThisT removeUnusedDeletesWhenPlanning() {
124+
throw new UnsupportedOperationException(
125+
this.getClass().getName() + " doesn't implement planByPartition");
126+
}
127+
123128
/**
124129
* Returns this scan's filter {@link Expression}.
125130
*

core/src/main/java/org/apache/iceberg/BaseScan.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ protected boolean shouldIgnoreResiduals() {
134134
return context().ignoreResiduals();
135135
}
136136

137+
/**
 * Returns the {@code removeUnusedDeletesWhenPlanning} flag stored in this scan's context.
 *
 * @return the value reported by {@code context().removeUnusedDeletesWhenPlanning()}
 */
protected boolean shouldRemoveUnusedDeletesWhenPlanning() {
138+
return context().removeUnusedDeletesWhenPlanning();
139+
}
140+
137141
protected Expression residualFilter() {
138142
return shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter();
139143
}
@@ -149,6 +153,12 @@ protected ExecutorService planExecutor() {
149153
protected abstract ThisT newRefinedScan(
150154
Table newTable, Schema newSchema, TableScanContext newContext);
151155

156+
@Override
157+
public ThisT removeUnusedDeletesWhenPlanning() {
158+
return newRefinedScan(
159+
table, tableSchema(), context.shouldRemoveUnusedDeletesWhenPlanning(true));
160+
}
161+
152162
@Override
153163
public ThisT option(String property, String value) {
154164
return newRefinedScan(table, schema, context.withOption(property, value));

core/src/main/java/org/apache/iceberg/DataTableScan.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ public CloseableIterable<FileScanTask> doPlanFiles() {
7979
.ignoreDeleted()
8080
.columnsToKeepStats(columnsToKeepStats());
8181

82+
if (shouldRemoveUnusedDeletesWhenPlanning()) {
83+
manifestGroup = manifestGroup.planByPartition();
84+
}
85+
8286
if (shouldIgnoreResiduals()) {
8387
manifestGroup = manifestGroup.ignoreResiduals();
8488
}

core/src/main/java/org/apache/iceberg/DeleteFileIndex.java

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@
2626
import java.util.Collection;
2727
import java.util.Collections;
2828
import java.util.Comparator;
29+
import java.util.Iterator;
2930
import java.util.List;
3031
import java.util.Map;
32+
import java.util.Objects;
3133
import java.util.Queue;
3234
import java.util.Set;
3335
import java.util.concurrent.ConcurrentLinkedQueue;
@@ -67,11 +69,11 @@
6769
class DeleteFileIndex {
6870
private static final DeleteFile[] EMPTY_DELETES = new DeleteFile[0];
6971

70-
private final EqualityDeletes globalDeletes;
7172
private final PartitionMap<EqualityDeletes> eqDeletesByPartition;
7273
private final PartitionMap<PositionDeletes> posDeletesByPartition;
7374
private final Map<String, PositionDeletes> posDeletesByPath;
7475
private final Map<String, DeleteFile> dvByPath;
76+
private EqualityDeletes globalDeletes;
7577
private final boolean hasEqDeletes;
7678
private final boolean hasPosDeletes;
7779
private final boolean isEmpty;
@@ -137,6 +139,47 @@ public Iterable<DeleteFile> referencedDeleteFiles() {
137139
return deleteFiles;
138140
}
139141

142+
void removeIndex(int specId, StructLike partition) {
143+
if (partition.size() == 0) {
144+
this.globalDeletes = null;
145+
return;
146+
}
147+
148+
if (eqDeletesByPartition != null) {
149+
eqDeletesByPartition.remove(specId, partition);
150+
}
151+
if (posDeletesByPartition != null) {
152+
posDeletesByPartition.remove(specId, partition);
153+
}
154+
155+
if (posDeletesByPath != null) {
156+
Set<String> toRemove = Sets.newHashSet();
157+
for (Map.Entry<String, PositionDeletes> deletes : posDeletesByPath.entrySet()) {
158+
Iterator<DeleteFile> deleteFiles = deletes.getValue().referencedDeleteFiles().iterator();
159+
if (deleteFiles.hasNext()) {
160+
DeleteFile deleteFile = deleteFiles.next();
161+
if (specId == deleteFile.specId() && Objects.equals(partition, deleteFile.partition())) {
162+
toRemove.add(deletes.getKey());
163+
break;
164+
}
165+
}
166+
}
167+
toRemove.forEach(posDeletesByPath::remove);
168+
}
169+
170+
if (dvByPath != null) {
171+
Set<String> toRemove = Sets.newHashSet();
172+
for (Map.Entry<String, DeleteFile> deletes : dvByPath.entrySet()) {
173+
DeleteFile deleteFile = deletes.getValue();
174+
if (specId == deleteFile.specId() && Objects.equals(partition, deleteFile.partition())) {
175+
toRemove.add(deletes.getKey());
176+
break;
177+
}
178+
}
179+
toRemove.forEach(dvByPath::remove);
180+
}
181+
}
182+
140183
/**
 * Returns the delete files for the data file held by a manifest entry.
 *
 * <p>Delegates to {@code forDataFile} with the entry's data sequence number and data file.
 *
 * @param entry manifest entry wrapping the data file to look up
 * @return the result of {@code forDataFile} for that entry
 */
DeleteFile[] forEntry(ManifestEntry<DataFile> entry) {
141184
return forDataFile(entry.dataSequenceNumber(), entry.file());
142185
}

core/src/main/java/org/apache/iceberg/ManifestGroup.java

Lines changed: 176 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
import com.github.benmanes.caffeine.cache.Caffeine;
2222
import com.github.benmanes.caffeine.cache.LoadingCache;
23+
import java.io.Closeable;
2324
import java.io.IOException;
25+
import java.nio.ByteBuffer;
2426
import java.util.List;
2527
import java.util.Map;
2628
import java.util.Set;
@@ -40,16 +42,18 @@
4042
import org.apache.iceberg.metrics.ScanMetricsUtil;
4143
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
4244
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
45+
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
4346
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
4447
import org.apache.iceberg.types.Types;
4548
import org.apache.iceberg.util.ContentFileUtil;
49+
import org.apache.iceberg.util.Pair;
4650
import org.apache.iceberg.util.ParallelIterable;
4751

4852
class ManifestGroup {
4953
private static final Types.StructType EMPTY_STRUCT = Types.StructType.of();
5054

5155
private final FileIO io;
52-
private final Set<ManifestFile> dataManifests;
56+
private Set<ManifestFile> dataManifests;
5357
private final DeleteFileIndex.Builder deleteIndexBuilder;
5458
private Predicate<ManifestEntry<DataFile>> manifestEntryPredicate;
5559
private Map<Integer, PartitionSpec> specsById;
@@ -64,6 +68,7 @@ class ManifestGroup {
6468
private Set<Integer> columnsToKeepStats;
6569
private ExecutorService executorService;
6670
private ScanMetrics scanMetrics;
71+
private DeleteFileIndex deleteFiles;
6772

6873
ManifestGroup(FileIO io, Iterable<ManifestFile> manifests) {
6974
this(
@@ -162,6 +167,34 @@ ManifestGroup planWith(ExecutorService newExecutorService) {
162167
return this;
163168
}
164169

170+
ManifestGroup planByPartition() {
171+
Map<Pair<Integer, StructLike>, Integer> partitionRefCount = Maps.newHashMap();
172+
Map<ManifestFile, Set<Pair<Integer, StructLike>>> distinctPartitionsInManifest =
173+
Maps.newHashMap();
174+
for (ManifestFile file : dataManifests) {
175+
Set<Pair<Integer, StructLike>> visited = Sets.newHashSet();
176+
try (ManifestReader<DataFile> reader = ManifestFiles.read(file, io)) {
177+
for (DataFile dataFile : reader) {
178+
Pair<Integer, StructLike> partition = Pair.of(dataFile.specId(), dataFile.partition());
179+
if (visited.add(partition)) {
180+
partitionRefCount.put(partition, partitionRefCount.getOrDefault(partition, 0) + 1);
181+
}
182+
}
183+
} catch (IOException e) {
184+
throw new RuntimeException(e);
185+
}
186+
distinctPartitionsInManifest.put(file, visited);
187+
}
188+
189+
Set<ManifestFile> newDataFiles = Sets.newHashSet();
190+
for (ManifestFile file : dataManifests) {
191+
newDataFiles.add(
192+
new CloseableManifest(file, partitionRefCount, distinctPartitionsInManifest));
193+
}
194+
this.dataManifests = newDataFiles;
195+
return this;
196+
}
197+
165198
/**
166199
* Returns an iterable of scan tasks. It is safe to add entries of this iterable to a collection
167200
* as {@link DataFile} in each {@link FileScanTask} is defensively copied.
@@ -172,6 +205,144 @@ public CloseableIterable<FileScanTask> planFiles() {
172205
return plan(ManifestGroup::createFileScanTasks);
173206
}
174207

208+
private class CloseableManifest implements ManifestFile, Closeable {
209+
private final ManifestFile delegate;
210+
private final Map<Pair<Integer, StructLike>, Integer> partitionRefCount;
211+
private final Map<ManifestFile, Set<Pair<Integer, StructLike>>> distinctPartitionsInManifest;
212+
213+
private CloseableManifest(
214+
ManifestFile delegate,
215+
Map<Pair<Integer, StructLike>, Integer> partitionRefCount,
216+
Map<ManifestFile, Set<Pair<Integer, StructLike>>> distinctPartitionsInManifest) {
217+
this.delegate = delegate;
218+
this.partitionRefCount = partitionRefCount;
219+
this.distinctPartitionsInManifest = distinctPartitionsInManifest;
220+
}
221+
222+
@Override
223+
public void close() {
224+
synchronized (partitionRefCount) {
225+
Set<Pair<Integer, StructLike>> pairs = distinctPartitionsInManifest.get(delegate);
226+
for (Pair<Integer, StructLike> partition : pairs) {
227+
partitionRefCount.put(partition, partitionRefCount.get(partition) - 1);
228+
if (partitionRefCount.get(partition) == 0) {
229+
deleteFiles.removeIndex(partition.first(), partition.second());
230+
}
231+
}
232+
}
233+
}
234+
235+
@Override
236+
public boolean hasAddedFiles() {
237+
return delegate.hasAddedFiles();
238+
}
239+
240+
@Override
241+
public boolean hasExistingFiles() {
242+
return delegate.hasExistingFiles();
243+
}
244+
245+
@Override
246+
public boolean hasDeletedFiles() {
247+
return delegate.hasDeletedFiles();
248+
}
249+
250+
@Override
251+
public ByteBuffer keyMetadata() {
252+
return delegate.keyMetadata();
253+
}
254+
255+
@Override
256+
public Long firstRowId() {
257+
return delegate.firstRowId();
258+
}
259+
260+
@Override
261+
public String path() {
262+
return delegate.path();
263+
}
264+
265+
@Override
266+
public long length() {
267+
return delegate.length();
268+
}
269+
270+
@Override
271+
public int partitionSpecId() {
272+
return delegate.partitionSpecId();
273+
}
274+
275+
@Override
276+
public ManifestContent content() {
277+
return delegate.content();
278+
}
279+
280+
@Override
281+
public long sequenceNumber() {
282+
return delegate.sequenceNumber();
283+
}
284+
285+
@Override
286+
public long minSequenceNumber() {
287+
return delegate.minSequenceNumber();
288+
}
289+
290+
@Override
291+
public Long snapshotId() {
292+
return delegate.snapshotId();
293+
}
294+
295+
@Override
296+
public Integer addedFilesCount() {
297+
return delegate.addedFilesCount();
298+
}
299+
300+
@Override
301+
public Long addedRowsCount() {
302+
return delegate.addedRowsCount();
303+
}
304+
305+
@Override
306+
public Integer existingFilesCount() {
307+
return delegate.existingFilesCount();
308+
}
309+
310+
@Override
311+
public Long existingRowsCount() {
312+
return delegate.existingRowsCount();
313+
}
314+
315+
@Override
316+
public Integer deletedFilesCount() {
317+
return delegate.deletedFilesCount();
318+
}
319+
320+
@Override
321+
public Long deletedRowsCount() {
322+
return delegate.deletedRowsCount();
323+
}
324+
325+
@Override
326+
public List<PartitionFieldSummary> partitions() {
327+
return delegate.partitions();
328+
}
329+
330+
@Override
331+
public ManifestFile copy() {
332+
return delegate.copy();
333+
}
334+
335+
@Override
336+
public int hashCode() {
337+
return delegate.hashCode();
338+
}
339+
340+
@Override
341+
public boolean equals(Object obj) {
342+
return delegate.equals(obj);
343+
}
344+
}
345+
175346
public <T extends ScanTask> CloseableIterable<T> plan(CreateTasksFunction<T> createTasksFunc) {
176347
LoadingCache<Integer, ResidualEvaluator> residualCache =
177348
Caffeine.newBuilder()
@@ -182,7 +353,7 @@ public <T extends ScanTask> CloseableIterable<T> plan(CreateTasksFunction<T> cre
182353
return ResidualEvaluator.of(spec, filter, caseSensitive);
183354
});
184355

185-
DeleteFileIndex deleteFiles = deleteIndexBuilder.scanMetrics(scanMetrics).build();
356+
deleteFiles = deleteIndexBuilder.scanMetrics(scanMetrics).build();
186357

187358
boolean dropStats = ManifestReader.dropStats(columns);
188359
if (deleteFiles.hasEqualityDeletes()) {
@@ -352,6 +523,9 @@ public void close() throws IOException {
352523
if (iterable != null) {
353524
iterable.close();
354525
}
526+
if (manifest instanceof CloseableManifest) {
527+
((CloseableManifest) manifest).close();
528+
}
355529
}
356530
});
357531
}

core/src/main/java/org/apache/iceberg/TableScanContext.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ public boolean returnColumnStats() {
6060
return false;
6161
}
6262

63+
/**
 * Whether delete files that are no longer needed should be removed while planning.
 *
 * @return {@code false} unless another value was set when this context was built
 */
@Value.Default
64+
public boolean removeUnusedDeletesWhenPlanning() {
65+
return false;
66+
}
67+
6368
@Nullable
6469
public abstract Set<Integer> columnsToKeepStats();
6570

@@ -129,6 +134,13 @@ TableScanContext shouldReturnColumnStats(boolean returnColumnStats) {
129134
.build();
130135
}
131136

137+
TableScanContext shouldRemoveUnusedDeletesWhenPlanning(boolean removeUnusedDeletesWhenPlanning) {
138+
return ImmutableTableScanContext.builder()
139+
.from(this)
140+
.removeUnusedDeletesWhenPlanning(removeUnusedDeletesWhenPlanning)
141+
.build();
142+
}
143+
132144
TableScanContext columnsToKeepStats(Set<Integer> columnsToKeepStats) {
133145
Preconditions.checkState(
134146
returnColumnStats(),

0 commit comments

Comments
 (0)