apache · JingsongLi · May 8, 2023 · May 5, 2023 · May 6, 2023 · May 6, 2023
diff --git a/docs/layouts/shortcodes/generated/core_configuration.html b/docs/layouts/shortcodes/generated/core_configuration.html
@@ -194,12 +194,6 @@
             <td><p>Enum</p></td>
             <td>Specify the merge engine for table with primary key.<br /><br />Possible values:<ul><li>"deduplicate": De-duplicate and keep the last row.</li><li>"partial-update": Partial update non-null fields.</li><li>"aggregation": Aggregate fields with same primary key.</li></ul></td>
         </tr>
-        <tr>
-            <td><h5>sort-engine</h5></td>
-            <td style="word-wrap: break-word;">loser-tree</td>
-            <td><p>Enum</p></td>
-            <td>Specify the sort engine for table with primary key.<br /><br />Possible values:<ul><li>"min-heap": Use min-heap for multiway sorting.</li><li>"loser-tree": Use loser-tree for multiway sorting. Compared with heapsort, loser-tree has fewer comparisons and is more efficient.</li></ul></td>
-        </tr>
         <tr>
             <td><h5>num-levels</h5></td>
             <td style="word-wrap: break-word;">(none)</td>
@@ -296,6 +290,12 @@
             <td>Long</td>
             <td>End condition "watermark" for bounded streaming mode. Stream reading will end when a larger watermark snapshot is encountered.</td>
         </tr>
+        <tr>
+            <td><h5>scan.manifest.parallelism</h5></td>
+            <td style="word-wrap: break-word;">(none)</td>
+            <td>Integer</td>
+            <td>The parallelism of scanning manifest files, default value is the size of cpu processor. Note: Scale-up this parameter will increase memory usage while scanning manifest files. We can consider downsize it when we encounter an out of memory exception while scanning.</td>
+        </tr>
         <tr>
             <td><h5>scan.mode</h5></td>
             <td style="word-wrap: break-word;">default</td>
@@ -344,6 +344,12 @@
             <td>Duration</td>
             <td>The maximum time of completed snapshots to retain.</td>
         </tr>
+        <tr>
+            <td><h5>sort-engine</h5></td>
+            <td style="word-wrap: break-word;">loser-tree</td>
+            <td><p>Enum</p></td>
+            <td>Specify the sort engine for table with primary key.<br /><br />Possible values:<ul><li>"min-heap": Use min-heap for multiway sorting.</li><li>"loser-tree": Use loser-tree for multiway sorting. Compared with heapsort, loser-tree has fewer comparisons and is more efficient.</li></ul></td>
+        </tr>
         <tr>
             <td><h5>source.split.open-file-cost</h5></td>
             <td style="word-wrap: break-word;">4 mb</td>

diff --git a/paimon-core/src/main/java/org/apache/paimon/AppendOnlyFileStore.java b/paimon-core/src/main/java/org/apache/paimon/AppendOnlyFileStore.java
@@ -96,7 +96,8 @@ private AppendOnlyFileStoreScan newScan(boolean forWrite) {
                 manifestFileFactory(forWrite),
                 manifestListFactory(forWrite),
                 options.bucket(),
-                forWrite);
+                forWrite,
+                options.scanManifestParallelism());
     }
 
     @Override

diff --git a/paimon-core/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-core/src/main/java/org/apache/paimon/CoreOptions.java
@@ -387,6 +387,15 @@ public class CoreOptions implements Serializable {
                             "End condition \"watermark\" for bounded streaming mode. Stream"
                                     + " reading will end when a larger watermark snapshot is encountered.");
 
+    public static final ConfigOption<Integer> SCAN_MANIFEST_PARALLELISM =
+            key("scan.manifest.parallelism")
+                    .intType()
+                    .noDefaultValue()
+                    .withDescription(
+                            "The parallelism of scanning manifest files, default value is the size of cpu processor. "
+                                    + "Note: Scale-up this parameter will increase memory usage while scanning manifest files. "
+                                    + "We can consider downsize it when we encounter an out of memory exception while scanning.");
+
     public static final ConfigOption<LogConsistency> LOG_CONSISTENCY =
             key("log.consistency")
                     .enumType(LogConsistency.class)
@@ -844,6 +853,10 @@ public Long scanSnapshotId() {
         return options.get(SCAN_SNAPSHOT_ID);
     }
 
+    public Integer scanManifestParallelism() {
+        return options.get(SCAN_MANIFEST_PARALLELISM);
+    }
+
     public Optional<String> sequenceField() {
         return options.getOptional(SEQUENCE_FIELD);
     }

diff --git a/paimon-core/src/main/java/org/apache/paimon/KeyValueFileStore.java b/paimon-core/src/main/java/org/apache/paimon/KeyValueFileStore.java
@@ -121,7 +121,8 @@ private KeyValueFileStoreScan newScan(boolean forWrite) {
                 manifestFileFactory(forWrite),
                 manifestListFactory(forWrite),
                 options.bucket(),
-                forWrite);
+                forWrite,
+                options.scanManifestParallelism());
     }
 
     @Override

diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/AbstractManifestEntry.java b/paimon-core/src/main/java/org/apache/paimon/manifest/AbstractManifestEntry.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.manifest;
+
+import org.apache.paimon.data.BinaryRow;
+import org.apache.paimon.utils.FileStorePathFactory;
+import org.apache.paimon.utils.Preconditions;
+
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Objects;
+
+/** Abstract base class modeling the simplest form of a manifest entry. */
+public abstract class AbstractManifestEntry {
+    protected final FileKind kind;
+    protected final String fileName;
+    // for tables without partition this field should be a row with 0 columns (not null)
+    protected final BinaryRow partition;
+    protected final int bucket;
+    protected final int totalBuckets;
+    protected final int level;
+
+    public AbstractManifestEntry(
+            FileKind kind,
+            String fileName,
+            BinaryRow partition,
+            int bucket,
+            int totalBuckets,
+            int level) {
+        this.kind = kind;
+        this.fileName = fileName;
+        this.partition = partition;
+        this.bucket = bucket;
+        this.totalBuckets = totalBuckets;
+        this.level = level;
+    }
+
+    public FileKind kind() {
+        return kind;
+    }
+
+    public BinaryRow partition() {
+        return partition;
+    }
+
+    public int bucket() {
+        return bucket;
+    }
+
+    public int totalBuckets() {
+        return totalBuckets;
+    }
+
+    public int level() {
+        return level;
+    }
+
+    public Identifier identifier() {
+        return new Identifier(partition, bucket, level, fileName);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (!(o instanceof AbstractManifestEntry)) {
+            return false;
+        }
+        AbstractManifestEntry that = (AbstractManifestEntry) o;
+        return Objects.equals(kind, that.kind)
+                && Objects.equals(partition, that.partition)
+                && bucket == that.bucket
+                && level == that.level
+                && Objects.equals(fileName, that.fileName);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(kind, partition, bucket, level, fileName);
+    }
+
+    @Override
+    public String toString() {
+        return String.format("{%s, %s, %d, %d, %s}", kind, partition, bucket, level, fileName);
+    }
+
+    public static <T extends AbstractManifestEntry> Collection<T> mergeEntries(
+            Iterable<T> entries) {
+        LinkedHashMap<Identifier, T> map = new LinkedHashMap<>();
+        mergeEntries(entries, map);
+        return map.values();
+    }
+
+    public static <T extends AbstractManifestEntry> void mergeEntries(
+            Iterable<T> entries, Map<Identifier, T> map) {
+        for (T entry : entries) {
+            Identifier identifier = entry.identifier();
+            switch (entry.kind()) {
+                case ADD:
+                    Preconditions.checkState(
+                            !map.containsKey(identifier),
+                            "Trying to add file %s which is already added. Manifest might be corrupted.",
+                            identifier);
+                    map.put(identifier, entry);
+                    break;
+                case DELETE:
+                    // each dataFile will only be added once and deleted once,
+                    // if we know that it is added before then both add and delete entry can be
+                    // removed because there won't be further operations on this file,
+                    // otherwise we have to keep the delete entry because the add entry must be
+                    // in the previous manifest files
+                    if (map.containsKey(identifier)) {
+                        map.remove(identifier);
+                    } else {
+                        map.put(identifier, entry);
+                    }
+                    break;
+                default:
+                    throw new UnsupportedOperationException(
+                            "Unknown value kind " + entry.kind().name());
+            }
+        }
+    }
+
+    /**
+     * The same {@link Identifier} indicates that the {@link AbstractManifestEntry} refers to the
+     * same data file.
+     */
+    public static class Identifier {
+        public final BinaryRow partition;
+        public final int bucket;
+        public final int level;
+        public final String fileName;
+
+        private Identifier(BinaryRow partition, int bucket, int level, String fileName) {
+            this.partition = partition;
+            this.bucket = bucket;
+            this.level = level;
+            this.fileName = fileName;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (!(o instanceof Identifier)) {
+                return false;
+            }
+            Identifier that = (Identifier) o;
+            return Objects.equals(partition, that.partition)
+                    && bucket == that.bucket
+                    && level == that.level
+                    && Objects.equals(fileName, that.fileName);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(partition, bucket, level, fileName);
+        }
+
+        @Override
+        public String toString() {
+            return String.format("{%s, %d, %d, %s}", partition, bucket, level, fileName);
+        }
+
+        public String toString(FileStorePathFactory pathFactory) {
+            return pathFactory.getPartitionString(partition)
+                    + ", bucket "
+                    + bucket
+                    + ", level "
+                    + level
+                    + ", file "
+                    + fileName;
+        }
+    }
+}