From 73b4404b52e2bbc8ea504fcc6501661995bdda44 Mon Sep 17 00:00:00 2001 From: wangtao Date: Thu, 27 Jul 2023 17:21:02 +0800 Subject: [PATCH 1/5] init test class --- .../test/java/TestFindDuplicateRecords.java | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 ams/server/src/test/java/TestFindDuplicateRecords.java diff --git a/ams/server/src/test/java/TestFindDuplicateRecords.java b/ams/server/src/test/java/TestFindDuplicateRecords.java new file mode 100644 index 0000000000..4315e06dc4 --- /dev/null +++ b/ams/server/src/test/java/TestFindDuplicateRecords.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.netease.arctic.table.TableIdentifier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestFindDuplicateRecords { + private static final Logger LOG = LoggerFactory.getLogger(TestFindDuplicateRecords.class); + private final String thriftUrl; + + public static void main(String[] args) { + TestFindDuplicateRecords scan = new TestFindDuplicateRecords("thrift://localhost:1260"); + scan.test(TableIdentifier.of("xxx", "yyy", "zzz")); + } + + public TestFindDuplicateRecords(String thriftUrl) { + this.thriftUrl = thriftUrl; + } + + public void test(TableIdentifier tableIdentifier) { + + } +} From 21b8775ce53c8cc7505d2ff137755d4361851f83 Mon Sep 17 00:00:00 2001 From: wangtao Date: Thu, 27 Jul 2023 17:23:44 +0800 Subject: [PATCH 2/5] load table --- .../src/test/java/TestFindDuplicateRecords.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ams/server/src/test/java/TestFindDuplicateRecords.java b/ams/server/src/test/java/TestFindDuplicateRecords.java index 4315e06dc4..a1517a1870 100644 --- a/ams/server/src/test/java/TestFindDuplicateRecords.java +++ b/ams/server/src/test/java/TestFindDuplicateRecords.java @@ -16,6 +16,9 @@ * limitations under the License. */ +import com.netease.arctic.catalog.ArcticCatalog; +import com.netease.arctic.catalog.CatalogLoader; +import com.netease.arctic.table.ArcticTable; import com.netease.arctic.table.TableIdentifier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,6 +37,16 @@ public TestFindDuplicateRecords(String thriftUrl) { } public void test(TableIdentifier tableIdentifier) { - + ArcticTable arcticTable = loadTable(tableIdentifier); + // TODO + arcticTable.asUnkeyedTable(); + } + + private ArcticTable loadTable(TableIdentifier tableIdentifier) { + String catalogUrl = thriftUrl + "/" + tableIdentifier.getCatalog(); + + // 1.scan files + ArcticCatalog load = CatalogLoader.load(catalogUrl); + return load.loadTable(tableIdentifier); } } From 70d5751b52906b8c1a8e8d793018f0dc7a197051 Mon Sep 17 00:00:00 2001 From: wangtao Date: Thu, 27 Jul 2023 17:59:15 +0800 Subject: [PATCH 3/5] add related datafiles --- .../test/java/TestFindDuplicateRecords.java | 48 +++++++++++++++++-- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/ams/server/src/test/java/TestFindDuplicateRecords.java b/ams/server/src/test/java/TestFindDuplicateRecords.java index a1517a1870..bce61603cc 100644 --- a/ams/server/src/test/java/TestFindDuplicateRecords.java +++ b/ams/server/src/test/java/TestFindDuplicateRecords.java @@ -18,28 +18,66 @@ import com.netease.arctic.catalog.ArcticCatalog; import com.netease.arctic.catalog.CatalogLoader; +import com.netease.arctic.io.DataTestHelpers; import com.netease.arctic.table.ArcticTable; import com.netease.arctic.table.TableIdentifier; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + public class TestFindDuplicateRecords { private static final Logger LOG = LoggerFactory.getLogger(TestFindDuplicateRecords.class); private final String thriftUrl; + private final String CID_COLUMN = "id"; + private final String HID_COLUMN = "name"; + + private final Object CID_VALUE = 2003; + private final Object HID_VALUE = "hhh"; - public static void main(String[] args) { + public static void main(String[] args) throws IOException { TestFindDuplicateRecords scan = new TestFindDuplicateRecords("thrift://localhost:1260"); - scan.test(TableIdentifier.of("xxx", "yyy", "zzz")); + scan.test(TableIdentifier.of("local_iceberg", "db", "user10")); } public TestFindDuplicateRecords(String thriftUrl) { this.thriftUrl = thriftUrl; } - public void test(TableIdentifier tableIdentifier) { + public void test(TableIdentifier tableIdentifier) throws IOException { ArcticTable arcticTable = loadTable(tableIdentifier); - // TODO - arcticTable.asUnkeyedTable(); + Table table = arcticTable.asUnkeyedTable(); + List> relatedFiles = findRelatedFiles(table, table.currentSnapshot().snapshotId()); + + System.out.println("===== related data files ====="); + relatedFiles.forEach(f -> System.out.println(f.path())); + } + + public List> findRelatedFiles(Table table, long snapshotId) throws IOException { + List> files = new ArrayList<>(); + for (FileScanTask task : table.newScan().useSnapshot(snapshotId).planFiles()) { + List records = + DataTestHelpers.readDataFile(FileFormat.PARQUET, table.schema(), task.file().path().toString()); + for (Record record : records) { + if (equals((GenericRecord) record)) { + files.add(task.file()); + break; + } + } + } + return files; + } + + private boolean equals(GenericRecord record) { + return record.getField(CID_COLUMN).equals(CID_VALUE) && record.getField(HID_COLUMN).equals(HID_VALUE); } private ArcticTable loadTable(TableIdentifier tableIdentifier) { From be902e8882accb4820d1a0268d4aac2efc527c47 Mon Sep 17 00:00:00 2001 From: wangtao Date: Thu, 27 Jul 2023 17:59:36 +0800 Subject: [PATCH 4/5] fix load from sysdb" gst --- .../com/netease/arctic/server/table/TableConfiguration.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ams/server/src/main/java/com/netease/arctic/server/table/TableConfiguration.java b/ams/server/src/main/java/com/netease/arctic/server/table/TableConfiguration.java index a81518aa99..122fa22f39 100644 --- a/ams/server/src/main/java/com/netease/arctic/server/table/TableConfiguration.java +++ b/ams/server/src/main/java/com/netease/arctic/server/table/TableConfiguration.java @@ -1,5 +1,6 @@ package com.netease.arctic.server.table; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.google.common.base.Objects; import com.netease.arctic.server.optimizing.OptimizingConfig; import com.netease.arctic.table.TableProperties; @@ -7,6 +8,7 @@ import java.util.Map; +@JsonIgnoreProperties(ignoreUnknown = true) public class TableConfiguration { private boolean expireSnapshotEnabled; private long snapshotTTLMinutes; From e68229b18e2cf631de6e6ed32cab5fd8ee844fa1 Mon Sep 17 00:00:00 2001 From: wangtao Date: Fri, 28 Jul 2023 10:04:59 +0800 Subject: [PATCH 5/5] add planFiles --- .../src/test/java/TestFindDuplicateRecords.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ams/server/src/test/java/TestFindDuplicateRecords.java b/ams/server/src/test/java/TestFindDuplicateRecords.java index bce61603cc..624d3d0187 100644 --- a/ams/server/src/test/java/TestFindDuplicateRecords.java +++ b/ams/server/src/test/java/TestFindDuplicateRecords.java @@ -22,6 +22,7 @@ import com.netease.arctic.table.ArcticTable; import com.netease.arctic.table.TableIdentifier; import org.apache.iceberg.ContentFile; +import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Table; @@ -76,6 +77,21 @@ public List> findRelatedFiles(Table table, long snapshotId) throw return files; } + public static void planFiles(Table table, long snapshotId) { + String file1 = "xxx"; + String file2 = "yyy"; + for (FileScanTask task : table.newScan().useSnapshot(snapshotId).planFiles()) { + String path = task.file().path().toString(); + if (path.equals(file1) || path.equals(file2)) { + System.out.println("===== related data files ====="); + System.out.println(path); + for (DeleteFile delete : task.deletes()) { + System.out.println(delete.path()); + } + } + } + } + private boolean equals(GenericRecord record) { return record.getField(CID_COLUMN).equals(CID_VALUE) && record.getField(HID_COLUMN).equals(HID_VALUE); }