From b7b0cf28ca73e67bad8ca50cae37eb81d0087b2d Mon Sep 17 00:00:00 2001
From: Hemant Kumar
Date: Tue, 19 Mar 2024 16:48:22 -0700
Subject: [PATCH] CDPD-64153. HDDS-9802. Tool to fix corrupted snapshot chain (#6386)

(cherry picked from commit e9073166da9cb16e0529df9f59818842cc7fbbcb)

Change-Id: I67950ede21d47449ce433b629e1a8f24a1d88a19
---
 .../hadoop/ozone/om/helpers/SnapshotInfo.java |  24 ++-
 hadoop-ozone/dist/src/shell/ozone/ozone       |   5 +
 .../hadoop/ozone/repair/OzoneRepair.java      |  64 ++++++
 .../apache/hadoop/ozone/repair/RDBRepair.java |  58 +++++
 .../ozone/repair/om/SnapshotRepair.java       | 200 ++++++++++++++++++
 .../hadoop/ozone/repair/om/package-info.java  |  22 ++
 .../hadoop/ozone/repair/package-info.java     |  22 ++
 7 files changed, 393 insertions(+), 2 deletions(-)
 create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/OzoneRepair.java
 create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/RDBRepair.java
 create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/SnapshotRepair.java
 create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/package-info.java
 create mode 100644 hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/package-info.java

diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/SnapshotInfo.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/SnapshotInfo.java
index bdb642d8fba..ad3d93bb87c 100644
--- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/SnapshotInfo.java
+++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/SnapshotInfo.java
@@ -49,8 +49,8 @@
  * This class is used for storing info related to Snapshots.
  *
  * Each snapshot created has an associated SnapshotInfo entry
- * containing the snapshotid, snapshot path,
- * snapshot checkpoint directory, previous snapshotid
+ * containing the snapshotId, snapshot path,
+ * snapshot checkpoint directory, previous snapshotId
  * for the snapshot path & global amongst other necessary fields.
  */
 public final class SnapshotInfo implements Auditable, CopyObject<SnapshotInfo> {
@@ -619,4 +619,24 @@ public SnapshotInfo copyObject() {
         .setSstFiltered(sstFiltered)
         .build();
   }
+
+  @Override
+  public String toString() {
+    return "SnapshotInfo{" +
+        "snapshotId: '" + snapshotId + '\'' +
+        ", name: '" + name + '\'' +
+        ", volumeName: '" + volumeName + '\'' +
+        ", bucketName: '" + bucketName + '\'' +
+        ", snapshotStatus: '" + snapshotStatus + '\'' +
+        ", creationTime: '" + creationTime + '\'' +
+        ", deletionTime: '" + deletionTime + '\'' +
+        ", pathPreviousSnapshotId: '" + pathPreviousSnapshotId + '\'' +
+        ", globalPreviousSnapshotId: '" + globalPreviousSnapshotId + '\'' +
+        ", snapshotPath: '" + snapshotPath + '\'' +
+        ", checkpointDir: '" + checkpointDir + '\'' +
+        ", dbTxSequenceNumber: '" + dbTxSequenceNumber + '\'' +
+        ", deepClean: '" + deepClean + '\'' +
+        ", sstFiltered: '" + sstFiltered + '\'' +
+        '}';
+  }
 }
diff --git a/hadoop-ozone/dist/src/shell/ozone/ozone b/hadoop-ozone/dist/src/shell/ozone/ozone
index 3d20c69d669..a86b312f031 100755
--- a/hadoop-ozone/dist/src/shell/ozone/ozone
+++ b/hadoop-ozone/dist/src/shell/ozone/ozone
@@ -59,6 +59,7 @@ function ozone_usage
   ozone_add_subcommand "dtutil" client "operations related to delegation tokens"
   ozone_add_subcommand "admin" client "Ozone admin tool"
   ozone_add_subcommand "debug" client "Ozone debug tool"
+  ozone_add_subcommand "repair" client "Ozone repair tool"
 
   ozone_generate_usage "${OZONE_SHELL_EXECNAME}" false
 }
@@ -229,6 +230,10 @@ function ozonecmd_case
       OZONE_CLASSNAME=org.apache.hadoop.ozone.debug.OzoneDebug
       OZONE_RUN_ARTIFACT_NAME="ozone-tools"
     ;;
+    repair)
+      OZONE_CLASSNAME=org.apache.hadoop.ozone.repair.OzoneRepair
+      OZONE_RUN_ARTIFACT_NAME="ozone-tools"
+    ;;
     *)
       OZONE_CLASSNAME="${subcmd}"
       if ! ozone_validate_classname "${OZONE_CLASSNAME}"; then
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/OzoneRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/OzoneRepair.java
new file mode 100644
index 00000000000..3bbbded5802
--- /dev/null
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/OzoneRepair.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.repair;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import picocli.CommandLine;
+
+/**
+ * Ozone Repair Command line tool.
+ */
+@CommandLine.Command(name = "ozone repair",
+    description = "Operational tool to repair Ozone",
+    versionProvider = HddsVersionProvider.class,
+    mixinStandardHelpOptions = true)
+public class OzoneRepair extends GenericCli {
+
+  private OzoneConfiguration ozoneConf;
+
+  public OzoneRepair() {
+    super(OzoneRepair.class);
+  }
+
+  @VisibleForTesting
+  public OzoneRepair(OzoneConfiguration configuration) {
+    super(OzoneRepair.class);
+    this.ozoneConf = configuration;
+  }
+
+  public OzoneConfiguration getOzoneConf() {
+    if (ozoneConf == null) {
+      ozoneConf = createOzoneConfiguration();
+    }
+    return ozoneConf;
+  }
+
+  /**
+   * Main for the Ozone Repair shell Command handling.
+   *
+   * @param argv - System Args Strings[]
+   * @throws Exception
+   */
+  public static void main(String[] argv) throws Exception {
+    new OzoneRepair().run(argv);
+  }
+}
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/RDBRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/RDBRepair.java
new file mode 100644
index 00000000000..0f36934ec14
--- /dev/null
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/RDBRepair.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.repair;
+
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.kohsuke.MetaInfServices;
+import picocli.CommandLine;
+
+import java.util.concurrent.Callable;
+
+/**
+ * Ozone Repair CLI for RocksDB.
+ */
+@CommandLine.Command(name = "ldb",
+    description = "Operational tool to repair RocksDB table.")
+@MetaInfServices(SubcommandWithParent.class)
+public class RDBRepair implements Callable<Void>, SubcommandWithParent {
+
+  @CommandLine.Spec
+  private CommandLine.Model.CommandSpec spec;
+
+  @CommandLine.Option(names = {"--db"},
+      required = true,
+      description = "Database File Path")
+  private String dbPath;
+
+  public String getDbPath() {
+    return dbPath;
+  }
+
+  @Override
+  public Void call() {
+    GenericCli.missingSubcommand(spec);
+    return null;
+  }
+
+  @Override
+  public Class<?> getParentType() {
+    return OzoneRepair.class;
+  }
+}
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/SnapshotRepair.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/SnapshotRepair.java
new file mode 100644
index 00000000000..a364936874f
--- /dev/null
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/SnapshotRepair.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.repair.om;
+
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.apache.hadoop.hdds.utils.IOUtils;
+import org.apache.hadoop.hdds.utils.db.StringCodec;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
+import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
+import org.apache.hadoop.ozone.debug.RocksDBUtils;
+import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
+import org.apache.hadoop.ozone.repair.RDBRepair;
+import org.apache.hadoop.ozone.shell.bucket.BucketUri;
+import org.kohsuke.MetaInfServices;
+import org.rocksdb.ColumnFamilyDescriptor;
+import org.rocksdb.ColumnFamilyHandle;
+import org.rocksdb.RocksDBException;
+import picocli.CommandLine;
+import picocli.CommandLine.Model.CommandSpec;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.Callable;
+
+import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX;
+import static org.apache.hadoop.ozone.OzoneConsts.SNAPSHOT_INFO_TABLE;
+
+/**
+ * Tool to repair snapshotInfoTable in case it has corrupted entries.
+ */
+@CommandLine.Command(
+    name = "snapshot",
+    description = "CLI to update global and path previous snapshot for a snapshot in case snapshot chain is corrupted."
+)
+@MetaInfServices(SubcommandWithParent.class)
+public class SnapshotRepair implements Callable<Void>, SubcommandWithParent {
+
+  @CommandLine.Spec
+  private CommandSpec spec;
+
+  @CommandLine.ParentCommand
+  private RDBRepair parent;
+
+  @CommandLine.Mixin
+  private BucketUri bucketUri;
+
+  @CommandLine.Parameters(description = "Snapshot name to update", index = "1")
+  private String snapshotName;
+
+  @CommandLine.Option(names = {"--global-previous", "--gp"},
+      required = true,
+      description = "Global previous snapshotId to set for the given snapshot")
+  private UUID globalPreviousSnapshotId;
+
+  @CommandLine.Option(names = {"--path-previous", "--pp"},
+      required = true,
+      description = "Path previous snapshotId to set for the given snapshot")
+  private UUID pathPreviousSnapshotId;
+
+  @CommandLine.Option(names = {"--dry-run"},
+      required = true,
+      description = "To dry-run the command.", defaultValue = "true")
+  private boolean dryRun;
+
+  @Override
+  public Void call() throws Exception {
+    List<ColumnFamilyHandle> cfHandleList = new ArrayList<>();
+    List<ColumnFamilyDescriptor> cfDescList = RocksDBUtils.getColumnFamilyDescriptors(parent.getDbPath());
+
+    try (ManagedRocksDB db = ManagedRocksDB.open(parent.getDbPath(), cfDescList, cfHandleList)) {
+      ColumnFamilyHandle snapshotInfoCfh = getSnapshotInfoCfh(cfHandleList);
+      if (snapshotInfoCfh == null) {
+        System.err.println(SNAPSHOT_INFO_TABLE + " is not in a column family in DB for the given path.");
+        return null;
+      }
+
+      String snapshotInfoTableKey = SnapshotInfo.getTableKey(bucketUri.getValue().getVolumeName(),
+          bucketUri.getValue().getBucketName(), snapshotName);
+
+      SnapshotInfo snapshotInfo = getSnapshotInfo(db, snapshotInfoCfh, snapshotInfoTableKey);
+      if (snapshotInfo == null) {
+        System.err.println(snapshotName + " does not exist for given bucketUri: " + OM_KEY_PREFIX +
+            bucketUri.getValue().getVolumeName() + OM_KEY_PREFIX + bucketUri.getValue().getBucketName());
+        return null;
+      }
+
+      // snapshotIdSet holds the IDs of all existing snapshots. It is used to verify that the provided global previous
+      // and path previous snapshots exist, so that after the update the snapshot does not point to a ghost snapshot.
+      Set<UUID> snapshotIdSet = getSnapshotIdSet(db, snapshotInfoCfh);
+
+      if (Objects.equals(snapshotInfo.getSnapshotId(), globalPreviousSnapshotId)) {
+        System.err.println("globalPreviousSnapshotId: '" + globalPreviousSnapshotId +
+            "' is equal to given snapshot's ID: '" + snapshotInfo.getSnapshotId() + "'.");
+        return null;
+      }
+
+      if (Objects.equals(snapshotInfo.getSnapshotId(), pathPreviousSnapshotId)) {
+        System.err.println("pathPreviousSnapshotId: '" + pathPreviousSnapshotId +
+            "' is equal to given snapshot's ID: '" + snapshotInfo.getSnapshotId() + "'.");
+        return null;
+      }
+
+      if (!snapshotIdSet.contains(globalPreviousSnapshotId)) {
+        System.err.println("globalPreviousSnapshotId: '" + globalPreviousSnapshotId +
+            "' does not exist in snapshotInfoTable.");
+        return null;
+      }
+
+      if (!snapshotIdSet.contains(pathPreviousSnapshotId)) {
+        System.err.println("pathPreviousSnapshotId: '" + pathPreviousSnapshotId +
+            "' does not exist in snapshotInfoTable.");
+        return null;
+      }
+
+      snapshotInfo.setGlobalPreviousSnapshotId(globalPreviousSnapshotId);
+      snapshotInfo.setPathPreviousSnapshotId(pathPreviousSnapshotId);
+
+      if (dryRun) {
+        System.out.println("SnapshotInfo would be updated to : " + snapshotInfo);
+      } else {
+        byte[] snapshotInfoBytes = SnapshotInfo.getCodec().toPersistedFormat(snapshotInfo);
+        db.get()
+            .put(snapshotInfoCfh, StringCodec.get().toPersistedFormat(snapshotInfoTableKey), snapshotInfoBytes);
+
+        System.out.println("Snapshot Info is updated to : " +
+            getSnapshotInfo(db, snapshotInfoCfh, snapshotInfoTableKey));
+      }
+    } catch (RocksDBException exception) {
+      System.err.println("Failed to update the RocksDB for the given path: " + parent.getDbPath());
+      System.err.println(
+          "Make sure that Ozone entity (OM, SCM or DN) is not running for the give dbPath and current host.");
+      System.err.println(exception);
+    } finally {
+      IOUtils.closeQuietly(new ArrayList<>(cfHandleList));
+    }
+
+    return null;
+  }
+
+  private Set<UUID> getSnapshotIdSet(ManagedRocksDB db, ColumnFamilyHandle snapshotInfoCfh)
+      throws IOException {
+    Set<UUID> snapshotIdSet = new HashSet<>();
+    try (ManagedRocksIterator iterator = new ManagedRocksIterator(db.get().newIterator(snapshotInfoCfh))) {
+      iterator.get().seekToFirst();
+
+      while (iterator.get().isValid()) {
+        SnapshotInfo snapshotInfo = SnapshotInfo.getCodec().fromPersistedFormat(iterator.get().value());
+        snapshotIdSet.add(snapshotInfo.getSnapshotId());
+        iterator.get().next();
+      }
+    }
+    return snapshotIdSet;
+  }
+
+  private ColumnFamilyHandle getSnapshotInfoCfh(List<ColumnFamilyHandle> cfHandleList) throws RocksDBException {
+    byte[] nameBytes = SNAPSHOT_INFO_TABLE.getBytes(StandardCharsets.UTF_8);
+
+    for (ColumnFamilyHandle cf : cfHandleList) {
+      if (Arrays.equals(cf.getName(), nameBytes)) {
+        return cf;
+      }
+    }
+
+    return null;
+  }
+
+  private SnapshotInfo getSnapshotInfo(ManagedRocksDB db, ColumnFamilyHandle snapshotInfoCfh, String snapshotInfoLKey)
+      throws IOException, RocksDBException {
+    byte[] bytes = db.get().get(snapshotInfoCfh, StringCodec.get().toPersistedFormat(snapshotInfoLKey));
+    return bytes != null ? SnapshotInfo.getCodec().fromPersistedFormat(bytes) : null;
+  }
+
+  @Override
+  public Class<?> getParentType() {
+    return RDBRepair.class;
+  }
+}
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/package-info.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/package-info.java
new file mode 100644
index 00000000000..9e2324a4a6f
--- /dev/null
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/om/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * OM related repair tools.
+ */
+package org.apache.hadoop.ozone.repair.om;
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/package-info.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/package-info.java
new file mode 100644
index 00000000000..bd382d04cf7
--- /dev/null
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/repair/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Ozone Repair tools.
+ */
+package org.apache.hadoop.ozone.repair;