Skip to content

Commit

Permalink
HDDS-9802. Tool to fix corrupted snapshot chain (apache#6386)
Browse files Browse the repository at this point in the history
(cherry picked from commit e907316)
  • Loading branch information
hemantk-12 authored and xichen01 committed Apr 17, 2024
1 parent 986be02 commit b8a01fb
Show file tree
Hide file tree
Showing 7 changed files with 374 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
* This class is used for storing info related to Snapshots.
*
* Each snapshot created has an associated SnapshotInfo entry
* containing the snapshotid, snapshot path,
* snapshot checkpoint directory, previous snapshotid
* containing the snapshotId, snapshot path,
* snapshot checkpoint directory, previous snapshotId
* for the snapshot path & global amongst other necessary fields.
*/
public final class SnapshotInfo implements Auditable, CopyObject<SnapshotInfo> {
Expand Down
6 changes: 6 additions & 0 deletions hadoop-ozone/dist/src/shell/ozone/ozone
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ function ozone_usage
ozone_add_subcommand "dtutil" client "operations related to delegation tokens"
ozone_add_subcommand "admin" client "Ozone admin tool"
ozone_add_subcommand "debug" client "Ozone debug tool"
ozone_add_subcommand "repair" client "Ozone repair tool"
ozone_add_subcommand "checknative" client "checks if native libraries are loaded"

ozone_generate_usage "${OZONE_SHELL_EXECNAME}" false
Expand Down Expand Up @@ -236,6 +237,11 @@ function ozonecmd_case
OZONE_DEBUG_OPTS="${OZONE_DEBUG_OPTS} ${OZONE_MODULE_ACCESS_ARGS}"
OZONE_RUN_ARTIFACT_NAME="ozone-tools"
;;
repair)
OZONE_CLASSNAME=org.apache.hadoop.ozone.repair.OzoneRepair
OZONE_DEBUG_OPTS="${OZONE_DEBUG_OPTS} ${OZONE_MODULE_ACCESS_ARGS}"
OZONE_RUN_ARTIFACT_NAME="ozone-tools"
;;
checknative)
OZONE_CLASSNAME=org.apache.hadoop.ozone.shell.checknative.CheckNative
OZONE_RUN_ARTIFACT_NAME="ozone-tools"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.repair;

import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hdds.cli.GenericCli;
import org.apache.hadoop.hdds.cli.HddsVersionProvider;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import picocli.CommandLine;

/**
 * Top-level command of the {@code ozone repair} command line tool.
 *
 * <p>The Ozone configuration is either supplied through the testing
 * constructor or created on first access via {@link #getOzoneConf()}.
 */
@CommandLine.Command(name = "ozone repair",
    description = "Operational tool to repair Ozone",
    versionProvider = HddsVersionProvider.class,
    mixinStandardHelpOptions = true)
public class OzoneRepair extends GenericCli {

  /** Configuration; stays {@code null} until injected or first requested. */
  private OzoneConfiguration ozoneConf;

  /**
   * Creates the command without a configuration; one is created lazily by
   * {@link #getOzoneConf()}.
   */
  public OzoneRepair() {
    super(OzoneRepair.class);
  }

  /**
   * Creates the command with a caller-provided configuration.
   *
   * @param configuration configuration returned by {@link #getOzoneConf()}
   */
  @VisibleForTesting
  public OzoneRepair(OzoneConfiguration configuration) {
    super(OzoneRepair.class);
    this.ozoneConf = configuration;
  }

  /**
   * Returns the configuration of this command, creating it with
   * {@code createOzoneConfiguration()} when none has been set yet.
   *
   * @return the Ozone configuration held by this command
   */
  public OzoneConfiguration getOzoneConf() {
    if (ozoneConf != null) {
      return ozoneConf;
    }
    ozoneConf = createOzoneConfiguration();
    return ozoneConf;
  }

  /**
   * Main for the Ozone Repair shell Command handling.
   *
   * @param argv - System Args Strings[]
   * @throws Exception if running the command fails
   */
  public static void main(String[] argv) throws Exception {
    OzoneRepair repairCli = new OzoneRepair();
    repairCli.run(argv);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.repair;

import org.apache.hadoop.hdds.cli.GenericCli;
import org.apache.hadoop.hdds.cli.SubcommandWithParent;
import org.kohsuke.MetaInfServices;
import picocli.CommandLine;

import java.util.concurrent.Callable;

/**
 * Ozone Repair CLI for RocksDB.
 *
 * <p>This command only groups RocksDB repair subcommands under
 * {@link OzoneRepair}; invoked on its own it reports a missing subcommand.
 * The {@code --db} path it collects is exposed to subcommands through
 * {@link #getDbPath()}.
 */
@CommandLine.Command(name = "ldb",
    description = "Operational tool to repair RocksDB table.")
@MetaInfServices(SubcommandWithParent.class)
public class RDBRepair implements Callable<Void>, SubcommandWithParent {

  /** Command spec injected by picocli; used to report a missing subcommand. */
  @CommandLine.Spec
  private CommandLine.Model.CommandSpec spec;

  /** Value of the mandatory {@code --db} option. */
  @CommandLine.Option(names = {"--db"},
      required = true,
      description = "Database File Path")
  private String dbPath;

  /**
   * @return the command this one is registered under, {@link OzoneRepair}
   */
  @Override
  public Class<?> getParentType() {
    return OzoneRepair.class;
  }

  /**
   * Called when no subcommand was given; delegates to
   * {@code GenericCli.missingSubcommand}.
   *
   * @return always {@code null}
   */
  @Override
  public Void call() {
    GenericCli.missingSubcommand(spec);
    return null;
  }

  /**
   * @return the database path passed with {@code --db}
   */
  public String getDbPath() {
    return dbPath;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.ozone.repair.om;

import org.apache.hadoop.hdds.cli.SubcommandWithParent;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.hadoop.hdds.utils.db.StringCodec;
import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksDB;
import org.apache.hadoop.hdds.utils.db.managed.ManagedRocksIterator;
import org.apache.hadoop.ozone.debug.RocksDBUtils;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.repair.RDBRepair;
import org.apache.hadoop.ozone.shell.bucket.BucketUri;
import org.kohsuke.MetaInfServices;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.RocksDBException;
import picocli.CommandLine;
import picocli.CommandLine.Model.CommandSpec;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;

import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX;
import static org.apache.hadoop.ozone.OzoneConsts.SNAPSHOT_INFO_TABLE;

/**
 * Tool to repair snapshotInfoTable in case it has corrupted entries.
 *
 * <p>The command opens the RocksDB given to the parent {@link RDBRepair}
 * command, looks up the {@link SnapshotInfo} of the requested snapshot and
 * replaces its global-previous and path-previous snapshot IDs with the
 * provided values. With {@code --dry-run} set, the resulting entry is only
 * printed; otherwise it is written back to the snapshot info column family.
 */
@CommandLine.Command(
    name = "snapshot",
    description = "CLI to update global and path previous snapshot for a snapshot in case snapshot chain is corrupted."
)
@MetaInfServices(SubcommandWithParent.class)
public class SnapshotRepair implements Callable<Void>, SubcommandWithParent {

  // NOTE(review): this injected spec is static and never read in this class - confirm whether it is still needed.
  @CommandLine.Spec
  private static CommandSpec spec;

  /** Parent command; provides the RocksDB path via {@code getDbPath()}. */
  @CommandLine.ParentCommand
  private RDBRepair parent;

  /** Volume and bucket of the snapshot to update. */
  @CommandLine.Mixin
  private BucketUri bucketUri;

  @CommandLine.Parameters(description = "Snapshot name to update", index = "1")
  private String snapshotName;

  @CommandLine.Option(names = {"--global-previous", "--gp"},
      required = true,
      description = "Global previous snapshotId to set for the given snapshot")
  private UUID globalPreviousSnapshotId;

  @CommandLine.Option(names = {"--path-previous", "--pp"},
      required = true,
      description = "Path previous snapshotId to set for the given snapshot")
  private UUID pathPreviousSnapshotId;

  // NOTE(review): the option is declared both required and with a default value - confirm this is intended.
  @CommandLine.Option(names = {"--dry-run"},
      required = true,
      description = "To dry-run the command.", defaultValue = "true")
  private boolean dryRun;

  /**
   * Opens the DB, performs the snapshot chain update and reports RocksDB
   * failures on stderr.
   *
   * @return always {@code null}
   * @throws Exception if reading the column family descriptors or
   *                   (de)serializing a {@link SnapshotInfo} fails
   */
  @Override
  public Void call() throws Exception {
    List<ColumnFamilyHandle> cfHandleList = new ArrayList<>();
    List<ColumnFamilyDescriptor> cfDescList = RocksDBUtils.getColumnFamilyDescriptors(parent.getDbPath());

    try (ManagedRocksDB db = ManagedRocksDB.open(parent.getDbPath(), cfDescList, cfHandleList)) {
      try {
        updateSnapshotChain(db, cfHandleList);
      } finally {
        // Column family handles have to be released while the DB is still open; try-with-resources closes the DB
        // only after this inner block has completed.
        IOUtils.closeQuietly(cfHandleList);
      }
    } catch (RocksDBException exception) {
      System.err.println("Failed to update the RocksDB for the given path: " + parent.getDbPath());
      System.err.println(
          "Make sure that Ozone entity (OM, SCM or DN) is not running for the give dbPath and current host.");
      System.err.println(exception);
    }

    return null;
  }

  /**
   * Validates the requested previous snapshot IDs and, if they are valid,
   * applies them to the snapshot's entry (or only prints the would-be entry
   * in dry-run mode). Every validation failure is reported on stderr and
   * leaves the DB untouched.
   *
   * @param db open DB holding the snapshot info column family
   * @param cfHandleList handles of all column families of {@code db}
   * @throws IOException if a {@link SnapshotInfo} cannot be (de)serialized
   * @throws RocksDBException if reading from or writing to the DB fails
   */
  private void updateSnapshotChain(ManagedRocksDB db, List<ColumnFamilyHandle> cfHandleList)
      throws IOException, RocksDBException {
    ColumnFamilyHandle snapshotInfoCfh = getSnapshotInfoCfh(cfHandleList);
    if (snapshotInfoCfh == null) {
      System.err.println(SNAPSHOT_INFO_TABLE + " is not in a column family in DB for the given path.");
      return;
    }

    String snapshotInfoTableKey = SnapshotInfo.getTableKey(bucketUri.getValue().getVolumeName(),
        bucketUri.getValue().getBucketName(), snapshotName);

    SnapshotInfo snapshotInfo = getSnapshotInfo(db, snapshotInfoCfh, snapshotInfoTableKey);
    if (snapshotInfo == null) {
      System.err.println(snapshotName + " does not exist for given bucketUri: " + OM_KEY_PREFIX +
          bucketUri.getValue().getVolumeName() + OM_KEY_PREFIX + bucketUri.getValue().getBucketName());
      return;
    }

    // IDs of all snapshots present in the table. They are used to verify that the provided global previous and
    // path previous snapshots exist, so that after the update the snapshot does not point to a ghost snapshot.
    Set<UUID> snapshotIdSet = getSnapshotIdSet(db, snapshotInfoCfh);

    if (Objects.equals(snapshotInfo.getSnapshotId(), globalPreviousSnapshotId)) {
      System.err.println("globalPreviousSnapshotId: '" + globalPreviousSnapshotId +
          "' is equal to given snapshot's ID: '" + snapshotInfo.getSnapshotId() + "'.");
      return;
    }

    if (Objects.equals(snapshotInfo.getSnapshotId(), pathPreviousSnapshotId)) {
      System.err.println("pathPreviousSnapshotId: '" + pathPreviousSnapshotId +
          "' is equal to given snapshot's ID: '" + snapshotInfo.getSnapshotId() + "'.");
      return;
    }

    if (!snapshotIdSet.contains(globalPreviousSnapshotId)) {
      System.err.println("globalPreviousSnapshotId: '" + globalPreviousSnapshotId +
          "' does not exist in snapshotInfoTable.");
      return;
    }

    if (!snapshotIdSet.contains(pathPreviousSnapshotId)) {
      System.err.println("pathPreviousSnapshotId: '" + pathPreviousSnapshotId +
          "' does not exist in snapshotInfoTable.");
      return;
    }

    snapshotInfo.setGlobalPreviousSnapshotId(globalPreviousSnapshotId);
    snapshotInfo.setPathPreviousSnapshotId(pathPreviousSnapshotId);

    if (dryRun) {
      System.out.println("SnapshotInfo would be updated to : " + snapshotInfo);
    } else {
      byte[] snapshotInfoBytes = SnapshotInfo.getCodec().toPersistedFormat(snapshotInfo);
      db.get()
          .put(snapshotInfoCfh, StringCodec.get().toPersistedFormat(snapshotInfoTableKey), snapshotInfoBytes);

      // Read the entry back so that the printed value is what is actually stored.
      System.out.println("Snapshot Info is updated to : " +
          getSnapshotInfo(db, snapshotInfoCfh, snapshotInfoTableKey));
    }
  }

  /**
   * Collects the snapshot IDs of all entries in the snapshot info column
   * family.
   *
   * @param db open DB to iterate
   * @param snapshotInfoCfh handle of the snapshot info column family
   * @return IDs of every snapshot found in the column family
   * @throws IOException if an entry cannot be deserialized
   */
  private Set<UUID> getSnapshotIdSet(ManagedRocksDB db, ColumnFamilyHandle snapshotInfoCfh)
      throws IOException {
    Set<UUID> snapshotIdSet = new HashSet<>();
    try (ManagedRocksIterator iterator = new ManagedRocksIterator(db.get().newIterator(snapshotInfoCfh))) {
      iterator.get().seekToFirst();

      while (iterator.get().isValid()) {
        SnapshotInfo snapshotInfo = SnapshotInfo.getCodec().fromPersistedFormat(iterator.get().value());
        snapshotIdSet.add(snapshotInfo.getSnapshotId());
        iterator.get().next();
      }
    }
    return snapshotIdSet;
  }

  /**
   * Finds the handle whose name equals {@code SNAPSHOT_INFO_TABLE}.
   *
   * @param cfHandleList handles of all column families of the opened DB
   * @return the matching handle, or {@code null} if there is none
   * @throws RocksDBException if the name of a handle cannot be read
   */
  private ColumnFamilyHandle getSnapshotInfoCfh(List<ColumnFamilyHandle> cfHandleList) throws RocksDBException {
    byte[] nameBytes = SNAPSHOT_INFO_TABLE.getBytes(StandardCharsets.UTF_8);

    for (ColumnFamilyHandle cf : cfHandleList) {
      if (Arrays.equals(cf.getName(), nameBytes)) {
        return cf;
      }
    }

    return null;
  }

  /**
   * Reads and deserializes the entry stored under the given table key.
   *
   * @param db open DB to read from
   * @param snapshotInfoCfh handle of the snapshot info column family
   * @param snapshotInfoLKey table key of the snapshot
   * @return the stored {@link SnapshotInfo}, or {@code null} if the key is absent
   * @throws IOException if the stored bytes cannot be deserialized
   * @throws RocksDBException if the read fails
   */
  private SnapshotInfo getSnapshotInfo(ManagedRocksDB db, ColumnFamilyHandle snapshotInfoCfh, String snapshotInfoLKey)
      throws IOException, RocksDBException {
    byte[] bytes = db.get().get(snapshotInfoCfh, StringCodec.get().toPersistedFormat(snapshotInfoLKey));
    return bytes != null ? SnapshotInfo.getCodec().fromPersistedFormat(bytes) : null;
  }

  /**
   * @return the command this one is registered under, {@link RDBRepair}
   */
  @Override
  public Class<?> getParentType() {
    return RDBRepair.class;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* OM related repair tools.
*/
package org.apache.hadoop.ozone.repair.om;
Loading

0 comments on commit b8a01fb

Please sign in to comment.