Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-28241 The snapshot operation encountered an NPE and failed. #5560

Merged
merged 2 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
private boolean prepareMergeRegion(final MasterProcedureEnv env) throws IOException {
// Fail if we are taking snapshot for the given table
TableName tn = regionsToMerge[0].getTable();
if (env.getMasterServices().getSnapshotManager().isTakingSnapshot(tn)) {
if (env.getMasterServices().getSnapshotManager().isTableTakingAnySnapshot(tn)) {
frostruan marked this conversation as resolved.
Show resolved Hide resolved
throw new MergeRegionException("Skip merging regions "
+ RegionInfo.getShortNameToLog(regionsToMerge) + ", because we are snapshotting " + tn);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,8 @@ private byte[] getSplitRow() {
public boolean prepareSplitRegion(final MasterProcedureEnv env) throws IOException {
// Fail if we are taking snapshot for the given table
if (
env.getMasterServices().getSnapshotManager().isTakingSnapshot(getParentRegion().getTable())
env.getMasterServices().getSnapshotManager()
.isTableTakingAnySnapshot(getParentRegion().getTable())
) {
setFailure(new IOException("Skip splitting region " + getParentRegion().getShortNameToLog()
+ ", because we are taking snapshot for the table " + getParentRegion().getTable()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.master.assignment;

import static org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility.assertProcFailed;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

Expand All @@ -33,16 +34,20 @@
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.SnapshotDescription;
import org.apache.hadoop.hbase.client.SnapshotType;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
Expand All @@ -59,6 +64,9 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;

@Category({ MasterTests.class, LargeTests.class })
public class TestMergeTableRegionsProcedure {

Expand Down Expand Up @@ -347,6 +355,42 @@ public void testMergeWithoutPONR() throws Exception {
assertRegionCount(tableName, initialRegionCount - 1);
}

@Test
public void testMergingRegionWhileTakingSnapshot() throws Exception {
  final TableName tableName = TableName.valueOf("testMergingRegionWhileTakingSnapshot");
  final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();

  List<RegionInfo> tableRegions = createTable(tableName);

  ProcedureTestingUtility.waitNoProcedureRunning(procExec);

  // Submit and register a snapshot procedure that deliberately stalls in the
  // SNAPSHOT_SNAPSHOT_ONLINE_REGIONS state, so the merge below is attempted
  // while the table is still being snapshotted.
  SnapshotDescription snapshot =
    new SnapshotDescription("SnapshotProcedureTest", tableName, SnapshotType.FLUSH);
  SnapshotProtos.SnapshotDescription snapshotProto =
    ProtobufUtil.createHBaseProtosSnapshotDesc(snapshot);
  snapshotProto = SnapshotDescriptionUtils.validate(snapshotProto,
    UTIL.getHBaseCluster().getMaster().getConfiguration());
  long snapshotProcId = procExec.submitProcedure(
    new TestSnapshotProcedure.DelaySnapshotProcedure(procExec.getEnvironment(), snapshotProto));
  UTIL.getHBaseCluster().getMaster().getSnapshotManager().registerSnapshotProcedure(snapshotProto,
    snapshotProcId);

  RegionInfo[] regionsToMerge = new RegionInfo[2];
  regionsToMerge[0] = tableRegions.get(0);
  regionsToMerge[1] = tableRegions.get(1);

  long mergeProcId = procExec.submitProcedure(
    new MergeTableRegionsProcedure(procExec.getEnvironment(), regionsToMerge, true));

  // procExec IS the master's procedure executor; reuse the same handle for both
  // waits instead of re-fetching it from the cluster.
  ProcedureTestingUtility.waitProcedure(procExec, mergeProcId);
  ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId);

  // The merge must be rejected because a snapshot of the table is in flight...
  assertProcFailed(procExec, mergeProcId);
  // ...while the snapshot itself must still complete successfully (mirrors the
  // assertion in TestSplitTableRegionProcedure#testSplitRegionWhileTakingSnapshot).
  ProcedureTestingUtility.assertProcNotFailed(procExec, snapshotProcId);
  // Region count unchanged: the merge never happened.
  assertEquals(initialRegionCount, UTIL.getAdmin().getRegions(tableName).size());
}

private List<RegionInfo> createTable(final TableName tableName) throws Exception {
TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.SnapshotDescription;
import org.apache.hadoop.hbase.client.SnapshotType;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
Expand All @@ -50,10 +52,12 @@
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
Expand All @@ -69,6 +73,9 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;

@Category({ MasterTests.class, MediumTests.class })
public class TestSplitTableRegionProcedure {

Expand Down Expand Up @@ -550,6 +557,52 @@ public void testSplitWithoutPONR() throws Exception {
verify(tableName, splitRowNum);
}

@Test
public void testSplitRegionWhileTakingSnapshot() throws Exception {
  final TableName tableName = TableName.valueOf(name.getMethodName());
  final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();

  RegionInfo[] regions = MasterProcedureTestingUtility.createTable(procExec, tableName, null,
    columnFamilyName1, columnFamilyName2);
  int splitRowNum = startRowNum + rowCount / 2;
  byte[] splitKey = Bytes.toBytes("" + splitRowNum);

  assertTrue("not able to find a splittable region", regions != null);
  // assertEquals gives the actual region count on failure, unlike assertTrue on a boolean.
  assertEquals("not able to find a splittable region", 1, regions.length);
  ProcedureTestingUtility.waitNoProcedureRunning(procExec);

  // Take a snapshot that deliberately stalls in SNAPSHOT_SNAPSHOT_ONLINE_REGIONS,
  // so the split below runs while the table is still being snapshotted.
  SnapshotDescription snapshot =
    new SnapshotDescription("SnapshotProcedureTest", tableName, SnapshotType.FLUSH);
  SnapshotProtos.SnapshotDescription snapshotProto =
    ProtobufUtil.createHBaseProtosSnapshotDesc(snapshot);
  snapshotProto = SnapshotDescriptionUtils.validate(snapshotProto,
    UTIL.getHBaseCluster().getMaster().getConfiguration());
  long snapshotProcId = procExec.submitProcedure(
    new TestSnapshotProcedure.DelaySnapshotProcedure(procExec.getEnvironment(), snapshotProto));
  UTIL.getHBaseCluster().getMaster().getSnapshotManager().registerSnapshotProcedure(snapshotProto,
    snapshotProcId);

  // collect AM metrics before test
  collectAssignmentManagerMetrics();

  // Split region of the table
  long procId = procExec.submitProcedure(
    new SplitTableRegionProcedure(procExec.getEnvironment(), regions[0], splitKey));
  // Wait the completion
  ProcedureTestingUtility.waitProcedure(procExec, procId);
  ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId);

  // The split must fail because the table is being snapshotted, while the
  // snapshot itself must complete successfully.
  ProcedureTestingUtility.assertProcFailed(procExec, procId);
  ProcedureTestingUtility.assertProcNotFailed(procExec, snapshotProcId);

  // Region was not split and no data was written to the (empty) table.
  assertEquals(1, UTIL.getMiniHBaseCluster().getRegions(tableName).size());
  assertEquals(0, UTIL.countRows(tableName));

  assertEquals(splitSubmittedCount + 1, splitProcMetrics.getSubmittedCounter().getCount());
  assertEquals(splitFailedCount + 1, splitProcMetrics.getFailedCounter().getCount());
}

private void deleteData(final TableName tableName, final int startDeleteRowNum)
throws IOException, InterruptedException {
Table t = UTIL.getConnection().getTable(tableName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
*/
package org.apache.hadoop.hbase.master.procedure;

import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SnapshotState.SNAPSHOT_SNAPSHOT_ONLINE_REGIONS;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
Expand Down Expand Up @@ -51,6 +53,7 @@
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SnapshotState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;

Expand All @@ -70,6 +73,27 @@ public class TestSnapshotProcedure {
protected SnapshotDescription snapshot;
protected SnapshotProtos.SnapshotDescription snapshotProto;

/**
 * A {@link SnapshotProcedure} that stalls for 20 seconds after executing the
 * SNAPSHOT_SNAPSHOT_ONLINE_REGIONS state, giving concurrent procedures (split/merge)
 * a window in which the snapshot is still considered in progress.
 */
public static final class DelaySnapshotProcedure extends SnapshotProcedure {
  /** No-arg constructor required by the procedure framework for deserialization. */
  public DelaySnapshotProcedure() {
  }

  public DelaySnapshotProcedure(final MasterProcedureEnv env,
    final SnapshotProtos.SnapshotDescription snapshot) {
    super(env, snapshot);
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env,
    MasterProcedureProtos.SnapshotState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    // Run the real state handler first, then inject the delay before moving on.
    final Flow next = super.executeFromState(env, state);
    if (state == SNAPSHOT_SNAPSHOT_ONLINE_REGIONS) {
      TimeUnit.SECONDS.sleep(20);
    }
    return next;
  }
}

@Before
public void setup() throws Exception {
TEST_UTIL = new HBaseTestingUtil();
Expand Down