Skip to content

Commit

Permalink
HBASE-24632 Enable procedure-based log splitting as default in hbase3…
Browse files Browse the repository at this point in the history
… Add deprecation of 'classic' zk-based WAL splitter. (#2156)

Also fix three bugs:

 * We were trying to delete non-empty directory; weren't doing
 accounting for meta WALs where meta had moved off the server
 (successfully)
 * We were deleting split WALs rather than archiving them.
 * We were not handling corrupt files.

Deprecations and removal of tests of old system.

Signed-off-by: Anoop Sam John <anoopsamjohn@apache.org>
  • Loading branch information
saintstack authored Jul 29, 2020
1 parent 56f32ea commit 345b77a
Show file tree
Hide file tree
Showing 24 changed files with 393 additions and 464 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1447,9 +1447,18 @@ public enum OperationStatusCode {
public static final String HBASE_CLIENT_FAST_FAIL_INTERCEPTOR_IMPL =
"hbase.client.fast.fail.interceptor.impl";

/**
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter; see SplitWALManager.
*/
@Deprecated
public static final String HBASE_SPLIT_WAL_COORDINATED_BY_ZK = "hbase.split.wal.zk.coordinated";

public static final boolean DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK = true;
/**
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0.
*/
@Deprecated
public static final boolean DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK = false;

public static final String HBASE_SPLIT_WAL_MAX_SPLITTER = "hbase.regionserver.wal.max.splitters";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@
/**
* Counters kept by the distributed WAL split log process.
* Used by master and regionserver packages.
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter, see SplitWALManager
*/
@Deprecated
@InterfaceAudience.Private
public class SplitLogCounters {
private SplitLogCounters() {}

// SplitLogManager counters
public final static LongAdder tot_mgr_log_split_batch_start = new LongAdder();
public final static LongAdder tot_mgr_log_split_batch_success = new LongAdder();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
* Encapsulates protobuf serialization/deserialization so we don't leak generated pb outside of
* this class. Used by regionserver and master packages.
* <p>Immutable
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter, see SplitWALManager
*/
@Deprecated
@InterfaceAudience.Private
public class SplitLogTask {
private final ServerName originServer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@
* Methods required for task life cycle: <BR>
* {@link #checkTaskStillAvailable(String)} Check that task is still there <BR>
* {@link #checkTasks()} check for unassigned tasks and resubmit them
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter, see SplitWALManager
*/
@InterfaceAudience.Private
@Deprecated
public interface SplitLogManagerCoordination {
/**
* Detail class that shares data between coordination and split log manager
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/**
*
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand All @@ -18,16 +17,14 @@
*/
package org.apache.hadoop.hbase.coordination;
import java.util.concurrent.atomic.LongAdder;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.SplitLogTask;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
import org.apache.hadoop.hbase.regionserver.SplitLogWorker.TaskExecutor;

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
Expand All @@ -44,7 +41,10 @@
* <p>
* Important methods for WALSplitterHandler: <BR>
* splitting task has completed.
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter, see SplitWALManager
*/
@Deprecated
@InterfaceAudience.Private
public interface SplitLogWorkerCoordination {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@

/**
* ZooKeeper-based implementation of {@link org.apache.hadoop.hbase.CoordinatedStateManager}.
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter (see SplitWALManager) which doesn't use this zk-based coordinator.
*/
@Deprecated
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
public class ZkCoordinatedStateManager implements CoordinatedStateManager {
protected ZKWatcher watcher;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
import org.apache.hadoop.hbase.ipc.RpcServerFactory;
import org.apache.hadoop.hbase.ipc.RpcServerInterface;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.ipc.ServerRpcController;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.master.locking.LockProcedure;
Expand Down Expand Up @@ -2436,6 +2437,12 @@ public ClearDeadServersResponse clearDeadServers(RpcController controller,
@Override
public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
ReportProcedureDoneRequest request) throws ServiceException {
// Check Master is up and ready for duty before progressing. Remote side will keep trying.
try {
this.master.checkServiceStarted();
} catch (ServerNotRunningYetException snrye) {
throw new ServiceException(snrye);
}
request.getResultList().forEach(result -> {
if (result.getStatus() == RemoteProcedureResult.Status.SUCCESS) {
master.remoteProcedureCompleted(result.getProcId());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand Down Expand Up @@ -44,7 +43,6 @@
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
Expand All @@ -55,13 +53,19 @@
public class MasterWalManager {
private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);

/**
 * Path filter that accepts only WAL files belonging to the hbase:meta Region; the decision is
 * delegated to {@link AbstractFSWALProvider#isMetaFile(Path)}.
 */
static final PathFilter META_FILTER = walPath -> AbstractFSWALProvider.isMetaFile(walPath);

/**
* Filter *out* WAL files that are for the hbase:meta Region; i.e. return user-space WALs only.
*/
@VisibleForTesting
public final static PathFilter NON_META_FILTER = new PathFilter() {
@Override
Expand All @@ -81,10 +85,19 @@ public boolean accept(Path p) {

// The Path to the old logs dir
private final Path oldLogDir;

private final Path rootDir;

// create the split log lock
private final Lock splitLogLock = new ReentrantLock();

/**
* Superseded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than
* 'classic' zk-coordinated WAL splitting.
* @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}.
* @see SplitWALManager
*/
@Deprecated
private final SplitLogManager splitLogManager;

// Is the filesystem ok?
Expand All @@ -102,7 +115,6 @@ public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterS
this.rootDir = rootDir;
this.services = services;
this.splitLogManager = new SplitLogManager(services, conf);

this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
}

Expand Down Expand Up @@ -204,7 +216,7 @@ public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
*/
@Deprecated
public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
boolean retrySplitting = !conf.getBoolean(WALSplitter.SPLIT_SKIP_ERRORS_KEY,
WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);

Set<ServerName> serverNames = new HashSet<>();
Expand Down Expand Up @@ -361,11 +373,13 @@ public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throw
}

/**
* For meta region open and closed normally on a server, it may leave some meta
* WAL in the server's wal dir. Since meta region is no long on this server,
* The SCP won't split those meta wals, just leaving them there. So deleting
* the wal dir will fail since the dir is not empty. Actually We can safely achive those
* meta log and Archiving the meta log and delete the dir.
* The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere.
* On CLOSE, the WAL for the hbase:meta table may not be archived yet (the WAL is only needed if
* hbase:meta did not close cleanly). Since the meta region is no longer on this server,
* the ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in
* the WAL splitting dir. If we try to delete the WAL splitting dir for the server, it fails
* since the dir is not totally empty. We can safely archive these hbase:meta logs; then the
* WAL dir can be deleted.
* @param serverName the server to archive meta log
*/
public void archiveMetaLog(final ServerName serverName) {
Expand Down Expand Up @@ -396,6 +410,4 @@ public void archiveMetaLog(final ServerName serverName) {
LOG.warn("Failed archiving meta log for server " + serverName, ie);
}
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ public long getNumWALFiles() {

@Override
public Map<String,Entry<Long,Long>> getTableSpaceUtilization() {
if (master == null) {
return Collections.emptyMap();
}
QuotaObserverChore quotaChore = master.getQuotaObserverChore();
if (quotaChore == null) {
return Collections.emptyMap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,10 @@
* again. If a task is resubmitted then there is a risk that old "delete task"
* can delete the re-submission.
* @see SplitWALManager for an alternate implementation based on Procedures.
* @deprecated since 2.4.0 and in 3.0.0, to be removed in 4.0.0, replaced by procedure-based
* distributed WAL splitter, see SplitWALManager.
*/
@Deprecated
@InterfaceAudience.Private
public class SplitLogManager {
private static final Logger LOG = LoggerFactory.getLogger(SplitLogManager.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand All @@ -16,35 +16,36 @@
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;

import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_SPLIT_WAL_MAX_SPLITTER;
import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_MAX_SPLITTER;
import static org.apache.hadoop.hbase.master.MasterWalManager.META_FILTER;
import static org.apache.hadoop.hbase.master.MasterWalManager.NON_META_FILTER;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.SplitWALProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WALSplitUtil;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;

Expand Down Expand Up @@ -78,15 +79,17 @@ public class SplitWALManager {
private final Path rootDir;
private final FileSystem fs;
private final Configuration conf;
private final Path walArchiveDir;

public SplitWALManager(MasterServices master) {
public SplitWALManager(MasterServices master) throws IOException {
this.master = master;
this.conf = master.getConfiguration();
this.splitWorkerAssigner = new SplitWorkerAssigner(this.master,
conf.getInt(HBASE_SPLIT_WAL_MAX_SPLITTER, DEFAULT_HBASE_SPLIT_WAL_MAX_SPLITTER));
this.rootDir = master.getMasterFileSystem().getWALRootDir();
// TODO: This should be the WAL FS, not the Master FS?
this.fs = master.getMasterFileSystem().getFileSystem();

this.walArchiveDir = new Path(this.rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
}

public List<Procedure> splitWALs(ServerName crashedServer, boolean splitMeta)
Expand Down Expand Up @@ -117,14 +120,24 @@ private Path getWALSplitDir(ServerName serverName) {
return logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
}

public void deleteSplitWAL(String wal) throws IOException {
fs.delete(new Path(wal), false);
/**
* Archive processed WAL
*/
public void archive(String wal) throws IOException {
WALSplitUtil.moveWAL(this.fs, new Path(wal), this.walArchiveDir);
}

public void deleteWALDir(ServerName serverName) throws IOException {
Path splitDir = getWALSplitDir(serverName);
if (!fs.delete(splitDir, false)) {
LOG.warn("Failed delete {}", splitDir);
try {
if (!fs.delete(splitDir, false)) {
LOG.warn("Failed delete {}, contains {}", splitDir, fs.listFiles(splitDir, true));
}
} catch (PathIsNotEmptyDirectoryException e) {
FileStatus [] files = CommonFSUtils.listStatus(fs, splitDir);
LOG.warn("PathIsNotEmptyDirectoryException {}",
Arrays.stream(files).map(f -> f.getPath()).collect(Collectors.toList()));
throw e;
}
}

Expand Down Expand Up @@ -197,7 +210,11 @@ public SplitWorkerAssigner(MasterServices master, int maxSplitTasks) {
this.maxSplitTasks = maxSplitTasks;
this.master = master;
this.event = new ProcedureEvent<>("split-WAL-worker-assigning");
this.master.getServerManager().registerListener(this);
// ServerManager might be null in a test context where we are mocking; allow for this
ServerManager sm = this.master.getServerManager();
if (sm != null) {
sm.registerListener(this);
}
}

public synchronized Optional<ServerName> acquire() {
Expand Down
Loading

0 comments on commit 345b77a

Please sign in to comment.