Skip to content

Commit bfb8f28

Browse files
author
Inigo Goiri
committed
HDFS-14775. Add Timestamp for longest FSN write/read lock held log. Contributed by Chen Zhang.
1 parent d17ba85 commit bfb8f28

File tree

2 files changed

+52
-45
lines changed

2 files changed

+52
-45
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java

Lines changed: 43 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@
2626
import java.util.concurrent.locks.ReentrantReadWriteLock;
2727

2828
import com.google.common.annotations.VisibleForTesting;
29-
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
3029
import org.apache.hadoop.conf.Configuration;
3130
import org.apache.hadoop.ipc.Server;
3231
import org.apache.hadoop.log.LogThrottlingHelper;
3332
import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
3433
import org.apache.hadoop.util.StringUtils;
34+
import org.apache.hadoop.util.Time;
3535
import org.apache.hadoop.util.Timer;
3636

3737
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT;
@@ -105,14 +105,9 @@ public Long initialValue() {
105105
* The info (lock held time and stack trace) when longest time (ms) a read
106106
* lock was held since the last report.
107107
*/
108-
private final AtomicReference<ReadLockHeldInfo> longestReadLockHeldInfo =
109-
new AtomicReference<>(new ReadLockHeldInfo(0, null));
110-
111-
/**
112-
* The stack trace when longest time of the write lock
113-
* was held since the last report.
114-
*/
115-
private volatile String longestWriteLockHeldStackTrace;
108+
private final AtomicReference<LockHeldInfo> longestReadLockHeldInfo =
109+
new AtomicReference<>(new LockHeldInfo(0, 0, null));
110+
private LockHeldInfo longestWriteLockHeldInfo = new LockHeldInfo(0, 0, null);
116111

117112
@VisibleForTesting
118113
static final String OP_NAME_OTHER = "OTHER";
@@ -169,8 +164,9 @@ public void readUnlock() {
169164

170165
public void readUnlock(String opName) {
171166
final boolean needReport = coarseLock.getReadHoldCount() == 1;
167+
final long currentTimeStampNanos = timer.monotonicNowNanos();
172168
final long readLockIntervalNanos =
173-
timer.monotonicNowNanos() - readLockHeldTimeStampNanos.get();
169+
currentTimeStampNanos - readLockHeldTimeStampNanos.get();
174170
coarseLock.readLock().unlock();
175171

176172
if (needReport) {
@@ -179,13 +175,15 @@ public void readUnlock(String opName) {
179175
}
180176
final long readLockIntervalMs =
181177
TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
178+
final long currentTimeMs =
179+
TimeUnit.NANOSECONDS.toMillis(currentTimeStampNanos);
182180
if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
183-
ReadLockHeldInfo localLockHeldInfo;
181+
LockHeldInfo localLockHeldInfo;
184182
do {
185183
localLockHeldInfo = longestReadLockHeldInfo.get();
186184
} while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 &&
187185
!longestReadLockHeldInfo.compareAndSet(localLockHeldInfo,
188-
new ReadLockHeldInfo(readLockIntervalMs,
186+
new LockHeldInfo(currentTimeMs, readLockIntervalMs,
189187
StringUtils.getStackTrace(Thread.currentThread()))));
190188

191189
long localTimeStampOfLastReadLockReport;
@@ -202,13 +200,13 @@ public void readUnlock(String opName) {
202200
} while (!timeStampOfLastReadLockReportMs.compareAndSet(
203201
localTimeStampOfLastReadLockReport, nowMs));
204202
int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
205-
ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo
206-
.getAndSet(new ReadLockHeldInfo(0, null));
203+
LockHeldInfo lockHeldInfo =
204+
longestReadLockHeldInfo.getAndSet(new LockHeldInfo(0, 0, null));
207205
FSNamesystem.LOG.info(
208-
"\tNumber of suppressed read-lock reports: {}" +
209-
"\n\tLongest read-lock held interval: {}ms via {}",
210-
numSuppressedWarnings, lockHeldInfo.getIntervalMs(),
211-
lockHeldInfo.getStackTrace());
206+
"\tNumber of suppressed read-lock reports: {}"
207+
+ "\n\tLongest read-lock held at {} for {}ms via {}",
208+
numSuppressedWarnings, Time.formatTime(lockHeldInfo.getStartTimeMs()),
209+
lockHeldInfo.getIntervalMs(), lockHeldInfo.getStackTrace());
212210
}
213211
}
214212

@@ -258,18 +256,21 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
258256
LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
259257
if (needReport &&
260258
writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
261-
SummaryStatistics currentStats =
262-
writeLockReportLogger.getCurrentStats("write", 0);
263-
double currentMaxTime = currentStats != null ? currentStats.getMax() : 0;
264-
if (currentMaxTime < writeLockIntervalMs) {
265-
longestWriteLockHeldStackTrace =
266-
StringUtils.getStackTrace(Thread.currentThread());
259+
if (longestWriteLockHeldInfo.getIntervalMs() < writeLockIntervalMs) {
260+
longestWriteLockHeldInfo =
261+
new LockHeldInfo(currentTimeMs, writeLockIntervalMs,
262+
StringUtils.getStackTrace(Thread.currentThread()));
267263
}
268264

269265
logAction = writeLockReportLogger
270266
.record("write", currentTimeMs, writeLockIntervalMs);
271267
}
272268

269+
LockHeldInfo lockHeldInfo = longestWriteLockHeldInfo;
270+
if (logAction.shouldLog()) {
271+
longestWriteLockHeldInfo = new LockHeldInfo(0, 0, null);
272+
}
273+
273274
coarseLock.writeLock().unlock();
274275

275276
if (needReport) {
@@ -278,12 +279,13 @@ public void writeUnlock(String opName, boolean suppressWriteLockReport) {
278279

279280
if (logAction.shouldLog()) {
280281
FSNamesystem.LOG.info(
281-
"\tNumber of suppressed write-lock reports: {}" +
282-
"\n\tLongest write-lock held interval: {}ms via {} " +
283-
"\n\tTotal suppressed write-lock held time: {}",
284-
logAction.getCount() - 1, logAction.getStats(0).getMax(),
285-
longestWriteLockHeldStackTrace,
286-
logAction.getStats(0).getSum() - writeLockIntervalMs);
282+
"\tNumber of suppressed write-lock reports: {}"
283+
+ "\n\tLongest write-lock held at {} for {}ms via {}"
284+
+ "\n\tTotal suppressed write-lock held time: {}",
285+
logAction.getCount() - 1,
286+
Time.formatTime(lockHeldInfo.getStartTimeMs()),
287+
lockHeldInfo.getIntervalMs(), lockHeldInfo.getStackTrace(),
288+
logAction.getStats(0).getSum() - lockHeldInfo.getIntervalMs());
287289
}
288290
}
289291

@@ -382,34 +384,30 @@ private static String getMetricName(String operationName, boolean isWrite) {
382384
/**
383385
* Read lock Held Info.
384386
*/
385-
private static class ReadLockHeldInfo {
386-
/** Read lock held time. */
387+
private static class LockHeldInfo {
388+
/** Lock held start time. */
389+
private Long startTimeMs;
390+
/** Lock held time. */
387391
private Long intervalMs;
388-
/** The stack trace read lock was held. */
392+
/** The stack trace lock was held. */
389393
private String stackTrace;
390394

391-
ReadLockHeldInfo(long intervalMs, String stackTrace) {
395+
LockHeldInfo(long startTimeMs, long intervalMs, String stackTrace) {
396+
this.startTimeMs = startTimeMs;
392397
this.intervalMs = intervalMs;
393398
this.stackTrace = stackTrace;
394399
}
395400

401+
public Long getStartTimeMs() {
402+
return this.startTimeMs;
403+
}
404+
396405
public Long getIntervalMs() {
397406
return this.intervalMs;
398407
}
399408

400409
public String getStackTrace() {
401410
return this.stackTrace;
402411
}
403-
404-
@Override
405-
public int hashCode() {
406-
return this.intervalMs.hashCode();
407-
}
408-
409-
@Override
410-
public boolean equals(Object obj) {
411-
return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj)
412-
.getIntervalMs().compareTo(intervalMs) == 0;
413-
}
414412
}
415413
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
2929
import org.apache.hadoop.test.MetricsAsserts;
3030
import org.apache.hadoop.util.FakeTimer;
31+
import org.apache.hadoop.util.Time;
3132
import org.junit.Test;
3233

3334
import java.io.IOException;
@@ -207,6 +208,10 @@ public void testFSWriteLockLongHoldingReport() throws Exception {
207208
// find the held interval time in the log
208209
Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
209210
assertTrue(pattern.matcher(logs.getOutput()).find());
211+
// only keep the "yyyy-MM-dd" part of date
212+
String startTimeStr =
213+
"held at " + Time.formatTime(timer.monotonicNow()).substring(0, 10);
214+
assertTrue(logs.getOutput().contains(startTimeStr));
210215
assertTrue(logs.getOutput().contains(
211216
"Number of suppressed write-lock reports: 2"));
212217
}
@@ -294,6 +299,10 @@ public void run() {
294299
Pattern tLongPattern = Pattern.compile(
295300
String.format(stackTracePatternString, tLong.getClass().getName()));
296301
assertTrue(tLongPattern.matcher(logs.getOutput()).find());
302+
// only keep the "yyyy-MM-dd" part of date
303+
String startTimeStr =
304+
"held at " + Time.formatTime(timer.monotonicNow()).substring(0, 10);
305+
assertTrue(logs.getOutput().contains(startTimeStr));
297306
assertTrue(logs.getOutput().contains(
298307
"Number of suppressed read-lock reports: 3"));
299308

0 commit comments

Comments
 (0)