Skip to content

Commit 134dcf1

Browse files
authored
YARN-11703. Validate accessibility of Node Manager working directories (#6903)
1 parent b4ddb2d commit 134dcf1

File tree

4 files changed

+140
-73
lines changed

4 files changed

+140
-73
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2157,16 +2157,19 @@ public static boolean isAclEnabled(Configuration conf) {
21572157
public static final String NM_MIN_PER_DISK_FREE_SPACE_MB =
21582158
NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
21592159

2160+
/**
2161+
 * By default, all of the disk can be used before it is marked as offline.
2162+
*/
2163+
public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
2164+
21602165
/**
21612166
* Enable/Disable the minimum disk free
21622167
* space threshold for disk health checker.
21632168
*/
21642169
public static final String NM_DISK_FREE_SPACE_THRESHOLD_ENABLED =
2165-
NM_DISK_HEALTH_CHECK_PREFIX +
2166-
"disk-free-space-threshold.enabled";
2170+
NM_DISK_HEALTH_CHECK_PREFIX + "disk-free-space-threshold.enabled";
21672171

2168-
public static final boolean
2169-
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
2172+
public static final boolean DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
21702173

21712174
/**
21722175
* The minimum space that must be available on an offline
@@ -2180,9 +2183,13 @@ public static boolean isAclEnabled(Configuration conf) {
21802183
NM_DISK_HEALTH_CHECK_PREFIX +
21812184
"min-free-space-per-disk-watermark-high-mb";
21822185
/**
2183-
* By default, all of the disk can be used before it is marked as offline.
2186+
 * Validate that the contents of the node manager directories can be accessed.
21842187
*/
2185-
public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
2188+
public static final String NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
2189+
NM_DISK_HEALTH_CHECK_PREFIX + "working-dir-content-accessibility-validation.enabled";
2190+
2191+
public static final boolean DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
2192+
true;
21862193

21872194
/** The health checker scripts. */
21882195
public static final String NM_HEALTH_CHECK_SCRIPTS =

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1995,6 +1995,12 @@
19951995
<value>true</value>
19961996
</property>
19971997

1998+
<property>
1999+
<description>Validate that the contents of the node manager directories can be accessed</description>
2000+
<name>yarn.nodemanager.disk-health-checker.working-dir-content-accessibility-validation.enabled</name>
2001+
<value>true</value>
2002+
</property>
2003+
19982004
<property>
19992005
<description>The maximum percentage of disk space utilization allowed after
20002006
which a disk is marked as bad. Values can range from 0.0 to 100.0.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java

Lines changed: 94 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -21,29 +21,36 @@
2121
import java.io.File;
2222
import java.io.FileNotFoundException;
2323
import java.io.IOException;
24+
import java.io.UncheckedIOException;
25+
import java.nio.file.Files;
2426
import java.util.ArrayList;
2527
import java.util.Arrays;
2628
import java.util.Collections;
2729
import java.util.HashMap;
2830
import java.util.HashSet;
2931
import java.util.List;
3032
import java.util.Map;
33+
import java.util.Objects;
3134
import java.util.Set;
3235
import java.util.concurrent.ConcurrentHashMap;
3336
import java.util.concurrent.locks.ReentrantReadWriteLock;
3437
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
3538
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
39+
import java.util.stream.Collectors;
40+
import java.util.stream.Stream;
41+
3642
import org.slf4j.Logger;
3743
import org.slf4j.LoggerFactory;
3844

3945
import org.apache.hadoop.classification.InterfaceStability;
46+
import org.apache.hadoop.conf.Configuration;
4047
import org.apache.hadoop.fs.FileAlreadyExistsException;
4148
import org.apache.hadoop.fs.FileContext;
4249
import org.apache.hadoop.fs.Path;
4350
import org.apache.hadoop.fs.permission.FsPermission;
51+
import org.apache.hadoop.util.DiskChecker;
4452
import org.apache.hadoop.util.DiskValidator;
4553
import org.apache.hadoop.util.DiskValidatorFactory;
46-
import org.apache.hadoop.conf.Configuration;
4754
import org.apache.hadoop.yarn.conf.YarnConfiguration;
4855
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
4956

@@ -62,6 +69,7 @@ public class DirectoryCollection {
6269

6370
private boolean diskUtilizationThresholdEnabled;
6471
private boolean diskFreeSpaceThresholdEnabled;
72+
private boolean subAccessibilityValidationEnabled;
6573
/**
6674
* The enum defines disk failure type.
6775
*/
@@ -242,16 +250,15 @@ public DirectoryCollection(String[] dirs,
242250
throw new YarnRuntimeException(e);
243251
}
244252

245-
diskUtilizationThresholdEnabled = conf.
246-
getBoolean(YarnConfiguration.
247-
NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
248-
YarnConfiguration.
249-
DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
250-
diskFreeSpaceThresholdEnabled = conf.
251-
getBoolean(YarnConfiguration.
252-
NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
253-
YarnConfiguration.
254-
DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
253+
diskUtilizationThresholdEnabled = conf.getBoolean(
254+
YarnConfiguration.NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
255+
YarnConfiguration.DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
256+
diskFreeSpaceThresholdEnabled = conf.getBoolean(
257+
YarnConfiguration.NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
258+
YarnConfiguration.DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
259+
subAccessibilityValidationEnabled = conf.getBoolean(
260+
YarnConfiguration.NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED,
261+
YarnConfiguration.DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED);
255262

256263
localDirs = new ArrayList<>(Arrays.asList(dirs));
257264
errorDirs = new ArrayList<>();
@@ -448,8 +455,7 @@ boolean checkDirs() {
448455

449456
// move testDirs out of any lock as it could wait for very long time in
450457
// case of busy IO
451-
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
452-
preCheckGoodDirs);
458+
Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs, preCheckGoodDirs);
453459

454460
this.writeLock.lock();
455461
try {
@@ -521,60 +527,89 @@ boolean checkDirs() {
521527
}
522528
}
523529

524-
Map<String, DiskErrorInformation> testDirs(List<String> dirs,
525-
Set<String> goodDirs) {
526-
HashMap<String, DiskErrorInformation> ret =
527-
new HashMap<String, DiskErrorInformation>();
528-
for (final String dir : dirs) {
529-
String msg;
530-
try {
531-
File testDir = new File(dir);
532-
diskValidator.checkStatus(testDir);
533-
float diskUtilizationPercentageCutoff = goodDirs.contains(dir) ?
534-
diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
535-
long diskFreeSpaceCutoff = goodDirs.contains(dir) ?
536-
diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
537-
538-
if (diskUtilizationThresholdEnabled
539-
&& isDiskUsageOverPercentageLimit(testDir,
540-
diskUtilizationPercentageCutoff)) {
541-
msg =
542-
"used space above threshold of "
543-
+ diskUtilizationPercentageCutoff
544-
+ "%";
545-
ret.put(dir,
546-
new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
547-
continue;
548-
} else if (diskFreeSpaceThresholdEnabled
549-
&& isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) {
550-
msg =
551-
"free space below limit of " + diskFreeSpaceCutoff
552-
+ "MB";
553-
ret.put(dir,
554-
new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
555-
continue;
556-
}
557-
} catch (IOException ie) {
558-
ret.put(dir,
559-
new DiskErrorInformation(DiskErrorCause.OTHER, ie.getMessage()));
560-
}
530+
Map<String, DiskErrorInformation> testDirs(List<String> dirs, Set<String> goodDirs) {
531+
final Map<String, DiskErrorInformation> ret = new HashMap<>(0);
532+
for (String dir : dirs) {
533+
LOG.debug("Start testing dir accessibility: {}", dir);
534+
File testDir = new File(dir);
535+
boolean goodDir = goodDirs.contains(dir);
536+
Stream.of(
537+
validateDisk(testDir),
538+
validateUsageOverPercentageLimit(testDir, goodDir),
539+
validateDiskFreeSpaceUnderLimit(testDir, goodDir),
540+
validateSubsAccessibility(testDir)
541+
)
542+
.filter(Objects::nonNull)
543+
.findFirst()
544+
.ifPresent(diskErrorInformation -> ret.put(dir, diskErrorInformation));
561545
}
562546
return ret;
563547
}
564548

565-
private boolean isDiskUsageOverPercentageLimit(File dir,
566-
float diskUtilizationPercentageCutoff) {
567-
float freePercentage =
568-
100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
549+
private DiskErrorInformation validateDisk(File dir) {
550+
try {
551+
diskValidator.checkStatus(dir);
552+
LOG.debug("Dir {} passed the disk validation", dir);
553+
return null;
554+
} catch (IOException | UncheckedIOException | SecurityException e) {
555+
return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
556+
}
557+
}
558+
559+
private DiskErrorInformation validateUsageOverPercentageLimit(File dir, boolean isGoodDir) {
560+
if (!diskUtilizationThresholdEnabled) {
561+
return null;
562+
}
563+
float diskUtilizationPercentageCutoff = isGoodDir
564+
? diskUtilizationPercentageCutoffHigh
565+
: diskUtilizationPercentageCutoffLow;
566+
float freePercentage = 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
569567
float usedPercentage = 100.0F - freePercentage;
570-
return (usedPercentage > diskUtilizationPercentageCutoff
571-
|| usedPercentage >= 100.0F);
568+
if (usedPercentage > diskUtilizationPercentageCutoff || usedPercentage >= 100.0F) {
569+
return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
570+
"used space above threshold of " + diskUtilizationPercentageCutoff + "%");
571+
} else {
572+
LOG.debug("Dir {} passed the usage over percentage validation", dir);
573+
return null;
574+
}
572575
}
573576

574-
private boolean isDiskFreeSpaceUnderLimit(File dir,
575-
long freeSpaceCutoff) {
577+
private DiskErrorInformation validateDiskFreeSpaceUnderLimit(File dir, boolean isGoodDir) {
578+
if (!diskFreeSpaceThresholdEnabled) {
579+
return null;
580+
}
581+
long freeSpaceCutoff = isGoodDir ? diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
576582
long freeSpace = dir.getUsableSpace() / (1024 * 1024);
577-
return freeSpace < freeSpaceCutoff;
583+
if (freeSpace < freeSpaceCutoff) {
584+
return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
585+
"free space below limit of " + freeSpaceCutoff + "MB");
586+
} else {
587+
LOG.debug("Dir {} passed the free space validation", dir);
588+
return null;
589+
}
590+
}
591+
592+
private DiskErrorInformation validateSubsAccessibility(File dir) {
593+
if (!subAccessibilityValidationEnabled) {
594+
return null;
595+
}
596+
try (Stream<java.nio.file.Path> walk = Files.walk(dir.toPath())) {
597+
List<File> subs = walk
598+
.map(java.nio.file.Path::toFile)
599+
.collect(Collectors.toList());
600+
for (File sub : subs) {
601+
if (sub.isDirectory()) {
602+
DiskChecker.checkDir(sub);
603+
} else if (!Files.isReadable(sub.toPath())) {
604+
return new DiskErrorInformation(DiskErrorCause.OTHER, "Cannot read " + sub);
605+
} else {
606+
LOG.debug("{} under {} is accessible", sub, dir);
607+
}
608+
}
609+
} catch (IOException | UncheckedIOException | SecurityException e) {
610+
return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
611+
}
612+
return null;
578613
}
579614

580615
private void createDir(FileContext localFs, Path dir, FsPermission perm)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,17 @@
2020

2121
import java.io.File;
2222
import java.io.IOException;
23+
import java.nio.file.Files;
24+
import java.nio.file.attribute.PosixFilePermissions;
25+
import java.util.Collections;
2326
import java.util.List;
2427
import java.util.ListIterator;
28+
import java.util.Map;
29+
30+
import org.junit.After;
31+
import org.junit.Assert;
32+
import org.junit.Before;
33+
import org.junit.Test;
2534

2635
import org.apache.hadoop.conf.Configuration;
2736
import org.apache.hadoop.fs.CommonConfigurationKeys;
@@ -32,16 +41,11 @@
3241
import org.apache.hadoop.fs.permission.FsPermission;
3342
import org.apache.hadoop.yarn.conf.YarnConfiguration;
3443
import org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.DirsChangeListener;
35-
import org.junit.After;
36-
import org.junit.Assert;
37-
import org.junit.Before;
38-
import org.junit.Test;
3944

4045
public class TestDirectoryCollection {
4146

42-
private static final File testDir = new File("target",
43-
TestDirectoryCollection.class.getName()).getAbsoluteFile();
44-
private static final File testFile = new File(testDir, "testfile");
47+
private File testDir;
48+
private File testFile;
4549

4650
private Configuration conf;
4751
private FileContext localFs;
@@ -50,7 +54,8 @@ public class TestDirectoryCollection {
5054
public void setupForTests() throws IOException {
5155
conf = new Configuration();
5256
localFs = FileContext.getLocalFSFileContext(conf);
53-
testDir.mkdirs();
57+
testDir = Files.createTempDirectory(TestDirectoryCollection.class.getName()).toFile();
58+
testFile = new File(testDir, "testfile");
5459
testFile.createNewFile();
5560
}
5661

@@ -516,6 +521,20 @@ public void testDirsChangeListener() {
516521
Assert.assertEquals(listener3.num, 1);
517522
}
518523

524+
@Test
525+
public void testNonAccessibleSub() throws IOException {
526+
Files.setPosixFilePermissions(testDir.toPath(),
527+
PosixFilePermissions.fromString("rwx------"));
528+
Files.setPosixFilePermissions(testFile.toPath(),
529+
PosixFilePermissions.fromString("-w--w--w-"));
530+
DirectoryCollection dc = new DirectoryCollection(new String[]{testDir.toString()});
531+
Map<String, DirectoryCollection.DiskErrorInformation> diskErrorInformationMap =
532+
dc.testDirs(Collections.singletonList(testDir.toString()), Collections.emptySet());
533+
Assert.assertEquals(1, diskErrorInformationMap.size());
534+
Assert.assertTrue(diskErrorInformationMap.values().iterator().next()
535+
.message.contains(testFile.getName()));
536+
}
537+
519538
static class DirsChangeListenerTest implements DirsChangeListener {
520539
public int num = 0;
521540
public DirsChangeListenerTest() {

0 commit comments

Comments
 (0)