Skip to content

Commit 62e583c

Browse files
committed
YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non network exception. Contributed by Raju Bairishetti
1 parent fad291e commit 62e583c

File tree

3 files changed

+70 -35 lines changed
  • hadoop-yarn-project
    • hadoop-yarn
      • hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client
      • hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager

3 files changed

+70 -35 lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -568,6 +568,9 @@ Release 2.8.0 - UNRELEASED
568568
YARN-3860. rmadmin -transitionToActive should check the state of non-target node.
569569
(Masatake Iwasaki via junping_du)
570570

571+
YARN-3695. ServerProxy (NMProxy, etc.) shouldn't retry forever for non
572+
network exception. (Raju Bairishetti via jianhe)
573+
571574
Release 2.7.2 - UNRELEASED
572575

573576
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/ServerProxy.java

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -53,19 +53,22 @@ protected static RetryPolicy createRetryPolicy(Configuration conf,
5353
long maxWaitTime = conf.getLong(maxWaitTimeStr, defMaxWaitTime);
5454
long retryIntervalMS =
5555
conf.getLong(connectRetryIntervalStr, defRetryInterval);
56-
if (maxWaitTime == -1) {
57-
// wait forever.
58-
return RetryPolicies.RETRY_FOREVER;
59-
}
6056

61-
Preconditions.checkArgument(maxWaitTime > 0, "Invalid Configuration. "
62-
+ maxWaitTimeStr + " should be a positive value.");
57+
Preconditions.checkArgument((maxWaitTime == -1 || maxWaitTime > 0),
58+
"Invalid Configuration. " + maxWaitTimeStr + " should be either"
59+
+ " positive value or -1.");
6360
Preconditions.checkArgument(retryIntervalMS > 0, "Invalid Configuration. "
6461
+ connectRetryIntervalStr + "should be a positive value.");
6562

66-
RetryPolicy retryPolicy =
67-
RetryPolicies.retryUpToMaximumTimeWithFixedSleep(maxWaitTime,
68-
retryIntervalMS, TimeUnit.MILLISECONDS);
63+
RetryPolicy retryPolicy = null;
64+
if (maxWaitTime == -1) {
65+
// wait forever.
66+
retryPolicy = RetryPolicies.RETRY_FOREVER;
67+
} else {
68+
retryPolicy =
69+
RetryPolicies.retryUpToMaximumTimeWithFixedSleep(maxWaitTime,
70+
retryIntervalMS, TimeUnit.MILLISECONDS);
71+
}
6972

7073
Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap =
7174
new HashMap<Class<? extends Exception>, RetryPolicy>();

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestNMProxy.java

Lines changed: 55 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
import java.net.InetSocketAddress;
2323

2424
import org.apache.hadoop.fs.UnsupportedFileSystemException;
25+
import org.apache.hadoop.io.retry.UnreliableInterface;
2526
import org.apache.hadoop.security.SecurityUtil;
2627
import org.apache.hadoop.security.UserGroupInformation;
2728
import org.apache.hadoop.security.token.Token;
@@ -58,8 +59,8 @@ public TestNMProxy() throws UnsupportedFileSystemException {
5859

5960
@Before
6061
public void setUp() throws Exception {
61-
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 10000);
62-
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
62+
containerManager.start();
63+
containerManager.setBlockNewContainerRequests(false);
6364
}
6465

6566
@Override
@@ -77,7 +78,13 @@ public StartContainersResponse startContainers(
7778
// This causes super to throw an NMNotYetReadyException
7879
containerManager.setBlockNewContainerRequests(true);
7980
} else {
80-
throw new java.net.ConnectException("start container exception");
81+
if (isRetryPolicyRetryForEver()) {
82+
// Throw non network exception
83+
throw new IOException(
84+
new UnreliableInterface.UnreliableException());
85+
} else {
86+
throw new java.net.ConnectException("start container exception");
87+
}
8188
}
8289
} else {
8390
// This stops super from throwing an NMNotYetReadyException
@@ -86,6 +93,11 @@ public StartContainersResponse startContainers(
8693
return super.startContainers(requests);
8794
}
8895

96+
private boolean isRetryPolicyRetryForEver() {
97+
return conf.getLong(
98+
YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 1000) == -1;
99+
}
100+
89101
@Override
90102
public StopContainersResponse stopContainers(
91103
StopContainersRequest requests) throws YarnException, IOException {
@@ -110,30 +122,13 @@ public GetContainerStatusesResponse getContainerStatuses(
110122
}
111123

112124
@Test(timeout = 20000)
113-
public void testNMProxyRetry() throws Exception {
114-
containerManager.start();
115-
containerManager.setBlockNewContainerRequests(false);
116-
StartContainersRequest allRequests =
117-
Records.newRecord(StartContainersRequest.class);
118-
ApplicationId appId = ApplicationId.newInstance(1, 1);
119-
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
125+
public void testNMProxyRetry() throws Exception {
126+
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, 10000);
127+
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_RETRY_INTERVAL_MS, 100);
128+
StartContainersRequest allRequests =
129+
Records.newRecord(StartContainersRequest.class);
120130

121-
org.apache.hadoop.yarn.api.records.Token nmToken =
122-
context.getNMTokenSecretManager().createNMToken(attemptId,
123-
context.getNodeId(), user);
124-
final InetSocketAddress address =
125-
conf.getSocketAddr(YarnConfiguration.NM_BIND_HOST,
126-
YarnConfiguration.NM_ADDRESS, YarnConfiguration.DEFAULT_NM_ADDRESS,
127-
YarnConfiguration.DEFAULT_NM_PORT);
128-
Token<NMTokenIdentifier> token =
129-
ConverterUtils.convertFromYarn(nmToken,
130-
SecurityUtil.buildTokenService(address));
131-
UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
132-
ugi.addToken(token);
133-
134-
ContainerManagementProtocol proxy =
135-
NMProxy.createNMProxy(conf, ContainerManagementProtocol.class, ugi,
136-
YarnRPC.create(conf), address);
131+
ContainerManagementProtocol proxy = getNMProxy();
137132

138133
retryCount = 0;
139134
shouldThrowNMNotYetReadyException = false;
@@ -156,4 +151,38 @@ public void testNMProxyRetry() throws Exception {
156151
proxy.startContainers(allRequests);
157152
Assert.assertEquals(5, retryCount);
158153
}
154+
155+
@Test(timeout = 20000, expected = IOException.class)
156+
public void testShouldNotRetryForeverForNonNetworkExceptionsOnNMConnections()
157+
throws Exception {
158+
conf.setLong(YarnConfiguration.CLIENT_NM_CONNECT_MAX_WAIT_MS, -1);
159+
StartContainersRequest allRequests =
160+
Records.newRecord(StartContainersRequest.class);
161+
162+
ContainerManagementProtocol proxy = getNMProxy();
163+
164+
shouldThrowNMNotYetReadyException = false;
165+
retryCount = 0;
166+
proxy.startContainers(allRequests);
167+
}
168+
169+
private ContainerManagementProtocol getNMProxy() {
170+
ApplicationId appId = ApplicationId.newInstance(1, 1);
171+
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
172+
173+
org.apache.hadoop.yarn.api.records.Token nmToken =
174+
context.getNMTokenSecretManager().createNMToken(attemptId,
175+
context.getNodeId(), user);
176+
final InetSocketAddress address =
177+
conf.getSocketAddr(YarnConfiguration.NM_BIND_HOST,
178+
YarnConfiguration.NM_ADDRESS, YarnConfiguration.DEFAULT_NM_ADDRESS,
179+
YarnConfiguration.DEFAULT_NM_PORT);
180+
Token<NMTokenIdentifier> token =
181+
ConverterUtils.convertFromYarn(nmToken,
182+
SecurityUtil.buildTokenService(address));
183+
UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
184+
ugi.addToken(token);
185+
return NMProxy.createNMProxy(conf, ContainerManagementProtocol.class, ugi,
186+
YarnRPC.create(conf), address);
187+
}
159188
}

0 commit comments

Comments
 (0)