Skip to content

Commit e442a32

Browse files
authored
YARN-11815: Fix NodeQueueLoadMonitor scheduler running on standby RMs (#7665)
* `NodeQueueLoadMonitor` runs as an active service via `OpportunisticContainerAllocatorAMService` in YARN Resource Manager. * However, its scheduler thread is started in the constructor itself. * This would cause the scheduler to run on standby RM too, which shouldn't be the case since it is an active service.
1 parent a4130c8 commit e442a32

File tree

3 files changed

+16
-14
lines changed

3 files changed

+16
-14
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/OpportunisticContainerAllocatorAMService.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,14 @@ private void handleNewContainers(List<Container> allocContainers,
373373
}
374374
}
375375

376+
@Override
377+
protected void serviceStart() throws Exception {
378+
if (this.nodeMonitor != null) {
379+
this.nodeMonitor.start();
380+
}
381+
super.serviceStart();
382+
}
383+
376384
@Override
377385
protected void serviceStop() throws Exception {
378386
if (nodeMonitor != null) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/distributed/NodeQueueLoadMonitor.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ public boolean isNodeAvailable(final ClusterNode cn) {
245245
protected ReentrantReadWriteLock sortedNodesLock = new ReentrantReadWriteLock();
246246
protected ReentrantReadWriteLock clusterNodesLock =
247247
new ReentrantReadWriteLock();
248+
private long nodeComputationInterval;
248249

249250
Runnable computeTask = new Runnable() {
250251
@Override
@@ -278,12 +279,15 @@ public NodeQueueLoadMonitor(long nodeComputationInterval,
278279
this.sortedNodes = new ArrayList<>();
279280
this.scheduledExecutor = Executors.newScheduledThreadPool(1);
280281
this.comparator = comparator;
281-
this.scheduledExecutor.scheduleAtFixedRate(computeTask,
282-
nodeComputationInterval, nodeComputationInterval,
283-
TimeUnit.MILLISECONDS);
282+
this.nodeComputationInterval = nodeComputationInterval;
284283
numNodesForAnyAllocation = numNodes;
285284
}
286285

286+
public void start() {
287+
this.scheduledExecutor.scheduleAtFixedRate(computeTask, nodeComputationInterval,
288+
nodeComputationInterval, TimeUnit.MILLISECONDS);
289+
}
290+
287291
protected void updateSortedNodes() {
288292
List<NodeId> nodeIds = sortNodes(true).stream()
289293
.map(n -> n.nodeId)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -750,17 +750,7 @@ protected ApplicationMasterService createApplicationMasterService() {
750750
YarnConfiguration.OPPORTUNISTIC_CONTAINER_ALLOCATION_ENABLED,
751751
YarnConfiguration.DEFAULT_OPPORTUNISTIC_CONTAINER_ALLOCATION_ENABLED)) {
752752
return new OpportunisticContainerAllocatorAMService(getRMContext(),
753-
scheduler) {
754-
@Override
755-
protected void serviceStart() {
756-
// override to not start rpc handler
757-
}
758-
759-
@Override
760-
protected void serviceStop() {
761-
// don't do anything
762-
}
763-
};
753+
scheduler);
764754
}
765755
return new ApplicationMasterService(getRMContext(), scheduler) {
766756
@Override

0 commit comments

Comments
 (0)