-
Notifications
You must be signed in to change notification settings - Fork 29k
SPARK-4337. [YARN] Add ability to cancel pending requests #4141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,8 +69,7 @@ private[yarn] class YarnAllocator( | |
| } | ||
|
|
||
| // Visible for testing. | ||
| val allocatedHostToContainersMap = | ||
| new HashMap[String, collection.mutable.Set[ContainerId]] | ||
| val allocatedHostToContainersMap = new HashMap[String, collection.mutable.Set[ContainerId]] | ||
| val allocatedContainerToHostMap = new HashMap[ContainerId, String] | ||
|
|
||
| // Containers that we no longer care about. We've either already told the RM to release them or | ||
|
|
@@ -84,7 +83,7 @@ private[yarn] class YarnAllocator( | |
| private var executorIdCounter = 0 | ||
| @volatile private var numExecutorsFailed = 0 | ||
|
|
||
| @volatile private var maxExecutors = args.numExecutors | ||
| @volatile private var targetNumExecutors = args.numExecutors | ||
|
|
||
| // Keep track of which container is running which executor to remove the executors later | ||
| private val executorIdToContainer = new HashMap[String, Container] | ||
|
|
@@ -133,10 +132,12 @@ private[yarn] class YarnAllocator( | |
| amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).map(_.size).sum | ||
|
|
||
| /** | ||
| * Request as many executors from the ResourceManager as needed to reach the desired total. | ||
| * Request as many executors from the ResourceManager as needed to reach the desired total. If | ||
| * the requested total is smaller than the current number of running executors, no executors will | ||
| * be killed. | ||
| */ | ||
| def requestTotalExecutors(requestedTotal: Int): Unit = synchronized { | ||
| maxExecutors = requestedTotal | ||
| targetNumExecutors = requestedTotal | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -147,8 +148,8 @@ private[yarn] class YarnAllocator( | |
| val container = executorIdToContainer.remove(executorId).get | ||
| internalReleaseContainer(container) | ||
| numExecutorsRunning -= 1 | ||
| maxExecutors -= 1 | ||
| assert(maxExecutors >= 0, "Allocator killed more executors than are allocated!") | ||
| targetNumExecutors -= 1 | ||
| assert(targetNumExecutors >= 0, "Allocator killed more executors than are allocated!") | ||
| } else { | ||
| logWarning(s"Attempted to kill unknown executor $executorId!") | ||
| } | ||
|
|
@@ -163,15 +164,8 @@ private[yarn] class YarnAllocator( | |
| * This must be synchronized because variables read in this method are mutated by other methods. | ||
| */ | ||
| def allocateResources(): Unit = synchronized { | ||
| val numPendingAllocate = getNumPendingAllocate | ||
| val missing = maxExecutors - numPendingAllocate - numExecutorsRunning | ||
| updateResourceRequests() | ||
|
|
||
| if (missing > 0) { | ||
| logInfo(s"Will request $missing executor containers, each with ${resource.getVirtualCores} " + | ||
| s"cores and ${resource.getMemory} MB memory including $memoryOverhead MB overhead") | ||
| } | ||
|
|
||
| addResourceRequests(missing) | ||
| val progressIndicator = 0.1f | ||
| // Poll the ResourceManager. This doubles as a heartbeat if there are no pending container | ||
| // requests. | ||
|
|
@@ -201,15 +195,36 @@ private[yarn] class YarnAllocator( | |
| } | ||
|
|
||
| /** | ||
| * Request numExecutors additional containers from YARN. Visible for testing. | ||
| * Update the set of container requests that we will sync with the RM based on the number of | ||
| * executors we have currently running and our target number of executors. | ||
| * | ||
| * Visible for testing. | ||
| */ | ||
| def addResourceRequests(numExecutors: Int): Unit = { | ||
| for (i <- 0 until numExecutors) { | ||
| val request = new ContainerRequest(resource, null, null, RM_REQUEST_PRIORITY) | ||
| amClient.addContainerRequest(request) | ||
| val nodes = request.getNodes | ||
| val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last | ||
| logInfo("Container request (host: %s, capability: %s".format(hostStr, resource)) | ||
| def updateResourceRequests(): Unit = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how about making it
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| val numPendingAllocate = getNumPendingAllocate | ||
| val missing = targetNumExecutors - numPendingAllocate - numExecutorsRunning | ||
|
|
||
| if (missing > 0) { | ||
| logInfo(s"Will request $missing executor containers, each with ${resource.getVirtualCores} " + | ||
| s"cores and ${resource.getMemory} MB memory including $memoryOverhead MB overhead") | ||
|
|
||
| for (i <- 0 until missing) { | ||
| val request = new ContainerRequest(resource, null, null, RM_REQUEST_PRIORITY) | ||
| amClient.addContainerRequest(request) | ||
| val nodes = request.getNodes | ||
| val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last | ||
| logInfo(s"Container request (host: $hostStr, capability: $resource)") | ||
| } | ||
| } else if (missing < 0) { | ||
| val numToCancel = math.min(numPendingAllocate, -missing) | ||
| logInfo(s"Canceling requests for $numToCancel executor containers") | ||
|
|
||
| val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource) | ||
| if (!matchingRequests.isEmpty) { | ||
| matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest) | ||
| } else { | ||
| logWarning("Expected to find pending requests, but found none.") | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -266,7 +281,7 @@ private[yarn] class YarnAllocator( | |
| * containersToUse or remaining. | ||
| * | ||
| * @param allocatedContainer container that was given to us by YARN | ||
| * @location resource name, either a node, rack, or * | ||
| * @param location resource name, either a node, rack, or * | ||
| * @param containersToUse list of containers that will be used | ||
| * @param remaining list of containers that will not be used | ||
| */ | ||
|
|
@@ -294,7 +309,7 @@ private[yarn] class YarnAllocator( | |
| private def runAllocatedContainers(containersToUse: ArrayBuffer[Container]): Unit = { | ||
| for (container <- containersToUse) { | ||
| numExecutorsRunning += 1 | ||
| assert(numExecutorsRunning <= maxExecutors) | ||
| assert(numExecutorsRunning <= targetNumExecutors) | ||
| val executorHostname = container.getNodeId.getHost | ||
| val containerId = container.getId | ||
| executorIdCounter += 1 | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this true based on your code down there? If there are 10 executors running and 0 pending, and I set
targetNumExecutorsto 0, won't it find 10 matching requests and kill them?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
never mind, I guess the semantics of
amClient.removeContainerRequestrefers only to the request itself, but not allocated containers.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes,amClient.removeContainerRequest is only to remove requests, not allocated containers.