Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

YARN-7237: Enable asynchronous scheduling by default for capacity scheduler #7138

Open
wants to merge 3 commits into
base: trunk
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ public class CapacitySchedulerConfiguration extends ReservationSchedulerConfigur
DEFAULT_SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS = 100;

@Private
public static final boolean DEFAULT_SCHEDULE_ASYNCHRONOUSLY_ENABLE = false;
public static final boolean DEFAULT_SCHEDULE_ASYNCHRONOUSLY_ENABLE = true;

@Private
public static final String QUEUE_MAPPING = PREFIX + "queue-mappings";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public void convertSiteProperties(Configuration conf,
FairSchedulerConfiguration.CONTINUOUS_SCHEDULING_ENABLED,
FairSchedulerConfiguration.DEFAULT_CONTINUOUS_SCHEDULING_ENABLED)) {
yarnSiteConfig.setBoolean(
CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, true);
CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, enableAsyncScheduler);
int interval = conf.getInt(
FairSchedulerConfiguration.CONTINUOUS_SCHEDULING_SLEEP_MS,
FairSchedulerConfiguration.DEFAULT_CONTINUOUS_SCHEDULING_SLEEP_MS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -927,7 +927,11 @@ public void testReleaseOutdatedReservedContainer() throws Exception {
* First proposal should be accepted, second proposal should be rejected
* because it try to release an outdated reserved container
*/
MockRM rm1 = new MockRM();
// disable async-scheduling for simulating complex scene
Configuration disableAsyncConf = new Configuration(conf);
disableAsyncConf.setBoolean(
CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, false);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why disable async scheduler in UT? And enable by default? I think we'd better enable async schedule by default.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following are the reasons why async scheduling is disabled in UT :

  1. Unlike async scheduling, sync scheduling gives more flexibility when writing UTs that mimic complex scenarios, since the UT writer has full control over when scheduling happens, whereas with async scheduling this cannot be done.

  2. There were 200+ tests failing when async scheduling was enabled (due to scheduling assertions).

MockRM rm1 = new MockRM(disableAsyncConf);
rm1.getRMContext().setNodeLabelManager(mgr);
rm1.start();
MockNM nm1 = rm1.registerNode("h1:1234", 8 * GB);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public void testSiteContinuousSchedulingConversion() {
FairSchedulerConfiguration.CONTINUOUS_SCHEDULING_SLEEP_MS, 666);

converter.convertSiteProperties(yarnConfig, yarnConvertedConfig, false,
false, false, null);
true, false, null);

assertTrue("Cont. scheduling", yarnConvertedConfig.getBoolean(
CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE, false));
Expand Down Expand Up @@ -224,7 +224,7 @@ public void testAsyncSchedulingDisabledConversion() {

assertFalse("Asynchronous scheduling", yarnConvertedConfig.getBoolean(
CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_ENABLE,
CapacitySchedulerConfiguration.DEFAULT_SCHEDULE_ASYNCHRONOUSLY_ENABLE));
false));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public void testNoChildQueueConversion() {
FSQueue root = createFSQueues();
converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Converted items", 19,
assertEquals("Converted items", 20,
csConfig.getPropsWithPrefix(PREFIX).size());
}

Expand All @@ -76,7 +76,7 @@ public void testMultiWeightConversion() {

converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Number of properties", 22,
assertEquals("Number of properties", 23,
csConfig.getPropsWithPrefix(PREFIX).size());
// this is no fixing - it's the result of BigDecimal rounding
assertEquals("root.a capacity", 16.667f,
Expand All @@ -95,7 +95,7 @@ public void testMultiWeightConversionWhenOfThemIsZero() {

assertFalse("Capacity zerosum allowed",
csConfig.getAllowZeroCapacitySum(ROOT));
assertEquals("Number of properties", 22,
assertEquals("Number of properties", 23,
csConfig.getPropsWithPrefix(PREFIX).size());
assertEquals("root.a capacity", 0.000f,
csConfig.getNonLabeledQueueCapacity(ROOT_A), 0.0f);
Expand All @@ -111,7 +111,7 @@ public void testMultiWeightConversionWhenAllOfThemAreZero() {

converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Number of properties", 23,
assertEquals("Number of properties", 24,
csConfig.getPropsWithPrefix(PREFIX).size());
assertTrue("Capacity zerosum allowed",
csConfig.getAllowZeroCapacitySum(ROOT));
Expand All @@ -129,7 +129,7 @@ public void testCapacityFixingWithThreeQueues() {

converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Number of properties", 22,
assertEquals("Number of properties", 23,
csConfig.getPropsWithPrefix(PREFIX).size());
assertEquals("root.a capacity", 33.334f,
csConfig.getNonLabeledQueueCapacity(ROOT_A), 0.0f);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public void testNoChildQueueConversion() {

assertEquals("root weight", 1.0f,
csConfig.getNonLabeledQueueWeight(ROOT), 0.0f);
assertEquals("Converted items", 21,
assertEquals("Converted items", 22,
csConfig.getPropsWithPrefix(PREFIX).size());
}

Expand All @@ -67,7 +67,7 @@ public void testSingleWeightConversion() {
csConfig.getNonLabeledQueueWeight(ROOT), 0.0f);
assertEquals("root.a weight", 1.0f,
csConfig.getNonLabeledQueueWeight(ROOT_A), 0.0f);
assertEquals("Number of properties", 22,
assertEquals("Number of properties", 23,
csConfig.getPropsWithPrefix(PREFIX).size());
}

Expand All @@ -77,7 +77,7 @@ public void testMultiWeightConversion() {

converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Number of properties", 24,
assertEquals("Number of properties", 25,
csConfig.getPropsWithPrefix(PREFIX).size());
assertEquals("root weight", 1.0f,
csConfig.getNonLabeledQueueWeight(ROOT), 0.0f);
Expand All @@ -103,7 +103,7 @@ public void testAutoCreateV2FlagOnParentWithoutChildren() {
FSQueue root = createParent(new ArrayList<>());
converter.convertWeightsForChildQueues(root, csConfig);

assertEquals("Number of properties", 21,
assertEquals("Number of properties", 22,
csConfig.getPropsWithPrefix(PREFIX).size());
assertTrue("root autocreate v2 enabled",
csConfig.isAutoQueueCreationV2Enabled(ROOT));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ private Configuration createConfig() {
conf.set("yarn.scheduler.capacity.root.a.max-parallel-app", "42");
conf.set("yarn.scheduler.capacity.root.b.capacity", "50");
conf.set("yarn.scheduler.capacity.root.c.capacity", "37.5");
conf.set("yarn.scheduler.capacity.schedule-asynchronously.enable", "false");
return conf;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>

<property>
<name>yarn.scheduler.capacity.maximum-applications</name>
<value>10000</value>
<description>
Maximum number of applications that can be pending and running.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
<value>0.1</value>
<description>
Maximum percent of resources in the cluster which can be used to run
application masters i.e. controls number of concurrent running
applications.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.resource-calculator</name>
<value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
<description>
The ResourceCalculator implementation to be used to compare
Resources in the scheduler.
The default i.e. DefaultResourceCalculator only uses Memory while
DominantResourceCalculator uses dominant-resource to compare
multi-dimensional resources such as Memory, CPU etc.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.queues</name>
<value>default</value>
<description>
The queues at this level (root is the root queue).
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.capacity</name>
<value>100</value>
<description>Default queue target capacity.</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
<value>1</value>
<description>
Default queue user limit, a percentage from 0.0 to 1.0.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
<value>100</value>
<description>
The maximum capacity of the default queue.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.state</name>
<value>RUNNING</value>
<description>
The state of the default queue. State can be one of RUNNING or STOPPED.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
<value>*</value>
<description>
The ACL of who can submit jobs to the default queue.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
<value>*</value>
<description>
The ACL of who can administer jobs on the default queue.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name>
<value>*</value>
<description>
The ACL of who can submit applications with configured priority.
For e.g, [user={name} group={name} max_priority={priority} default_priority={priority}]
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.maximum-application-lifetime
</name>
<value>-1</value>
<description>
Maximum lifetime of an application which is submitted to a queue
in seconds. Any value less than or equal to zero will be considered as
disabled.
This will be a hard time limit for all applications in this
queue. If positive value is configured then any application submitted
to this queue will be killed after it exceeds the configured lifetime.
User can also specify lifetime per application basis in
application submission context. But user lifetime will be
overridden if it exceeds queue maximum lifetime. It is point-in-time
configuration.
Note : Configuring too low a value will result in applications being killed
sooner. This feature is applicable only to leaf queues.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.root.default.default-application-lifetime
</name>
<value>-1</value>
<description>
Default lifetime of an application which is submitted to a queue
in seconds. Any value less than or equal to zero will be considered as
disabled.
If the user has not submitted application with lifetime value then this
value will be taken. It is point-in-time configuration.
Note : Default lifetime can't exceed maximum lifetime. This feature is
applicable only for leaf queue.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.node-locality-delay</name>
<value>40</value>
<description>
Number of missed scheduling opportunities after which the CapacityScheduler
attempts to schedule rack-local containers.
When setting this parameter, the size of the cluster should be taken into account.
We use 40 as the default value, which is approximately the number of nodes in one rack.
Note, if this value is -1, the locality constraint in the container request
will be ignored, which disables the delay scheduling.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
<value>-1</value>
<description>
Number of additional missed scheduling opportunities over the node-locality-delay
ones, after which the CapacityScheduler attempts to schedule off-switch containers,
instead of rack-local ones.
Example: with node-locality-delay=40 and rack-locality-additional-delay=20, the scheduler will
attempt rack-local assignments after 40 missed opportunities, and off-switch assignments
after 40+20=60 missed opportunities.
When setting this parameter, the size of the cluster should be taken into account.
We use -1 as the default value, which disables this feature. In this case, the number
of missed opportunities for assigning off-switch containers is calculated based on
the number of containers and unique locations specified in the resource request,
as well as the size of the cluster.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.queue-mappings</name>
<value></value>
<description>
A list of mappings that will be used to assign jobs to queues
The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
Typically this list will be used to map users to queues,
for example, u:%user:%user maps all users to queues with the same name
as the user.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.queue-mappings-override.enable</name>
<value>false</value>
<description>
If a queue mapping is present, will it override the value specified
by the user? This can be used by administrators to place jobs in queues
that are different than the one specified by the user.
The default is false.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.per-node-heartbeat.maximum-offswitch-assignments</name>
<value>1</value>
<description>
Controls the number of OFF_SWITCH assignments allowed
during a node's heartbeat. Increasing this value can improve
scheduling rate for OFF_SWITCH containers. Lower values reduce
"clumping" of applications on particular nodes. The default is 1.
Legal values are 1-MAX_INT. This config is refreshable.
</description>
</property>


<property>
<name>yarn.scheduler.capacity.application.fail-fast</name>
<value>false</value>
<description>
Whether RM should fail during recovery if previous applications'
queue is no longer valid.
</description>
</property>

<property>
<name>yarn.scheduler.capacity.workflow-priority-mappings</name>
<value></value>
<description>
A list of mappings that will be used to override application priority.
The syntax for this list is
[workflowId]:[full_queue_name]:[priority][,next mapping]*
where an application submitted (or mapped to) queue "full_queue_name"
and workflowId "workflowId" (as specified in application submission
context) will be given priority "priority".
</description>
</property>

<property>
<name>yarn.scheduler.capacity.workflow-priority-mappings-override.enable</name>
<value>false</value>
<description>
If a priority mapping is present, will it override the value specified
by the user? This can be used by administrators to give applications a
priority that is different than the one specified by the user.
The default is false.
</description>
</property>

<!-- Asynchronous scheduling is enabled by default, but it is disabled here for -->
<!-- unit testing to give tests more control over container scheduling when -->
<!-- simulating complex scenarios. -->
<property>
<name>yarn.scheduler.capacity.schedule-asynchronously.enable</name>
<value>false</value>
<description>
Whether to enable asynchronous scheduling.
</description>
</property>

</configuration>
Loading
Loading