diff --git a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml.template b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml.template index 11e6cacd01..6b3680e7e8 100644 --- a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml.template +++ b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml.template @@ -15,267 +15,268 @@ - - yarn.nodemanager.resource.cpu-vcores - {cpu_vcores} - Number of CPU cores that can be allocated for containers. - - - - yarn.nodemanager.resource.memory-mb - {mem_total} - Number of memory that can be allocated for containers. - - - - yarn.nodemanager.vmem-pmem-ratio - 6 - - Ratio between virtual memory to physical memory when setting memory limits for containers. - Container allocations are expressed in terms of physical memory, and virtual memory usage - is allowed to exceed this allocation by this ratio. - - - - - yarn.resourcemanager.max-completed-applications - 1000 - maximum number of completed applications - - - - yarn.scheduler.maximum-allocation-mb - 1048576 - default is 8GB, here we set 1024G - - - - yarn.resourcemanager.scheduler.class - org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler - - - - yarn.nodemanager.aux-services.mapreduce_shuffle.class - org.apache.hadoop.mapred.ShuffleHandler - - - - yarn.nodemanager.delete.debug-delay-sec - 604800 - - - - yarn.resourcemanager.store.class - org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore - - - - yarn.resourcemanager.zk-address - {ZOOKEEPER_QUORUM} - - - - yarn.resourcemanager.recovery.enabled - true - - - - yarn.log-aggregation-enable - true - - - - yarn.log-aggregation.retain-seconds - {{ cluster_cfg[ "hadoop-resource-manager" ][ "yarn_log_retain_seconds" ] }} - - - - yarn.nodemanager.remote-app-log-dir - /sys/logs - - - - yarn.resourcemanager.hostname - {RESOURCEMANAGER_ADDRESS} + + yarn.nodemanager.resource.cpu-vcores + {cpu_vcores} + Number of CPU cores that can be allocated for containers. + + + + yarn.nodemanager.resource.memory-mb + {mem_total} + Amount of memory, in MB, that can be allocated for containers. + + + + yarn.nodemanager.vmem-pmem-ratio + 6 + + Ratio of virtual memory to physical memory when setting memory limits for containers. + Container allocations are expressed in terms of physical memory, and virtual memory usage + is allowed to exceed this allocation by this ratio. + + + + + yarn.resourcemanager.max-completed-applications + 1000 + Maximum number of completed applications to retain. + + + + yarn.scheduler.maximum-allocation-mb + 1572864 + The default is 8 GB; here we set 1.5 TB (1572864 MB). + + + + yarn.resourcemanager.scheduler.class + org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler + + + + yarn.nodemanager.aux-services.mapreduce_shuffle.class + org.apache.hadoop.mapred.ShuffleHandler + + + + yarn.nodemanager.delete.debug-delay-sec + 604800 + + + + yarn.resourcemanager.store.class + org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore + + + + yarn.resourcemanager.zk-address + {ZOOKEEPER_QUORUM} + + + + yarn.resourcemanager.recovery.enabled + true + + + + yarn.log-aggregation-enable + true + + + + yarn.log-aggregation.retain-seconds + {{ cluster_cfg[ "hadoop-resource-manager" ][ "yarn_log_retain_seconds" ] }} + + + + yarn.nodemanager.remote-app-log-dir + /sys/logs + + + + yarn.resourcemanager.hostname + {RESOURCEMANAGER_ADDRESS} + The hostname of the RM.
- - - - yarn.nodemanager.hostname - {POD_IP} - - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage - 95.0 - - - - yarn.node-labels.enabled - true - - - - yarn.node-labels.fs-store.root-dir - hdfs://{HDFS_ADDRESS}:9000/yarn/node-labels - - - - yarn.app.mapreduce.am.staging-dir - /tmp/hadoop-yarn/staging - - - - yarn.log.server.url - http://{LOGSERVER_ADDRESS}:8188/applicationhistory/logs - - - - yarn.log.server.web-service.url - http://{LOGSERVER_ADDRESS}:8188/ws/v1/applicationhistory - - - - yarn.nodemanager.recovery.enabled - true - - - - yarn.nodemanager.recovery.dir - /var/lib/yarn/yarn-nm-recovery - - - - yarn.nodemanager.log-dirs - /var/lib/yarn/userlogs - - - - yarn.nodemanager.address - {POD_IP}:8041 - - - - yarn.timeline-service.enabled - true - - - - - Error filename pattern, to identify the file in the container's - Log directory which contain the container's error log. As error file - redirection is done by client/AM and yarn will not be aware of the error - file name. YARN uses this pattern to identify the error file and tail - the error log as diagnostics when the container execution returns non zero - value. Filename patterns are case sensitive and should match the - specifications of FileSystem.globStatus(Path) api. If multiple filenames - matches the pattern, first file matching the pattern will be picked. - - yarn.nodemanager.container.stderr.pattern - runtime.pai.agg.error - - - - Size of the container error file which needs to be tailed, in bytes. - To avoid unexpected diagnostics truncation: - 40KB = 10 * default(4096) - - yarn.nodemanager.container.stderr.tail.bytes - 40960 - - - - Maximum size of contain's diagnostics to keep for relaunching container case. - To avoid unexpected diagnostics truncation: - 100KB = 10 * default(10000) - - yarn.nodemanager.container-diagnostics-maximum-size - 100000 - - - - Defines the limit of the diagnostics message of an application - attempt, in kilo characters (character count * 1024). - When using ZooKeeper to store application state behavior, it's - important to limit the size of the diagnostic messages to - prevent YARN from overwhelming ZooKeeper. In cases where - yarn.resourcemanager.state-store.max-completed-applications is set to - a large number, it may be desirable to reduce the value of this property - to limit the total data stored.
- To avoid unexpected diagnostics truncation: - 640KB = 10 * default(64) - - yarn.app.attempt.diagnostics.limit.kc - 640 - - - - yarn.resourcemanager.system-metrics-publisher.enabled - true - - - - yarn.timeline-service.generic-application-history.enabled - true - - - - yarn.timeline-service.hostname - {TIMELINE_SERVER_ADDRESS} - - - - yarn.timeline-service.bind-host - 0.0.0.0 - - - - yarn.timeline-service.generic-application-history.max-applications - 1000 - - - - yarn.timeline-service.generic-application-history.fs-history-store.uri - /yarn/timeline/generic-history - - - - yarn.timeline-service.ttl-enable - true - - - - yarn.timeline-service.ttl-ms - 2419200000 - - - - yarn.timeline-service.handler-thread-count - 16 - - - - yarn.timeline-service.generic-application-history.store-class - org.apache.hadoop.yarn.server.applicationhistoryservice.FileSystemApplicationHistoryStore - - - - yarn.timeline-service.generic-application-history.save-non-am-container-meta-info - true - - - - yarn.timeline-service.http-cross-origin.enabled - true - - - - Percentage of GPU that can be allocated + + + + yarn.nodemanager.hostname + {POD_IP} + + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage + 95.0 + + + + yarn.node-labels.enabled + true + + + + yarn.node-labels.fs-store.root-dir + hdfs://{HDFS_ADDRESS}:9000/yarn/node-labels + + + + yarn.app.mapreduce.am.staging-dir + /tmp/hadoop-yarn/staging + + + + yarn.log.server.url + http://{LOGSERVER_ADDRESS}:8188/applicationhistory/logs + + + + yarn.log.server.web-service.url + http://{LOGSERVER_ADDRESS}:8188/ws/v1/applicationhistory + + + + yarn.nodemanager.recovery.enabled + true + + + + yarn.nodemanager.recovery.dir + /var/lib/yarn/yarn-nm-recovery + + + + yarn.nodemanager.log-dirs + /var/lib/yarn/userlogs + + + + yarn.nodemanager.address + {POD_IP}:8041 + + + + yarn.timeline-service.enabled + true + + + + + Error filename pattern, used to identify the file in the container's + log directory that contains the container's error log. Because error file + redirection is done by the client/AM, YARN is not aware of the error + file name; it uses this pattern to identify the error file and tail + the error log as diagnostics when the container exits with a non-zero + value. Filename patterns are case sensitive and should match the + specifications of the FileSystem.globStatus(Path) API. If multiple filenames + match the pattern, the first matching file will be picked. + + yarn.nodemanager.container.stderr.pattern + runtime.pai.agg.error + + + + Size of the container error file which needs to be tailed, in bytes. + To avoid unexpected diagnostics truncation: + 40KB = 10 * default(4096) + + yarn.nodemanager.container.stderr.tail.bytes + 40960 + + + + Maximum size of a container's diagnostics to keep for the container-relaunch case. + To avoid unexpected diagnostics truncation: + 100KB = 10 * default(10000) + + yarn.nodemanager.container-diagnostics-maximum-size + 100000 + + + + Defines the limit of the diagnostics message of an application + attempt, in kilo characters (character count * 1024). + When using ZooKeeper to store application state, it's + important to limit the size of the diagnostic messages to + prevent YARN from overwhelming ZooKeeper. In cases where + yarn.resourcemanager.state-store.max-completed-applications is set to + a large number, it may be desirable to reduce the value of this property + to limit the total data stored.
+ To avoid unexpected diagnostics truncation: + 640KB = 10 * default(64) + + yarn.app.attempt.diagnostics.limit.kc + 640 + + + + yarn.resourcemanager.system-metrics-publisher.enabled + true + + + + yarn.timeline-service.generic-application-history.enabled + true + + + + yarn.timeline-service.hostname + {TIMELINE_SERVER_ADDRESS} + + + + yarn.timeline-service.bind-host + 0.0.0.0 + + + + yarn.timeline-service.generic-application-history.max-applications + 1000 + + + + yarn.timeline-service.generic-application-history.fs-history-store.uri + /yarn/timeline/generic-history + + + + yarn.timeline-service.ttl-enable + true + + + + yarn.timeline-service.ttl-ms + 2419200000 + + + + yarn.timeline-service.handler-thread-count + 16 + + + + yarn.timeline-service.generic-application-history.store-class + org.apache.hadoop.yarn.server.applicationhistoryservice.FileSystemApplicationHistoryStore + + + + yarn.timeline-service.generic-application-history.save-non-am-container-meta-info + true + + + + yarn.timeline-service.http-cross-origin.enabled + true + + + + + Percentage of GPU that can be allocated for containers. This setting allows users to limit the amount of GPU that YARN containers use. Currently functional only on Linux using cgroups. The default is to use 100% of GPU. @@ -302,10 +303,12 @@ [100-65535] - Rounds of updating ports. This parameter is circle controller for updating - local allocated ports info, since the ports info is big. We can control the - update frequency to have balance with cluster scale and ports info's - accuracy + + Rounds of updating ports. This parameter controls how often the locally + allocated ports info is refreshed; because the ports info is large, the update + frequency can be tuned to balance cluster scale against the accuracy of the + ports info. + yarn.nodemanager.resource.ports-update-rounds 10 @@ -359,8 +362,8 @@ - The duration (in ms) the YARN client waits for an expected state change - to occur. -1 means unlimited wait time. + The duration (in ms) the YARN client waits for an expected state change + to occur. -1 means unlimited wait time. yarn.client.application-client-protocol.poll-timeout-ms 900000 @@ -379,8 +382,8 @@ yarn.nodemanager.container-executor.class org.apache.hadoop.yarn.server.nodemanager.DockerContainerExecutor - This is the container executor setting that ensures that all -jobs are started with the DockerContainerExecutor. + This is the container executor setting that ensures that all + jobs are started with the DockerContainerExecutor. @@ -393,7 +396,7 @@ jobs are started with the DockerContainerExecutor. yarn.nodemanager.docker-container-executor.exec-option -v /dev:/dev -v /var/run/docker.sock:/var/run/docker.sock -v /var/drivers:/var/drivers -v {HOST_YARN_NODEMANAGER_STORAGE}:/var/lib/yarn -v /tmp/pai-root:/tmp/pai-root -v /etc/hadoop-configuration-for-jobs:/hadoop-configuration-for-jobs -v {HOST_HADOOP_TMP_STORAGE}:/var/lib/hadoopdata -e HDFS_ADDRESS={HDFS_ADDRESS} -e LOGSERVER_ADDRESS={LOGSERVER_ADDRESS} -e TIMELINE_SERVER_ADDRESS={TIMELINE_SERVER_ADDRESS} -e RESOURCEMANAGER_ADDRESS={RESOURCEMANAGER_ADDRESS} -e ZOOKEEPER_QUORUM={ZOOKEEPER_QUORUM} - Docker run option when launch container. + Docker run options used when launching a container. @@ -401,7 +404,7 @@ jobs are started with the DockerContainerExecutor. yarn.nodemanager.docker-container-executor.script-command trap " " TERM; cp -r /docker/* /usr/bin/ && cp /hadoop-configuration-for-jobs/* $HADOOP_CONF_DIR/ - Image command before launch_container script. + Command executed in the image before the launch_container script.
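Note: the XML markup of this template does not survive in the flattened view above, so each entry reads as a bare name/value/description run. As a reference point, here is a minimal sketch of how one of the changed node-manager entries would look in the rendered yarn-site.xml, assuming the standard Hadoop configuration schema (configuration/property/name/value/description elements); the {cpu_vcores}-style fields are placeholders substituted at deploy time.

    <configuration>
      <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <!-- 1572864 MB = 1536 GB = 1.5 TB; the stock YARN default is 8192 (8 GB) -->
        <value>1572864</value>
        <description>The default is 8 GB; here we set 1.5 TB (1572864 MB).</description>
      </property>
    </configuration>

The same property/name/value/description shape applies to every entry in both templates below.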
diff --git a/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml.template b/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml.template index 7cd9593b2a..2f2b5cf9f1 100644 --- a/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml.template +++ b/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml.template @@ -16,260 +16,260 @@ - - yarn.nodemanager.resource.cpu-vcores - 24 - Number of CPU cores that can be allocated for containers. - - - - yarn.nodemanager.resource.memory-mb - 204800 - Number of memory that can be allocated for containers. - - - - yarn.resourcemanager.max-completed-applications - 1000 - maximum number of completed applications - - - - yarn.resourcemanager.bind-host - 0.0.0.0 - - - - yarn.scheduler.maximum-allocation-mb - 1048576 - default is 8GB, here we set 1024G - - - - yarn.scheduler.configuration.store.class - zk - default is file, change it to zk to enable config by rest api - - - - yarn.resourcemanager.scheduler.class - org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler - - - - yarn.resourcemanager.scheduler.monitor.enable - true - - - - yarn.resourcemanager.scheduler.monitor.policies - org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy - - - - yarn.resourcemanager.store.class - org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore - - - - yarn.resourcemanager.hostname - {RESOURCEMANAGER_ADDRESS} - The hostname of the RM. - - - - yarn.resourcemanager.webapp.address - {RESOURCEMANAGER_ADDRESS}:8088 - - - - yarn.resourcemanager.zk-address - {ZOOKEEPER_QUORUM} - - - - yarn.resourcemanager.recovery.enabled - true - - - - yarn.resourcemanager.connect.retry-interval.ms - 1000 - - - - yarn.log-aggregation-enable - true - - - - yarn.log-aggregation.retain-seconds - {{ cluster_cfg[ "hadoop-resource-manager" ][ "yarn_log_retain_seconds" ] }} - - - - yarn.nodemanager.remote-app-log-dir - /sys/logs - - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.nodemanager.delete.debug-delay-sec - 604800 - - - - yarn.node-labels.fs-store.root-dir - hdfs://{HDFS_ADDRESS}:9000/yarn/node-labels - - - - yarn.node-labels.enabled - true - - - - yarn.app.mapreduce.am.staging-dir - /tmp/hadoop-yarn/staging - - - - yarn.log.server.url - http://{LOGSERVER_ADDRESS}:8188/applicationhistory/logs - - - - yarn.nodemanager.recovery.enabled - true - - - - yarn.nodemanager.recovery.dir - /var/lib/yarn/yarn-nm-recovery - - - - yarn.nodemanager.log-dirs - /var/lib/yarn/userlogs - - - - yarn.timeline-service.enabled - true - - - - - Error filename pattern, to identify the file in the container's - Log directory which contain the container's error log. As error file - redirection is done by client/AM and yarn will not be aware of the error - file name. YARN uses this pattern to identify the error file and tail - the error log as diagnostics when the container execution returns non zero - value. Filename patterns are case sensitive and should match the - specifications of FileSystem.globStatus(Path) api. If multiple filenames - matches the pattern, first file matching the pattern will be picked. - - yarn.nodemanager.container.stderr.pattern - runtime.pai.agg.error - - - - Size of the container error file which needs to be tailed, in bytes.
- To avoid unexpected diagnostics truncation: - 40KB = 10 * default(4096) - - yarn.nodemanager.container.stderr.tail.bytes - 40960 - - - - Maximum size of contain's diagnostics to keep for relaunching container case. - To avoid unexpected diagnostics truncation: - 100KB = 10 * default(10000) - - yarn.nodemanager.container-diagnostics-maximum-size - 100000 - - - - Defines the limit of the diagnostics message of an application - attempt, in kilo characters (character count * 1024). - When using ZooKeeper to store application state behavior, it's - important to limit the size of the diagnostic messages to - prevent YARN from overwhelming ZooKeeper. In cases where - yarn.resourcemanager.state-store.max-completed-applications is set to - a large number, it may be desirable to reduce the value of this property - to limit the total data stored. - To avoid unexpected diagnostics truncation: - 640KB = 10 * default(64) - - yarn.app.attempt.diagnostics.limit.kc - 640 - - - - yarn.resourcemanager.system-metrics-publisher.enabled - true - - - - yarn.timeline-service.generic-application-history.enabled - true - - - - yarn.timeline-service.hostname - {TIMELINE_SERVER_ADDRESS} - - - - yarn.timeline-service.bind-host - 0.0.0.0 - - - - yarn.timeline-service.generic-application-history.max-applications - 1000 - - - - yarn.timeline-service.generic-application-history.fs-history-store.uri - /yarn/timeline/generic-history - - - - yarn.timeline-service.ttl-enable - true - - - - yarn.timeline-service.ttl-ms - 2419200000 - - - - yarn.timeline-service.handler-thread-count - 16 - - - - yarn.timeline-service.generic-application-history.store-class - org.apache.hadoop.yarn.server.applicationhistoryservice.FileSystemApplicationHistoryStore - - - - yarn.timeline-service.generic-application-history.save-non-am-container-meta-info - true - - - - yarn.timeline-service.http-cross-origin.enabled - true - + + yarn.nodemanager.resource.cpu-vcores + 24 + Number of CPU cores that can be allocated for containers. + + + + yarn.nodemanager.resource.memory-mb + 204800 + Amount of memory, in MB, that can be allocated for containers. + + + + yarn.resourcemanager.max-completed-applications + 1000 + Maximum number of completed applications to retain. + + + + yarn.resourcemanager.bind-host + 0.0.0.0 + + + + yarn.scheduler.maximum-allocation-mb + 1572864 + The default is 8 GB; here we set 1.5 TB (1572864 MB). + + + + yarn.scheduler.configuration.store.class + zk + The default is file; change it to zk to enable configuration via the REST API. + + + + yarn.resourcemanager.scheduler.class + org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler + + + + yarn.resourcemanager.scheduler.monitor.enable + true + + + + yarn.resourcemanager.scheduler.monitor.policies + org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy + + + + yarn.resourcemanager.store.class + org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore + + + + yarn.resourcemanager.hostname + {RESOURCEMANAGER_ADDRESS} + The hostname of the RM.
+ + + + yarn.resourcemanager.webapp.address + {RESOURCEMANAGER_ADDRESS}:8088 + + + + yarn.resourcemanager.zk-address + {ZOOKEEPER_QUORUM} + + + + yarn.resourcemanager.recovery.enabled + true + + + + yarn.resourcemanager.connect.retry-interval.ms + 1000 + + + + yarn.log-aggregation-enable + true + + + + yarn.log-aggregation.retain-seconds + {{ cluster_cfg[ "hadoop-resource-manager" ][ "yarn_log_retain_seconds" ] }} + + + + yarn.nodemanager.remote-app-log-dir + /sys/logs + + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + + yarn.nodemanager.delete.debug-delay-sec + 604800 + + + + yarn.node-labels.fs-store.root-dir + hdfs://{HDFS_ADDRESS}:9000/yarn/node-labels + + + + yarn.node-labels.enabled + true + + + + yarn.app.mapreduce.am.staging-dir + /tmp/hadoop-yarn/staging + + + + yarn.log.server.url + http://{LOGSERVER_ADDRESS}:8188/applicationhistory/logs + + + + yarn.nodemanager.recovery.enabled + true + + + + yarn.nodemanager.recovery.dir + /var/lib/yarn/yarn-nm-recovery + + + + yarn.nodemanager.log-dirs + /var/lib/yarn/userlogs + + + + yarn.timeline-service.enabled + true + + + + + Error filename pattern, used to identify the file in the container's + log directory that contains the container's error log. Because error file + redirection is done by the client/AM, YARN is not aware of the error + file name; it uses this pattern to identify the error file and tail + the error log as diagnostics when the container exits with a non-zero + value. Filename patterns are case sensitive and should match the + specifications of the FileSystem.globStatus(Path) API. If multiple filenames + match the pattern, the first matching file will be picked. + + yarn.nodemanager.container.stderr.pattern + runtime.pai.agg.error + + + + Size of the container error file which needs to be tailed, in bytes. + To avoid unexpected diagnostics truncation: + 40KB = 10 * default(4096) + + yarn.nodemanager.container.stderr.tail.bytes + 40960 + + + + Maximum size of a container's diagnostics to keep for the container-relaunch case. + To avoid unexpected diagnostics truncation: + 100KB = 10 * default(10000) + + yarn.nodemanager.container-diagnostics-maximum-size + 100000 + + + + Defines the limit of the diagnostics message of an application + attempt, in kilo characters (character count * 1024). + When using ZooKeeper to store application state, it's + important to limit the size of the diagnostic messages to + prevent YARN from overwhelming ZooKeeper. In cases where + yarn.resourcemanager.state-store.max-completed-applications is set to + a large number, it may be desirable to reduce the value of this property + to limit the total data stored.
+ To avoid unexpected diagnostics truncation: + 640KB = 10 * default(64) + + yarn.app.attempt.diagnostics.limit.kc + 640 + + + + yarn.resourcemanager.system-metrics-publisher.enabled + true + + + + yarn.timeline-service.generic-application-history.enabled + true + + + + yarn.timeline-service.hostname + {TIMELINE_SERVER_ADDRESS} + + + + yarn.timeline-service.bind-host + 0.0.0.0 + + + + yarn.timeline-service.generic-application-history.max-applications + 1000 + + + + yarn.timeline-service.generic-application-history.fs-history-store.uri + /yarn/timeline/generic-history + + + + yarn.timeline-service.ttl-enable + true + + + + yarn.timeline-service.ttl-ms + 2419200000 + + + + yarn.timeline-service.handler-thread-count + 16 + + + + yarn.timeline-service.generic-application-history.store-class + org.apache.hadoop.yarn.server.applicationhistoryservice.FileSystemApplicationHistoryStore + + + + yarn.timeline-service.generic-application-history.save-non-am-container-meta-info + true + + + + yarn.timeline-service.http-cross-origin.enabled + true + yarn.resourcemanager.rm.container-allocation.expiry-interval-ms 1200000 @@ -290,7 +290,7 @@ yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs 88000 - + yarn.resourcemanager.nodes.exclude-path /exclude-path/nodes @@ -313,35 +313,40 @@ - The duration (in ms) the YARN client waits for an expected state change - to occur. -1 means unlimited wait time. + The duration (in ms) the YARN client waits for an expected state change + to occur. -1 means unlimited wait time. yarn.client.application-client-protocol.poll-timeout-ms 900000 - The minimum allocation for every container request at the RM, - in terms of GPUs. Requests lower than this will throw a - InvalidResourceRequestException. + + The minimum allocation for every container request at the RM, + in terms of GPUs. Requests lower than this will throw an + InvalidResourceRequestException. + yarn.scheduler.minimum-allocation-gpus 0 - The maximum allocation for every container request at the RM, - in terms of GPUs. Requests higher than this will throw a - InvalidResourceRequestException. + + The maximum allocation for every container request at the RM, + in terms of GPUs. Requests higher than this will throw an + InvalidResourceRequestException. + yarn.scheduler.maximum-allocation-gpus - 8 + 16 - The maximum allocation for every container request at the RM in terms of virtual CPU cores. - Requests higher than this will throw an InvalidResourceRequestException. + + The maximum allocation for every container request at the RM in terms of virtual CPU cores. + Requests higher than this will throw an InvalidResourceRequestException. + yarn.scheduler.maximum-allocation-vcores - 32 + 96 - - +
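On the resource-manager side, the zk scheduler-configuration store is what makes the capacity-scheduler queues mutable at runtime; with the default file store, queue changes require editing the config and restarting the RM. A minimal sketch of the relevant pair of entries as they would render in the final yarn-site.xml, assuming the standard Hadoop configuration schema; once the store class is zk, queue changes can be submitted through the RM's scheduler-conf REST endpoint (ws/v1/cluster/scheduler-conf, in Hadoop releases that support the scheduler configuration mutation API) instead of via file edits:

    <configuration>
      <property>
        <name>yarn.scheduler.configuration.store.class</name>
        <!-- "file" is the default; "zk" persists the scheduler config in
             ZooKeeper so it can be mutated through the RM REST API -->
        <value>zk</value>
      </property>
      <property>
        <name>yarn.resourcemanager.zk-address</name>
        <!-- {ZOOKEEPER_QUORUM} is a deploy-time placeholder for a host:port
             list, e.g. (hypothetically) zk-0:2181,zk-1:2181,zk-2:2181 -->
        <value>{ZOOKEEPER_QUORUM}</value>
      </property>
    </configuration>

The same ZooKeeper quorum also backs the ZKRMStateStore configured above, so RM recovery and mutable scheduler configuration share one ZooKeeper dependency.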