Skip to content

[Subnet Prioritization] Support capacity-optimized-prioritized and prioritized Allocation Strategy #6865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 23 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
bcb3346
[Subnet Prioritization] Add prioritized|capacity-optimized-prioritize…
Allenz5 Jun 3, 2025
bd77a73
Merge branch 'develop' of github.com:Allenz5/aws-parallelcluster into…
Allenz5 Jun 3, 2025
96d1e01
Merge branch 'aws:develop' into develop
Allenz5 Jun 5, 2025
1360560
[Subnet Prioritization] Add test cases for instance allocation strate…
Allenz5 Jun 5, 2025
8cf10aa
[Subnet Prioritization] Update the default value and update policy of…
Allenz5 Jun 5, 2025
7f303dd
[Subnet Prioritization] Move AllocationStrategy Enum from pcluster.co…
Allenz5 Jun 6, 2025
26ddfec
[Subnet Prioritization] Add validator and validator test for enable_s…
Allenz5 Jun 6, 2025
13d8cfa
[Subnet Prioritization] Move AllocationStrategy Enum from cluster_con…
Allenz5 Jun 6, 2025
f8de4a5
Revert "[Subnet Prioritization] Move AllocationStrategy Enum from clu…
Allenz5 Jun 6, 2025
c7889cd
[Subnet Prioritization] Register enable_single_availability_zone_vali…
Allenz5 Jun 6, 2025
b6506cf
[Subnet Prioritization] Change default value of enable_single_availab…
Allenz5 Jun 9, 2025
a3b7672
[Subnet Prioritization] Add enable_single_availability_zone parameter…
Allenz5 Jun 9, 2025
33d3fd1
Merge branch 'aws:develop' into develop
Allenz5 Jun 9, 2025
406d5e8
[Subnet Prioritization] Fix format issues
Allenz5 Jun 11, 2025
dee33ca
Merge branch 'develop' of github.com:Allenz5/aws-parallelcluster into…
Allenz5 Jun 11, 2025
f2a0c7e
[Subnet Prioritization] Update CHANGELOG.md
Allenz5 Jun 11, 2025
2e94485
[Subnet Prioritization] Remove duplicated AllocationStrategy Enum
Allenz5 Jun 11, 2025
d2c2ba6
[Subnet Prioritization] Remove duplicated EnableSingleAvailabilityZon…
Allenz5 Jun 11, 2025
db7a9c5
[Subnet Prioritization] Update the failure message of InstancesAlloca…
Allenz5 Jun 11, 2025
99773c4
[Subnet Prioritization] Update enable_single_availability_zone_valida…
Allenz5 Jun 11, 2025
ce299af
[Subnet Prioritization] Update format
Allenz5 Jun 12, 2025
aa81737
[Subnet Prioritization] Fix EnableSingleAvailabilityZoneValidator
Allenz5 Jun 12, 2025
24129ff
[Subnet Prioritization] Fix format issue
Allenz5 Jun 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ CHANGELOG

**CHANGES**
- Ubuntu 20.04 is no longer supported.
- Add support for the `prioritized` and `capacity-optimized-prioritized` Allocation Strategies.

**BUG FIXES**
- Fix an issue where Security Group validation failed when a rule contained both IPv4 ranges (IpRanges) and security group references (UserIdGroupPairs).
Expand Down
27 changes: 18 additions & 9 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from pcluster.aws.common import AWSClientError, get_region
from pcluster.config.common import (
AdditionalIamPolicy,
AllocationStrategy,
BaseDeploymentSettings,
BaseDevSettings,
BaseTag,
Expand Down Expand Up @@ -185,6 +186,7 @@
from pcluster.validators.monitoring_validators import DetailedMonitoringValidator, LogRotationValidator
from pcluster.validators.networking_validators import (
ElasticIpValidator,
EnableSingleAvailabilityZoneValidator,
MultiAzPlacementGroupValidator,
QueueSubnetsValidator,
SecurityGroupsValidator,
Expand Down Expand Up @@ -813,10 +815,17 @@ def __init__(self, subnet_ids: List[str], assign_public_ip: str = None, **kwargs
class SlurmQueueNetworking(_QueueNetworking):
"""Represent the networking configuration for the slurm Queue."""

def __init__(self, placement_group: PlacementGroup = None, proxy: Proxy = None, **kwargs):
def __init__(
self,
placement_group: PlacementGroup = None,
proxy: Proxy = None,
enable_single_availability_zone: bool = None,
**kwargs,
):
super().__init__(**kwargs)
self.placement_group = placement_group or PlacementGroup(implied=True)
self.proxy = proxy
self.enable_single_availability_zone = enable_single_availability_zone


class AwsBatchQueueNetworking(_QueueNetworking):
Expand Down Expand Up @@ -2565,14 +2574,6 @@ def _register_validators(self, context: ValidatorContext = None):
)


class AllocationStrategy(Enum):
"""Define supported allocation strategies."""

LOWEST_PRICE = "lowest-price"
CAPACITY_OPTIMIZED = "capacity-optimized"
PRICE_CAPACITY_OPTIMIZED = "price-capacity-optimized"


class SlurmQueue(_CommonQueue):
"""Represents a Slurm Queue that has Compute Resources with both Single and Multiple Instance Types."""

Expand Down Expand Up @@ -2633,6 +2634,14 @@ def _register_validators(self, context: ValidatorContext = None):
max_length=MAX_COMPUTE_RESOURCES_PER_QUEUE,
resource_name="ComputeResources per Queue",
)
if any(
isinstance(compute_resource, SlurmFlexibleComputeResource) for compute_resource in self.compute_resources
):
self._register_validator(
EnableSingleAvailabilityZoneValidator,
allocation_strategy=self.allocation_strategy,
enable_single_availability_zone=self.networking.enable_single_availability_zone,
)
self._register_validator(
QueueSubnetsValidator,
queue_name=self.name,
Expand Down
10 changes: 10 additions & 0 deletions cli/src/pcluster/config/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@
LOGGER = logging.getLogger(__name__)


class AllocationStrategy(Enum):
    """Define supported allocation strategies."""

    # Which capacity type may use which strategy is enforced by
    # InstancesAllocationStrategyValidator, not by this enum.

    # Accepted for OnDemand capacity and listed among the strategies allowed for SPOT.
    LOWEST_PRICE = "lowest-price"
    # Listed among the strategies allowed for SPOT; rejected for OnDemand.
    CAPACITY_OPTIMIZED = "capacity-optimized"
    # Listed among the strategies allowed for SPOT; rejected for OnDemand.
    PRICE_CAPACITY_OPTIMIZED = "price-capacity-optimized"
    # Accepted for OnDemand capacity; rejected for SPOT.
    PRIORITIZED = "prioritized"
    # Listed among the strategies allowed for SPOT; rejected for OnDemand.
    CAPACITY_OPTIMIZED_PRIORITIZED = "capacity-optimized-prioritized"
    # PRIORITIZED and CAPACITY_OPTIMIZED_PRIORITIZED are the only strategies that
    # EnableSingleAvailabilityZoneValidator accepts with enable_single_availability_zone.


class CapacityType(Enum):
"""Enum to identify the type compute supported by the queues."""

Expand Down
4 changes: 2 additions & 2 deletions cli/src/pcluster/schemas/cluster_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from pcluster.config.cluster_config import (
AdditionalPackages,
Alarms,
AllocationStrategy,
AmiSearchFilters,
AwsBatchClusterConfig,
AwsBatchComputeResource,
Expand Down Expand Up @@ -95,7 +94,7 @@
SlurmSettings,
Timeouts,
)
from pcluster.config.common import BaseTag, CapacityType, DefaultUserHomeType
from pcluster.config.common import AllocationStrategy, BaseTag, CapacityType, DefaultUserHomeType
from pcluster.config.update_policy import UpdatePolicy
from pcluster.constants import (
DELETION_POLICIES,
Expand Down Expand Up @@ -752,6 +751,7 @@ class SlurmQueueNetworkingSchema(QueueNetworkingSchema):
PlacementGroupSchema, metadata={"update_policy": UpdatePolicy.MANAGED_PLACEMENT_GROUP}
)
proxy = fields.Nested(QueueProxySchema, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY})
enable_single_availability_zone = fields.Bool(metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we test keeping UpdatePolicy.QUEUE_UPDATE_STRATEGY as the update policy for this parameter?


@post_load
def make_resource(self, data, **kwargs):
Expand Down
27 changes: 21 additions & 6 deletions cli/src/pcluster/validators/instances_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Callable, Dict

from pcluster.aws.aws_resources import InstanceTypeInfo
from pcluster.config import cluster_config
from pcluster.config import cluster_config, common
from pcluster.constants import MIN_MEMORY_ABSOLUTE_DIFFERENCE, MIN_MEMORY_PRECENTAGE_DIFFERENCE
from pcluster.validators.common import FailureLevel, Validator

Expand Down Expand Up @@ -221,17 +221,19 @@ class InstancesAllocationStrategyValidator(Validator, _FlexibleInstanceTypesVali
"""Confirm Allocation Strategy matches with the Capacity Type."""

def _validate(self, compute_resource_name: str, capacity_type: Enum, allocation_strategy: Enum, **kwargs):
"""On-demand Capacity type only supports "lowest-price" allocation strategy."""
"""On-demand Capacity type only supports "lowest-price" and "prioritized" allocation strategy."""
if (
capacity_type == cluster_config.CapacityType.ONDEMAND
and allocation_strategy
and allocation_strategy != cluster_config.AllocationStrategy.LOWEST_PRICE
and allocation_strategy != common.AllocationStrategy.LOWEST_PRICE
and allocation_strategy != common.AllocationStrategy.PRIORITIZED
):
alloc_strategy_msg = allocation_strategy.value if allocation_strategy else "not set"
self._add_failure(
f"Compute Resource {compute_resource_name} is using an OnDemand CapacityType but the Allocation "
f"Strategy specified is {alloc_strategy_msg}. OnDemand CapacityType can only use '"
f"{cluster_config.AllocationStrategy.LOWEST_PRICE.value}' allocation strategy.",
f"Compute Resource {compute_resource_name} is using an OnDemand CapacityType but "
f"the Allocation Strategy specified is {alloc_strategy_msg}. OnDemand CapacityType can only use '"
f"{common.AllocationStrategy.LOWEST_PRICE.value}' or '{common.AllocationStrategy.PRIORITIZED.value}' "
"allocation strategy.",
FailureLevel.ERROR,
)
if capacity_type == cluster_config.CapacityType.CAPACITY_BLOCK and allocation_strategy:
Expand All @@ -241,6 +243,19 @@ def _validate(self, compute_resource_name: str, capacity_type: Enum, allocation_
"When using CAPACITY_BLOCK CapacityType, allocation strategy should not be set.",
FailureLevel.ERROR,
)
if (
capacity_type == cluster_config.CapacityType.SPOT
and allocation_strategy == common.AllocationStrategy.PRIORITIZED
):
self._add_failure(
f"Compute Resource {compute_resource_name} is using a SPOT CapacityType but the "
f"Allocation Strategy specified is {allocation_strategy.value}. SPOT CapacityType can only use "
f"'{common.AllocationStrategy.LOWEST_PRICE.value}', "
f"'{common.AllocationStrategy.CAPACITY_OPTIMIZED.value}', "
f"'{common.AllocationStrategy.PRICE_CAPACITY_OPTIMIZED.value}' "
f"or '{common.AllocationStrategy.CAPACITY_OPTIMIZED_PRIORITIZED.value}' allocation strategy.",
FailureLevel.ERROR,
)


class InstancesMemorySchedulingWarningValidator(Validator):
Expand Down
25 changes: 25 additions & 0 deletions cli/src/pcluster/validators/networking_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
from collections import Counter
from enum import Enum
from typing import List, Union

from pcluster.aws.aws_api import AWSApi
from pcluster.aws.common import AWSClientError
from pcluster.config import common
from pcluster.validators.common import FailureLevel, Validator


Expand Down Expand Up @@ -64,6 +66,29 @@ def _validate(self, subnet_ids: List[str]):
self._add_failure(str(e), FailureLevel.ERROR)


class EnableSingleAvailabilityZoneValidator(Validator):
    """
    Single Availability Zone validator.

    Check that enable_single_availability_zone is only used together with the
    prioritized or capacity-optimized-prioritized Allocation Strategy.
    """

    def _validate(self, allocation_strategy: Enum, enable_single_availability_zone: bool):
        """
        Add an ERROR failure when the flag is enabled without a prioritized allocation strategy.

        :param allocation_strategy: AllocationStrategy member to check, or None when no strategy is set.
        :param enable_single_availability_zone: value of the flag; a falsy value (False or None) skips the check.
        """
        if not enable_single_availability_zone:
            return

        prioritized_allocation_strategies = (
            common.AllocationStrategy.PRIORITIZED,
            common.AllocationStrategy.CAPACITY_OPTIMIZED_PRIORITIZED,
        )
        if allocation_strategy in prioritized_allocation_strategies:
            return

        # The allocation strategy can legitimately be unset (None), so do not dereference
        # `.value` blindly: report it as "not set", the same wording used by
        # InstancesAllocationStrategyValidator, instead of raising AttributeError.
        alloc_strategy_msg = allocation_strategy.value if allocation_strategy else "not set"
        self._add_failure(
            "Enable_single_availability_zone is specified as "
            f"'{enable_single_availability_zone}' while allocation_strategy is specified as "
            f"'{alloc_strategy_msg}'. Enable_single_availability_zone should only be used with "
            "prioritized or capacity-optimized-prioritized Allocation Strategy.",
            FailureLevel.ERROR,
        )


class QueueSubnetsValidator(Validator):
"""
Queue Subnets validator.
Expand Down
Loading
Loading