Skip to content

AWS Capacity Reservation support #1977

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7fbd659
docs: Update required AWS permissions
tsolovev Nov 7, 2024
9e69bcd
feature: Add reservation support for fleets
tsolovev Nov 8, 2024
18e662a
fix: Remove CLI flags from legacy parser
tsolovev Nov 9, 2024
0e57cb1
Merge remote-tracking branch 'origin/issue_1155_aws_capacity_block' i…
tsolovev Nov 9, 2024
f762302
fix: Remove CLI flag from deprecated parsers
tsolovev Nov 9, 2024
0fd0be7
Merge branch 'refs/heads/master' into issue_1155_aws_capacity_block
tsolovev Nov 13, 2024
8a7d5af
feature: Reservation support for runs (service/devenv/task)
tsolovev Nov 13, 2024
a7410ac
feature(cli): Reservation support for plans and lists
tsolovev Nov 13, 2024
5647aff
feature(cli): Fix reservation indentation
tsolovev Nov 13, 2024
d48d3ec
refactor: Improve function signatures
tsolovev Nov 13, 2024
2b93408
feature(api): Add client backward compatibility for updated run/fleet…
tsolovev Nov 13, 2024
2a447a1
test: Update tests to reflect model changes
tsolovev Nov 14, 2024
002e0ed
test: Update tests to reflect model changes
tsolovev Nov 14, 2024
3a0556e
fix: Patch "InstanceMarketOptions" field for capacity block reservations
tsolovev Dec 6, 2024
57477a1
Merge branch 'master' into issue_1804_aws_efa_ifs
tsolovev Dec 6, 2024
da3bd98
fix(aws): Remove currently unsupported p5e. instance from compute filter
tsolovev Dec 9, 2024
5edbd8a
feature(aws): Improve exception handling in get_reservation()
tsolovev Dec 10, 2024
db0d4d1
fix(aws): Patch ReservationType field check
tsolovev Dec 10, 2024
2a7e572
Fix KeyError: 'ReservationType'
r4victor Dec 10, 2024
da908e6
Minor typing and logging fixes
r4victor Dec 10, 2024
ffd6917
Fix reservations when applying fleets
r4victor Dec 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/docs/reference/server/config.yml.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,15 @@ There are two ways to configure AWS: using an access key or using the default cr
"ec2:AttachVolume",
"ec2:AuthorizeSecurityGroupEgress",
"ec2:AuthorizeSecurityGroupIngress",
"ec2:CreatePlacementGroup",
"ec2:CancelSpotInstanceRequests",
"ec2:CreateSecurityGroup",
"ec2:CreateTags",
"ec2:CreateVolume",
"ec2:DeletePlacementGroup",
"ec2:DeleteVolume",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeCapacityReservations"
"ec2:DescribeImages",
"ec2:DescribeInstances",
"ec2:DescribeInstanceAttribute",
Expand Down
3 changes: 3 additions & 0 deletions src/dstack/_internal/cli/services/configurators/fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def th(s: str) -> str:
fleet_type = "cloud"
nodes = plan.spec.configuration.nodes or "-"
placement = plan.spec.configuration.placement or InstanceGroupPlacement.ANY
reservation = plan.spec.configuration.reservation
backends = None
if plan.spec.configuration.backends is not None:
backends = ", ".join(b.value for b in plan.spec.configuration.backends)
Expand Down Expand Up @@ -297,6 +298,8 @@ def th(s: str) -> str:
configuration_table.add_row(th("Resources"), resources)
if spot_policy is not None:
configuration_table.add_row(th("Spot policy"), spot_policy)
if reservation is not None:
configuration_table.add_row(th("Reservation"), reservation)

offers_table = Table(box=None)
offers_table.add_column("#")
Expand Down
8 changes: 8 additions & 0 deletions src/dstack/_internal/cli/utils/fleet.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def get_fleets_table(
) -> Table:
table = Table(box=None)
table.add_column("FLEET", no_wrap=True)
if verbose:
table.add_column("RESERVATION")
table.add_column("INSTANCE")
table.add_column("BACKEND")
table.add_column("RESOURCES")
Expand Down Expand Up @@ -61,6 +63,12 @@ def get_fleets_table(
format_date(instance.created),
]

if verbose and i == 0:
row.insert(
1,
fleet.spec.configuration.reservation if i == 0 else "",
)

if verbose:
error = ""
if instance.status == InstanceStatus.TERMINATED and instance.termination_reason:
Expand Down
4 changes: 4 additions & 0 deletions src/dstack/_internal/cli/utils/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def th(s: str) -> str:
props.add_row(th("Creation policy"), creation_policy)
props.add_row(th("Termination policy"), termination_policy)
props.add_row(th("Termination idle time"), termination_idle_time)
props.add_row(th("Reservation"), run_plan.run_spec.configuration.reservation)

offers = Table(box=None)
offers.add_column("#")
Expand Down Expand Up @@ -121,6 +122,8 @@ def get_runs_table(
if verbose:
table.add_column("INSTANCE", no_wrap=True)
table.add_column("RESOURCES")
if verbose:
table.add_column("RESERVATION", no_wrap=True)
table.add_column("PRICE", no_wrap=True)
table.add_column("STATUS", no_wrap=True)
table.add_column("SUBMITTED", style="grey58", no_wrap=True)
Expand Down Expand Up @@ -154,6 +157,7 @@ def get_runs_table(
"BACKEND": f"{jpd.backend.value.replace('remote', 'ssh')} ({jpd.region})",
"INSTANCE": jpd.instance_type.name,
"RESOURCES": jpd.instance_type.resources.pretty_format(include_spot=True),
"RESERVATION": jpd.reservation,
"PRICE": f"${jpd.price:.4}",
}
)
Expand Down
4 changes: 4 additions & 0 deletions src/dstack/_internal/core/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = [
BackendType.AWS,
]
BACKENDS_WITH_RESERVATION_SUPPORT = [
BackendType.AWS,
]

BACKENDS_WITH_GATEWAY_SUPPORT = [
BackendType.AWS,
BackendType.AZURE,
Expand Down
52 changes: 51 additions & 1 deletion src/dstack/_internal/core/backends/aws/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,40 @@ def __init__(self, config: AWSConfig):
def get_offers(
self, requirements: Optional[Requirements] = None
) -> List[InstanceOfferWithAvailability]:
filter = _supported_instances
if requirements and requirements.reservation:
region_to_reservation = {}
for region in self.config.regions:
reservation = aws_resources.get_reservation(
ec2_client=self.session.client("ec2", region_name=region),
reservation_id=requirements.reservation,
instance_count=1,
)
if reservation is not None:
region_to_reservation[region] = reservation

def _supported_instances_with_reservation(offer: InstanceOffer) -> bool:
# Filter: only instance types supported by dstack
if not _supported_instances(offer):
return False
# Filter: Spot instances can't be used with reservations
if offer.instance.resources.spot:
return False
region = offer.region
reservation = region_to_reservation.get(region)
# Filter: only instance types matching the capacity reservation
if not bool(reservation and offer.instance.name == reservation["InstanceType"]):
return False
return True

filter = _supported_instances_with_reservation

offers = get_catalog_offers(
backend=BackendType.AWS,
locations=self.config.regions,
requirements=requirements,
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
extra_filter=_supported_instances,
extra_filter=filter,
)
regions = set(i.region for i in offers)

Expand Down Expand Up @@ -148,6 +176,7 @@ def create_instance(
ec2_client=ec2_client, instance_type=instance_offer.instance.name
)
enable_efa = max_efa_interfaces > 0
is_capacity_block = False
try:
vpc_id, subnet_ids = get_vpc_id_subnet_id_or_error(
ec2_client=ec2_client,
Expand All @@ -160,6 +189,22 @@ def create_instance(
ec2_client=ec2_client,
subnet_ids=subnet_ids,
)
if instance_config.reservation:
reservation = aws_resources.get_reservation(
ec2_client=ec2_client,
reservation_id=instance_config.reservation,
instance_count=1,
)
if reservation is not None:
# Filter out az different from capacity reservation
subnet_id_to_az_map = {
k: v
for k, v in subnet_id_to_az_map.items()
if v == reservation["AvailabilityZone"]
}
if reservation.get("ReservationType") == "capacity-block":
is_capacity_block = True

except botocore.exceptions.ClientError as e:
logger.warning("Got botocore.exceptions.ClientError: %s", e)
raise NoCapacityError()
Expand Down Expand Up @@ -193,6 +238,8 @@ def create_instance(
allocate_public_ip=allocate_public_ip,
placement_group_name=instance_config.placement_group_name,
enable_efa=enable_efa,
reservation_id=instance_config.reservation,
is_capacity_block=is_capacity_block,
)
)
instance = response[0]
Expand All @@ -212,6 +259,7 @@ def create_instance(
internal_ip=instance.private_ip_address,
region=instance_offer.region,
availability_zone=az,
reservation=instance.capacity_reservation_id,
price=instance_offer.price,
username=username,
ssh_port=22,
Expand Down Expand Up @@ -240,6 +288,7 @@ def run_job(
SSHKey(public=project_ssh_public_key.strip()),
],
user=run.user,
reservation=run.run_spec.configuration.reservation,
)
if len(volumes) > 0:
volume = volumes[0]
Expand Down Expand Up @@ -706,6 +755,7 @@ def _supported_instances(offer: InstanceOffer) -> bool:
"g4dn.",
"g5.",
"g6.",
"g6e.",
"gr6.",
"p3.",
"p4d.",
Expand Down
51 changes: 51 additions & 0 deletions src/dstack/_internal/core/backends/aws/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ def create_instances_struct(
allocate_public_ip: bool = True,
placement_group_name: Optional[str] = None,
enable_efa: bool = False,
reservation_id: Optional[str] = None,
is_capacity_block: bool = False,
) -> Dict[str, Any]:
struct: Dict[str, Any] = dict(
BlockDeviceMappings=[
Expand Down Expand Up @@ -173,6 +175,9 @@ def create_instances_struct(
"InstanceInterruptionBehavior": "terminate",
},
}

if is_capacity_block:
struct["InstanceMarketOptions"] = {"MarketType": "capacity-block"}
if enable_efa and not subnet_id:
raise ComputeError("EFA requires subnet")
# AWS allows specifying either NetworkInterfaces for specific subnet_id
Expand Down Expand Up @@ -205,6 +210,11 @@ def create_instances_struct(
"GroupName": placement_group_name,
}

if reservation_id is not None:
struct["CapacityReservationSpecification"] = {
"CapacityReservationTarget": {"CapacityReservationId": reservation_id}
}

return struct


Expand Down Expand Up @@ -596,3 +606,44 @@ def _is_private_subnet_with_internet_egress(
return True

return False


def get_reservation(
ec2_client: botocore.client.BaseClient,
reservation_id: str,
instance_count: int = 0,
instance_types: Optional[List[str]] = None,
is_capacity_block: bool = False,
) -> Optional[Dict[str, Any]]:
filters = [{"Name": "state", "Values": ["active"]}]
if instance_types:
filters.append({"Name": "instance-type", "Values": instance_types})
try:
response = ec2_client.describe_capacity_reservations(
CapacityReservationIds=[reservation_id], Filters=filters
)
Comment on lines +622 to +624
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I pass a reservation_id in an invalid format (e.g. some_res_1234), AWS returns an error:

raise error_class(parsed_response, operation_name) 
                    botocore.exceptions.ClientError: An error occurred     
                    (InvalidCapacityReservationId.Malformed) when calling  
                    the DescribeCapacityReservations operation: Capacity   
                    Reservation ID 'res123456' is malformed

dstack will handle this error and won't crash, but it will be logged at the error log level as an unexpected AWS error. It's recommended to catch such expected errors explicitly; they can then be re-raised as ComputeError.

except botocore.exceptions.ParamValidationError as e:
logger.debug(
"Skipping reservation %s. Parameter validation error: %s", reservation_id, repr(e)
)
return None
except botocore.exceptions.ClientError as e:
error_code = e.response.get("Error", {}).get("Code")
if error_code == "InvalidCapacityReservationId.Malformed":
logger.debug("Skipping reservation %s. Malformed ID.", reservation_id)
return None
if error_code == "InvalidCapacityReservationId.NotFound":
logger.debug(
"Skipping reservation %s. Capacity Reservation not found.", reservation_id
)
return None
raise
reservation = response["CapacityReservations"][0]

if instance_count > 0 and reservation["AvailableInstanceCount"] < instance_count:
return None

if is_capacity_block and reservation["ReservationType"] != "capacity-block":
return None

return reservation
4 changes: 4 additions & 0 deletions src/dstack/_internal/core/models/fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ class InstanceGroupParams(CoreModel):
Optional[InstanceGroupPlacement],
Field(description="The placement of instances: `any` or `cluster`"),
] = None
reservation: Annotated[
Optional[str],
Field(description="The existing reservation for the instances"),
] = None
Comment on lines +131 to +134
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just pointing out another client backward-compatibility issue that we need to address.

resources: Annotated[
Optional[ResourcesSpec],
Field(description="The resources requirements"),
Expand Down
1 change: 1 addition & 0 deletions src/dstack/_internal/core/models/instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class InstanceConfiguration(CoreModel):
instance_id: Optional[str] = None
availability_zone: Optional[str] = None
placement_group_name: Optional[str] = None
reservation: Optional[str] = None

def get_public_keys(self) -> List[str]:
return [ssh_key.public.strip() for ssh_key in self.ssh_keys]
Expand Down
4 changes: 4 additions & 0 deletions src/dstack/_internal/core/models/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ class ProfileParams(CoreModel):
description="The cloud-specific instance types to consider for provisioning (e.g., `[p3.8xlarge, n1-standard-4]`)"
),
]
reservation: Annotated[
Optional[str],
Field(description="The existing reservation for the instances"),
]
spot_policy: Annotated[
Optional[SpotPolicy],
Field(
Expand Down
2 changes: 2 additions & 0 deletions src/dstack/_internal/core/models/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ class Requirements(CoreModel):
resources: ResourcesSpec
max_price: Optional[float]
spot: Optional[bool]
reservation: Optional[str]

def pretty_format(self, resources_only: bool = False):
res = self.resources.pretty_format()
Expand Down Expand Up @@ -211,6 +212,7 @@ class JobProvisioningData(CoreModel):
instance_network: Optional[str] = None
region: str
availability_zone: Optional[str] = None
reservation: Optional[str] = None
price: float
username: str
# ssh_port be different from 22 for some backends.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ def _get_or_create_fleet_model_for_job(
configuration=FleetConfiguration(
name=run.run_spec.run_name,
placement=placement,
reservation=run.run_spec.configuration.reservation,
),
profile=run.run_spec.merged_profile,
autocreated=True,
Expand Down
4 changes: 4 additions & 0 deletions src/dstack/_internal/server/services/fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ async def create_fleet(
pool=pool,
spec=spec,
placement_group_name=placement_group_name,
reservation=spec.configuration.reservation,
instance_num=i,
)
fleet_model.instances.append(instance_model)
Expand All @@ -364,6 +365,7 @@ async def create_fleet_instance_model(
pool: PoolModel,
spec: FleetSpec,
placement_group_name: Optional[str],
reservation: Optional[str],
instance_num: int,
) -> InstanceModel:
profile = spec.merged_profile
Expand All @@ -378,6 +380,7 @@ async def create_fleet_instance_model(
instance_name=f"{spec.configuration.name}-{instance_num}",
instance_num=instance_num,
placement_group_name=placement_group_name,
reservation=reservation,
)
return instance_model

Expand Down Expand Up @@ -738,6 +741,7 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
resources=fleet_spec.configuration.resources or ResourcesSpec(),
max_price=profile.max_price,
spot=get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND),
reservation=fleet_spec.configuration.reservation,
)
return requirements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ def _requirements(self) -> Requirements:
resources=self.run_spec.configuration.resources,
max_price=self.run_spec.merged_profile.max_price,
spot=None if spot_policy == SpotPolicy.AUTO else (spot_policy == SpotPolicy.SPOT),
reservation=self.run_spec.merged_profile.reservation,
)

def _retry(self) -> Optional[Retry]:
Expand Down
6 changes: 6 additions & 0 deletions src/dstack/_internal/server/services/offers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dstack._internal.core.backends import (
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
BACKENDS_WITH_MULTINODE_SUPPORT,
BACKENDS_WITH_RESERVATION_SUPPORT,
)
from dstack._internal.core.backends.base import Backend
from dstack._internal.core.models.backends.base import BackendType
Expand Down Expand Up @@ -53,6 +54,11 @@ async def get_offers_by_requirements(
backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]

if profile.reservation is not None:
if not backend_types:
backend_types = BACKENDS_WITH_RESERVATION_SUPPORT
backend_types = [b for b in backend_types if b in BACKENDS_WITH_RESERVATION_SUPPORT]

# For multi-node, restrict backend and region.
# The default behavior is to provision all nodes in the same backend and region.
if master_job_provisioning_data is not None:
Expand Down
Loading
Loading