Skip to content

Commit

Permalink
[GCP][Disk] Add supported instance type check for disk tier ultra (sk…
Browse files Browse the repository at this point in the history
…ypilot-org#3935)

* fix

* Update sky/clouds/gcp.py

Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>

* convert to private

* fix unittest

---------

Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>
  • Loading branch information
cblmemo and Michaelvll authored Sep 13, 2024
1 parent 4dd9051 commit 7940552
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 11 deletions.
75 changes: 65 additions & 10 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,21 @@ def make_deploy_resources_variables(
# issue when first booted.
image_id = 'skypilot:cpu-debian-11'

def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
if (r.disk_tier is not None and
r.disk_tier != resources_utils.DiskTier.BEST):
return r.disk_tier
# Failover disk tier from ultra to low.
all_tiers = list(reversed(resources_utils.DiskTier))
start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
while start_index < len(all_tiers):
disk_tier = all_tiers[start_index]
ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
if ok:
return disk_tier
start_index += 1
assert False, 'Low disk tier should always be supported on GCP.'

r = resources
# Find GPU spec, if any.
resources_vars = {
Expand All @@ -437,7 +452,7 @@ def make_deploy_resources_variables(
'custom_resources': None,
'use_spot': r.use_spot,
'gcp_project_id': self.get_project_id(dryrun),
**GCP._get_disk_specs(r.disk_tier),
**GCP._get_disk_specs(_failover_disk_tier()),
}
accelerators = r.accelerators
if accelerators is not None:
Expand Down Expand Up @@ -532,6 +547,10 @@ def _get_feasible_launchable_resources(
) -> 'resources_utils.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
ok, _ = GCP.check_disk_tier(resources.instance_type,
resources.disk_tier)
if not ok:
return resources_utils.FeasibleResources([], [], None)
return resources_utils.FeasibleResources([resources], [], None)

if resources.accelerators is None:
Expand All @@ -544,15 +563,17 @@ def _get_feasible_launchable_resources(
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([], [], None)
else:
r = resources.copy(
cloud=GCP(),
instance_type=host_vm_type,
accelerators=None,
cpus=None,
memory=None,
)
return resources_utils.FeasibleResources([r], [], None)
ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
if not ok:
return resources_utils.FeasibleResources([], [], None)
r = resources.copy(
cloud=GCP(),
instance_type=host_vm_type,
accelerators=None,
cpus=None,
memory=None,
)
return resources_utils.FeasibleResources([r], [], None)

# Find instance candidates to meet user's requirements
assert len(resources.accelerators.items()
Expand Down Expand Up @@ -615,6 +636,10 @@ def _get_feasible_launchable_resources(
else:
host_vm_type = instance_list[0]

ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
if not ok:
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
acc_dict = {acc: acc_count}
r = resources.copy(
cloud=GCP(),
Expand Down Expand Up @@ -916,6 +941,36 @@ def _check_instance_type_accelerators_combination(
resources.instance_type, resources.accelerators, resources.zone,
'gcp')

@classmethod
def check_disk_tier(
cls, instance_type: Optional[str],
disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
if disk_tier != resources_utils.DiskTier.ULTRA or instance_type is None:
return True, ''
# Ultra disk tier (pd-extreme) only support m2, m3 and part of n2
# instance types, so we failover to lower tiers for other instance
# types. Reference:
# https://cloud.google.com/compute/docs/disks/extreme-persistent-disk#machine_shape_support # pylint: disable=line-too-long
series = instance_type.split('-')[0]
if series in ['m2', 'm3', 'n2']:
if series == 'n2':
num_cpus = int(instance_type.split('-')[2])
if num_cpus < 64:
return False, ('n2 series with less than 64 vCPUs are '
'not supported with pd-extreme.')
return True, ''
return False, (f'{series} series is not supported with pd-extreme. '
'Only m2, m3 series and n2 series with 64 or more vCPUs '
'are supported.')

@classmethod
def check_disk_tier_enabled(cls, instance_type: Optional[str],
disk_tier: resources_utils.DiskTier) -> None:
ok, msg = cls.check_disk_tier(instance_type, disk_tier)
if not ok:
with ux_utils.print_exception_no_traceback():
raise exceptions.NotSupportedError(msg)

@classmethod
def _get_disk_type(cls,
disk_tier: Optional[resources_utils.DiskTier]) -> str:
Expand Down
8 changes: 7 additions & 1 deletion sky/clouds/service_catalog/gcp_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sky import exceptions
from sky import sky_logging
from sky.adaptors import common as adaptors_common
from sky.clouds import GCP
from sky.clouds.service_catalog import common
from sky.utils import resources_utils
from sky.utils import ux_utils
Expand Down Expand Up @@ -243,7 +244,6 @@ def get_default_instance_type(
cpus: Optional[str] = None,
memory: Optional[str] = None,
disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
del disk_tier # unused
if cpus is None and memory is None:
cpus = f'{_DEFAULT_NUM_VCPUS}+'
if memory is None:
Expand All @@ -254,6 +254,12 @@ def get_default_instance_type(
f'{family}-' for family in _DEFAULT_INSTANCE_FAMILY)
df = _df[_df['InstanceType'].notna()]
df = df[df['InstanceType'].str.startswith(instance_type_prefix)]

def _filter_disk_type(instance_type: str) -> bool:
valid, _ = GCP.check_disk_tier(instance_type, disk_tier)
return valid

df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
return common.get_instance_type_for_cpus_mem_impl(df, cpus,
memory_gb_or_ratio)

Expand Down

0 comments on commit 7940552

Please sign in to comment.