diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index 9742f13c45f..b1015c92979 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -424,6 +424,21 @@ def make_deploy_resources_variables(
             # issue when first booted.
             image_id = 'skypilot:cpu-debian-11'
 
+        def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
+            if (r.disk_tier is not None and
+                    r.disk_tier != resources_utils.DiskTier.BEST):
+                return r.disk_tier
+            # Failover disk tier from ultra to low.
+            all_tiers = list(reversed(resources_utils.DiskTier))
+            start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
+            while start_index < len(all_tiers):
+                disk_tier = all_tiers[start_index]
+                ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
+                if ok:
+                    return disk_tier
+                start_index += 1
+            assert False, 'Low disk tier should always be supported on GCP.'
+
         r = resources
         # Find GPU spec, if any.
         resources_vars = {
@@ -437,7 +452,7 @@ def make_deploy_resources_variables(
             'custom_resources': None,
             'use_spot': r.use_spot,
             'gcp_project_id': self.get_project_id(dryrun),
-            **GCP._get_disk_specs(r.disk_tier),
+            **GCP._get_disk_specs(_failover_disk_tier()),
         }
         accelerators = r.accelerators
         if accelerators is not None:
@@ -532,6 +547,10 @@ def _get_feasible_launchable_resources(
     ) -> 'resources_utils.FeasibleResources':
         if resources.instance_type is not None:
             assert resources.is_launchable(), resources
+            ok, _ = GCP.check_disk_tier(resources.instance_type,
+                                        resources.disk_tier)
+            if not ok:
+                return resources_utils.FeasibleResources([], [], None)
             return resources_utils.FeasibleResources([resources], [], None)
 
         if resources.accelerators is None:
@@ -544,15 +563,17 @@ def _get_feasible_launchable_resources(
                 # TODO: Add hints to all return values in this method to help
                 # users understand why the resources are not launchable.
                 return resources_utils.FeasibleResources([], [], None)
-            else:
-                r = resources.copy(
-                    cloud=GCP(),
-                    instance_type=host_vm_type,
-                    accelerators=None,
-                    cpus=None,
-                    memory=None,
-                )
-                return resources_utils.FeasibleResources([r], [], None)
+            ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
+            if not ok:
+                return resources_utils.FeasibleResources([], [], None)
+            r = resources.copy(
+                cloud=GCP(),
+                instance_type=host_vm_type,
+                accelerators=None,
+                cpus=None,
+                memory=None,
+            )
+            return resources_utils.FeasibleResources([r], [], None)
 
         # Find instance candidates to meet user's requirements
         assert len(resources.accelerators.items()
@@ -615,6 +636,10 @@ def _get_feasible_launchable_resources(
         else:
             host_vm_type = instance_list[0]
 
+        ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
+        if not ok:
+            return resources_utils.FeasibleResources([], fuzzy_candidate_list,
+                                                     None)
         acc_dict = {acc: acc_count}
         r = resources.copy(
             cloud=GCP(),
@@ -916,6 +941,36 @@ def _check_instance_type_accelerators_combination(
             resources.instance_type, resources.accelerators, resources.zone,
             'gcp')
 
+    @classmethod
+    def check_disk_tier(
+            cls, instance_type: Optional[str],
+            disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
+        if disk_tier != resources_utils.DiskTier.ULTRA or instance_type is None:
+            return True, ''
+        # Ultra disk tier (pd-extreme) only support m2, m3 and part of n2
+        # instance types, so we failover to lower tiers for other instance
+        # types. Reference:
+        # https://cloud.google.com/compute/docs/disks/extreme-persistent-disk#machine_shape_support  # pylint: disable=line-too-long
+        series = instance_type.split('-')[0]
+        if series in ['m2', 'm3', 'n2']:
+            if series == 'n2':
+                num_cpus = int(instance_type.split('-')[2])
+                if num_cpus < 64:
+                    return False, ('n2 series with less than 64 vCPUs are '
+                                   'not supported with pd-extreme.')
+            return True, ''
+        return False, (f'{series} series is not supported with pd-extreme. '
+                       'Only m2, m3 series and n2 series with 64 or more vCPUs '
+                       'are supported.')
+
+    @classmethod
+    def check_disk_tier_enabled(cls, instance_type: Optional[str],
+                                disk_tier: resources_utils.DiskTier) -> None:
+        ok, msg = cls.check_disk_tier(instance_type, disk_tier)
+        if not ok:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.NotSupportedError(msg)
+
     @classmethod
     def _get_disk_type(cls,
                        disk_tier: Optional[resources_utils.DiskTier]) -> str:
diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py
index 2b7c6a23a9f..f861b51920e 100644
--- a/sky/clouds/service_catalog/gcp_catalog.py
+++ b/sky/clouds/service_catalog/gcp_catalog.py
@@ -9,6 +9,7 @@
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.clouds import GCP
 from sky.clouds.service_catalog import common
 from sky.utils import resources_utils
 from sky.utils import ux_utils
@@ -243,7 +244,6 @@ def get_default_instance_type(
         cpus: Optional[str] = None,
         memory: Optional[str] = None,
         disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
-    del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -254,6 +254,12 @@ def get_default_instance_type(
         f'{family}-' for family in _DEFAULT_INSTANCE_FAMILY)
     df = _df[_df['InstanceType'].notna()]
     df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
+
+    def _filter_disk_type(instance_type: str) -> bool:
+        valid, _ = GCP.check_disk_tier(instance_type, disk_tier)
+        return valid
+
+    df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
                                                       memory_gb_or_ratio)
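
For reviewers, a minimal usage sketch (not part of the diff) of the new `GCP.check_disk_tier` classmethod added in `sky/clouds/gcp.py`. The instance type names are ordinary GCP machine shapes chosen only for illustration; the check itself inspects just the series prefix and, for n2, the vCPU count encoded in the name.

```python
# Sketch of the expected behaviour of GCP.check_disk_tier, assuming the
# modules touched by this diff are importable as shown.
from sky.clouds import GCP
from sky.utils import resources_utils

# m2/m3 machines support pd-extreme, so the ULTRA tier passes the check.
ok, _ = GCP.check_disk_tier('m3-ultramem-32', resources_utils.DiskTier.ULTRA)
assert ok

# n2 machines need 64+ vCPUs for pd-extreme; smaller shapes are rejected
# with a human-readable reason.
ok, msg = GCP.check_disk_tier('n2-standard-8', resources_utils.DiskTier.ULTRA)
assert not ok
print(msg)

# Any tier other than ULTRA is accepted unconditionally by this check.
ok, _ = GCP.check_disk_tier('n2-standard-8', resources_utils.DiskTier.BEST)
assert ok
```

This is the same predicate that `_failover_disk_tier()` walks down from ULTRA until it finds a tier the instance type can attach, and that `check_disk_tier_enabled` turns into a `NotSupportedError` when the user explicitly requested an unsupported combination.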
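Similarly, a small sketch of how the `_filter_disk_type` closure added to `gcp_catalog.get_default_instance_type` narrows the candidate set; the toy DataFrame below is a stand-in for the real catalog `_df` and is purely illustrative.

```python
# Illustration only: a toy catalog in place of the real _df, showing how the
# disk-tier filter drops instance types that cannot attach pd-extreme.
import pandas as pd

from sky.clouds import GCP
from sky.utils import resources_utils

catalog = pd.DataFrame(
    {'InstanceType': ['e2-standard-8', 'n2-standard-8', 'n2-standard-96']})
disk_tier = resources_utils.DiskTier.ULTRA


def _filter_disk_type(instance_type: str) -> bool:
    valid, _ = GCP.check_disk_tier(instance_type, disk_tier)
    return valid


# Only n2-standard-96 survives: e2 is not an m2/m3/n2 series, and the small
# n2 shape has fewer than 64 vCPUs.
print(catalog.loc[catalog['InstanceType'].apply(_filter_disk_type)])
```

This is why `del disk_tier  # unused` is removed: the default instance type picked for an ULTRA request must itself be able to attach pd-extreme.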