Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gcp quota optimization #2187

Merged
6 changes: 3 additions & 3 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1366,8 +1366,7 @@ def _retry_zones(
# instead of trying to provision and failing later.
try:
need_provision = to_provision.cloud.check_quota_available(
to_provision.region, to_provision.instance_type,
to_provision.use_spot)
to_provision)

except Exception as e: # pylint: disable=broad-except
need_provision = True
Expand All @@ -1385,7 +1384,8 @@ def _retry_zones(
raise exceptions.ResourcesUnavailableError(
f'{colorama.Fore.YELLOW}Found no quota for '
f'{to_provision.instance_type} {instance_descriptor} '
f'instances in region {to_provision.region}. '
f'instances in region {to_provision.region} '
f'in {to_provision.cloud}. '
f'{colorama.Style.RESET_ALL}'
f'To request quotas, check the instruction: '
f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.' # pylint: disable=line-too-long
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,14 +698,14 @@ def _get_disk_specs(cls, disk_tier: Optional[str]) -> Dict[str, Any]:

@classmethod
def check_quota_available(cls,
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""Check if AWS quota is available for `instance_type` in `region`.
resources: 'resources_lib.Resources') -> bool:
"""Check if AWS quota is available based on `resources`.

AWS-specific implementation of check_quota_available. The function works by
matching the instance_type to the corresponding AWS quota code, and then using
the boto3 Python API to query the region for the specific quota code.
AWS-specific implementation of check_quota_available. The function
works by matching the `instance_type` to the corresponding AWS quota
code, and then using the boto3 Python API to query the `region` for
the specific quota code (the `instance_type` and `region` as defined
by `resources`).

Returns:
False if the quota is found to be zero, and True otherwise.
Expand All @@ -714,6 +714,10 @@ def check_quota_available(cls,
botocore.exceptions.ClientError: error in Boto3 client request.
"""

instance_type = resources.instance_type
region = resources.region
use_spot = resources.use_spot

from sky.clouds.service_catalog import aws_catalog # pylint: disable=import-outside-toplevel,unused-import

quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
Expand Down
28 changes: 13 additions & 15 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,21 +539,19 @@ def _equal_accelerators(

@classmethod
def check_quota_available(cls,
region: str,
instance_type: str,
use_spot: bool = False) -> bool:
"""Check if quota is available for `instance_type` in `region`.

(Currently, check_quota_available is only implemented for AWS.)
resources: 'resources_lib.Resources') -> bool:
"""Check if quota is available based on `resources`.

The _retry_zones function in cloud_vm_ray_backend goes through different
candidate regions and attempts to provision the requested instance_type
accelerators in the region, until a successful provisioning happens
or all regions with the requested accelerator have been looked at.
Previously, SkyPilot would attempt to provision resources in all of
these regions. However, many regions would have a zero quota or
inadequate quota, meaning these attempted provisions were destined
to fail from the get-go.
candidate regions and attempts to provision the requested
`instance_type` or `accelerator` in the `region`
(the `instance_type` or `accelerator`, and `region`, as defined in
`resources`) until a successful provisioning happens or all regions
with the requested accelerator have been looked at. Previously,
SkyPilot would attempt to provision resources in all of these regions.
However, many regions would have a zero quota or inadequate quota,
meaning these attempted provisions were destined to fail from
the get-go.

Checking the quota is substantially faster than attempting a failed
provision (~1 second vs 30+ seconds) so this function attempts to
Expand All @@ -565,7 +563,7 @@ def check_quota_available(cls,
quota utilization because many cloud providers' APIs don't have a
built-in command for checking the real-time utilization. Checking
real-time utilization is a more difficult endeavor that involves
monitoring etc., so we are holding off on that for now.
observability etc., so we are holding off on that for now.

If at any point the function fails, whether it's because we can't
import the necessary dependencies or a query using a cloud provider's
Expand All @@ -585,7 +583,7 @@ def check_quota_available(cls,
Returns:
False if the quota is found to be zero, and true otherwise.
"""
del region, instance_type, use_spot # unused
del resources # unused

return True

Expand Down
58 changes: 58 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,64 @@ def _get_disk_type(cls, disk_tier: Optional[str]) -> str:
def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())

@classmethod
def check_quota_available(cls, resources: 'resources.Resources') -> bool:
    """Check if GCP quota is available based on `resources`.

    GCP-specific implementation of check_quota_available. The function
    works by matching the `accelerator` to a corresponding GCP quota
    keyword, and then using a GCP CLI command to query the `region` for
    that quota (the `accelerator` and `region` as defined by `resources`).

    The check is best-effort: whenever the quota cannot be determined
    (no accelerator or region is set, no quota code is found for the
    accelerator, the CLI command fails, or its output cannot be parsed),
    True is returned so that provisioning is still attempted. A failing
    CLI command is logged as a warning and not propagated to the caller.

    Returns:
        False if the quota is found to be zero, and True otherwise.
    """

    if not resources.accelerators:
        # TODO(hriday): We currently only support checking quotas for GPUs.
        # For CPU-only instances, we need to try provisioning to check
        # quotas.
        return True

    region = resources.region
    if region is None:
        # Without a concrete region the command below would be formatted
        # with the literal 'None'; skip the check and try provisioning.
        return True

    # Only the first requested accelerator is looked up.
    accelerator = next(iter(resources.accelerators))
    use_spot = resources.use_spot

    from sky.clouds.service_catalog import gcp_catalog  # pylint: disable=import-outside-toplevel

    quota_code = gcp_catalog.get_quota_code(accelerator, use_spot)

    if quota_code is None:
        # Quota code not found in the catalog for the chosen accelerator,
        # try provisioning anyway.
        return True

    # Describe the region, keep the line preceding the quota code and
    # print the first line that mentions 'limit'.
    command = (f'gcloud compute regions describe {region} '
               f'|grep -B 1 "{quota_code}" '
               '| awk \'/limit/ {print; exit}\'')
    try:
        proc = subprocess_utils.run(cmd=command,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

    except subprocess.CalledProcessError as e:
        logger.warning(f'Quota check command failed with error: '
                       f'{e.stderr.decode()}')
        return True

    # Extract quota from output
    # Example output: "- limit: 16.0"
    out = proc.stdout.decode()
    try:
        quota = int(float(out.split('limit:')[-1].strip()))
    except (ValueError, IndexError, AttributeError) as e:
        logger.warning('Parsing the subprocess output failed '
                       f'with error: {e}')
        return True

    # A zero quota means provisioning cannot succeed; any quota greater
    # than zero is worth a provisioning attempt.
    return quota != 0

@classmethod
def query_status(cls, name: str, tag_filters: Dict[str, str],
region: Optional[str], zone: Optional[str],
Expand Down
12 changes: 7 additions & 5 deletions sky/clouds/service_catalog/aws_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@
_image_df = common.read_catalog('aws/images.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)

_quotas_df = common.read_catalog('aws/instance_quota_mapping.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
_quotas_df = common.read_catalog('aws/instance_quota_mapping.csv')


def _get_az_mappings(aws_user_hash: str) -> Optional[pd.DataFrame]:
Expand Down Expand Up @@ -159,9 +158,12 @@ def _get_df() -> pd.DataFrame:


def get_quota_code(instance_type: str, use_spot: bool) -> Optional[str]:
# Get the quota code from the accelerator instance type
# This will be used in the botocore command to check for
# a non-zero quota
"""Get the quota code based on `instance_type` and `use_spot`.

The quota code is fetched from `_quotas_df` based on the instance type
specified, and will then be utilized in a botocore API command in order
to check its quota.
"""

if use_spot:
spot_header = 'SpotInstanceCode'
Expand Down
23 changes: 23 additions & 0 deletions sky/clouds/service_catalog/gcp_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
_image_df = common.read_catalog('gcp/images.csv',
pull_frequency_hours=_PULL_FREQUENCY_HOURS)

_quotas_df = common.read_catalog('gcp/accelerator_quota_mapping.csv')

_TPU_REGIONS = [
'us-central1',
'europe-west4',
Expand Down Expand Up @@ -164,6 +166,27 @@ def _closest_power_of_two(x: int) -> int:
return 1 << ((x - 1).bit_length() - 1)


def get_quota_code(accelerator: str, use_spot: bool) -> Optional[str]:
    """Look up the GCP quota code for `accelerator`.

    Rows of `_quotas_df` whose 'Accelerator' column equals `accelerator`
    are selected; the 'SpotInstanceCode' column is read when `use_spot`
    is set and the 'OnDemandInstanceCode' column otherwise. The code is
    meant to be used in a GCP CLI command that checks for a non-zero
    quota.

    Returns:
        The code from the first matching row, or None if no row matches.
    """
    column = 'SpotInstanceCode' if use_spot else 'OnDemandInstanceCode'
    is_match = _quotas_df['Accelerator'] == accelerator
    codes = _quotas_df.loc[is_match, column].values
    try:
        return codes[0]
    except IndexError:
        # No row for this accelerator in the quota mapping.
        return None


def instance_type_exists(instance_type: str) -> bool:
"""Check the existence of the instance type."""
if instance_type == 'TPU-VM':
Expand Down
55 changes: 52 additions & 3 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import urllib.parse
import uuid
import os
import warnings

import colorama
import jinja2
Expand Down Expand Up @@ -182,16 +183,37 @@ def run_one_test(test: Test) -> Tuple[int, str, str]:


def get_aws_region_for_quota_failover() -> Optional[str]:
    """Find an AWS region where the p3.16xlarge spot quota check fails.

    Iterates over the regions returned by `AWS.regions_with_offering` for
    spot p3.16xlarge instances and asks `AWS.check_quota_available` about
    each one, passing a `sky.Resources` object that describes the request.

    Returns:
        The name of the first region for which the quota check returns a
        falsy value, or None if the check passes in every region.
    """
    candidate_regions = AWS.regions_with_offering(instance_type='p3.16xlarge',
                                                  accelerators=None,
                                                  use_spot=True,
                                                  region=None,
                                                  zone=None)

    for region in candidate_regions:
        # check_quota_available takes a single Resources object; the old
        # region/instance_type/use_spot keyword form is no longer used.
        resources = sky.Resources(cloud=sky.AWS(),
                                  instance_type='p3.16xlarge',
                                  region=region.name,
                                  use_spot=True)
        if not AWS.check_quota_available(resources):
            return region.name

    return None


def get_gcp_region_for_quota_failover() -> Optional[str]:
    """Find a GCP region where the A100-80GB spot quota check fails.

    Walks the regions returned by `GCP.regions_with_offering` for one spot
    A100-80GB accelerator and runs `GCP.check_quota_available` on a
    matching `sky.Resources` request for each.

    Returns:
        The name of the first region for which the quota check returns a
        falsy value, or None if the check passes in every region.
    """
    offerings = GCP.regions_with_offering(instance_type=None,
                                          accelerators={'A100-80GB': 1},
                                          use_spot=True,
                                          region=None,
                                          zone=None)

    for candidate in offerings:
        request = sky.Resources(cloud=sky.GCP(),
                                region=candidate.name,
                                accelerators={'A100-80GB': 1},
                                use_spot=True)
        if GCP.check_quota_available(request):
            # Quota check passed here; keep looking.
            continue
        return candidate.name

    return None
Expand Down Expand Up @@ -2294,6 +2316,10 @@ def test_aws_zero_quota_failover():
region = get_aws_region_for_quota_failover()

if not region:
pytest.xfail(
'Unable to test zero quota failover optimization — quotas '
'for EC2 P3 instances were found on all AWS regions. Is this '
'expected for your account?')
return

test = Test(
Expand All @@ -2306,6 +2332,29 @@ def test_aws_zero_quota_failover():
run_one_test(test)


@pytest.mark.gcp
def test_gcp_zero_quota_failover():
    """Smoke test: launching in a zero-quota GCP region reports no quota.

    Picks a region via `get_gcp_region_for_quota_failover`; if none is
    found the test is marked as an expected failure. Otherwise a spot
    A100-80GB launch in that region is run and its output is grepped for
    "Found no quota".
    """
    cluster = _get_cluster_name()
    zero_quota_region = get_gcp_region_for_quota_failover()

    if not zero_quota_region:
        pytest.xfail(
            'Unable to test zero quota failover optimization — quotas '
            'for A100-80GB GPUs were found on all GCP regions. Is this '
            'expected for your account?')
        return

    launch_cmd = f'sky launch -y -c {cluster} --cloud gcp --region {zero_quota_region} --gpus A100-80GB:1 --use-spot | grep "Found no quota"'
    teardown_cmd = f'sky down -y {cluster}'

    test = Test(
        'gcp-zero-quota-failover',
        [launch_cmd],
        teardown_cmd,
    )
    run_one_test(test)


# ------- Testing user ray cluster --------
def test_user_ray_cluster():
name = _get_cluster_name()
Expand Down