Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GCP: allow stop/autostop for spot VMs. #2877

Merged
merged 13 commits into from
Dec 28, 2023
15 changes: 14 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3679,7 +3679,7 @@ def _teardown(self,
except filelock.Timeout as e:
raise RuntimeError(
f'Cluster {cluster_name!r} is locked by {lock_path}. '
'Check to see if it is still being launched.') from e
'Check to see if it is still being launched') from e

# --- CloudVMRayBackend Specific APIs ---

Expand Down Expand Up @@ -4278,6 +4278,19 @@ def set_autostop(self,
down: bool = False,
stream_logs: bool = True) -> None:
if idle_minutes_to_autostop is not None:

# Check if we're stopping spot
assert (handle.launched_resources is not None and
handle.launched_resources.cloud is not None), handle
if handle.launched_resources.use_spot:
# This can be triggered by, for example:
# sky launch --cloud aws --use-spot --cpus 2+ -i0 -y
# The cluster will be UP, the launch exited with code 1, and
# any stage after PRE_EXEC is not executed.
cloud = handle.launched_resources.cloud
cloud.check_features_are_supported(
{clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE})
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved

code = autostop_lib.AutostopCodeGen.set_autostop(
idle_minutes_to_autostop, self.NAME, down)
returncode, _, stderr = self.run_on_head(handle,
Expand Down
6 changes: 5 additions & 1 deletion sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ class AWS(clouds.Cloud):
@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return dict()
return {
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {cls._REPR}.'),
}

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
Expand Down
5 changes: 4 additions & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {cls._REPR}.'),
}

@classmethod
Expand Down
16 changes: 8 additions & 8 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,16 @@ class CloudImplementationFeatures(enum.Enum):

Used by Cloud.check_features_are_supported().

Note: If any new feature is added, please check and update
NOTE: If any new feature is added, please check and update
_cloud_unsupported_features in all clouds to make sure the
check_features_are_supported() works as expected.
"""
STOP = 'stop'
AUTOSTOP = 'autostop'
STOP = 'stop' # Includes both stop and autostop.
MULTI_NODE = 'multi-node'
CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
DOCKER_IMAGE = 'docker_image'
SPOT_INSTANCE = 'spot_instance'
STOP_SPOT_INSTANCE = 'stop_spot_instance'
CUSTOM_DISK_TIER = 'custom_disk_tier'
OPEN_PORTS = 'open_ports'

Expand Down Expand Up @@ -460,9 +460,9 @@ def check_features_are_supported(
cls, requested_features: Set[CloudImplementationFeatures]) -> None:
"""Errors out if the cloud does not support all requested features.

For instance, Lambda Cloud does not support autostop, so
For instance, Lambda Cloud does not support stop, so
Lambda.check_features_are_supported({
CloudImplementationFeatures.AUTOSTOP
CloudImplementationFeatures.STOP
}) raises the exception.

Raises:
Expand All @@ -476,9 +476,9 @@ def check_features_are_supported(
(str(cls._REPR).lower(), 'ssh_proxy_command'), None) is not None:
unsupported_features2reason.update({
CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is not supported in {cls._REPR} when proxy '
'command is set. Please remove proxy command in the config.'
),
f'Docker image is currently not supported on {cls._REPR} '
'when proxy command is set. Please remove proxy command in '
'the config.'),
})

unsupported_features = set(unsupported_features2reason.keys())
Expand Down
12 changes: 8 additions & 4 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,19 @@ def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {cls._REPR}. '
(f'Docker image is currently not supported on {cls._REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
(f'Custom disk tier is not supported in {cls._REPR}.'),
(f'Custom disk tier is currently not supported on {cls._REPR}.'
),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {cls._REPR}.'),
(f'Opening ports is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {cls._REPR}.'),
}

@classmethod
Expand Down
5 changes: 3 additions & 2 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,12 @@ class Kubernetes(clouds.Cloud):
# https://kubernetes.io/blog/2022/12/05/forensic-container-checkpointing-alpha/ # pylint: disable=line-too-long
clouds.CloudImplementationFeatures.STOP: 'Kubernetes does not '
'support stopping VMs.',
clouds.CloudImplementationFeatures.AUTOSTOP: 'Kubernetes does not '
'support stopping VMs.',
clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are '
'not supported in '
'Kubernetes.',
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on Kubernetes.'
),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: 'Custom disk '
'tiers are not '
'supported in '
Expand Down
10 changes: 6 additions & 4 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@ class Lambda(clouds.Cloud):
# STOP/AUTOSTOP: The Lambda cloud provider does not support stopping VMs.
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.AUTOSTOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is not supported in {_REPR}. '
f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the `run` section in task.yaml.'
),
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {_REPR}.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is currently not supported on {_REPR}.',
}

@classmethod
Expand Down
2 changes: 0 additions & 2 deletions sky/clouds/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ class Local(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP:
('Local cloud does not support stopping instances.'),
clouds.CloudImplementationFeatures.AUTOSTOP:
('Local cloud does not support stopping instances.'),
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
('Migrating disk is not supported for Local.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
Expand Down
9 changes: 6 additions & 3 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,16 @@ def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {cls._REPR}. '
(f'Docker image is currently not supported on {cls._REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {cls._REPR}.'),
(f'Opening ports is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {cls._REPR}.'),
}

@classmethod
Expand Down
9 changes: 6 additions & 3 deletions sky/clouds/scp.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,20 @@ class SCP(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.MULTI_NODE: _MULTI_NODE,
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {_REPR}.'),
(f'Migrating disk is currently not supported on {_REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {_REPR}. '
(f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
(f'Spot instances are not supported in {_REPR}.'),
clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE:
('Stopping spot instances is currently not supported on'
f' {_REPR}.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
(f'Custom disk tiers are not supported in {_REPR}.'),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {_REPR}.'),
(f'Opening ports is currently not supported on {_REPR}.'),
}

_INDENT_PREFIX = ' '
Expand Down
53 changes: 39 additions & 14 deletions sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
the instances will stop, while the disks will still be charged. Those
disks will be reattached when restarting the cluster.

Currently, spot instance clusters cannot be stopped.
Currently, spot instance clusters cannot be stopped (except for GCP, which
does allow disk contents to be preserved when stopping spot VMs).

Args:
cluster_name: name of the cluster to stop.
Expand Down Expand Up @@ -325,11 +326,20 @@ def stop(cluster_name: str, purge: bool = False) -> None:
'is not supported.')
# Check cloud supports stopping instances
cloud = handle.launched_resources.cloud
assert cloud is not None, handle
cloud.check_features_are_supported(
{clouds.CloudImplementationFeatures.STOP})
if handle.launched_resources.use_spot:
# Check cloud supports stopping spot instances
supports_stop_spot = True
try:
cloud.check_features_are_supported(
{clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE})
except exceptions.NotSupportedError:
supports_stop_spot = False
# Allow GCP spot to be stopped since it preserves disk:
# https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long
if handle.launched_resources.use_spot and not supports_stop_spot:
# Disable spot instances to be stopped.
# TODO(suquark): enable GCP+spot to be stopped in the future.
raise exceptions.NotSupportedError(
f'{colorama.Fore.YELLOW}Stopping cluster '
f'{cluster_name!r}... skipped.{colorama.Style.RESET_ALL}\n'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about the following for simplicity?

features = {clouds.CloudImplementationFeatures.STOP}
if handle.launched_resources.use_spot:
    features.add(clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE)
cloud.check_features_are_supported(features)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since check_features_are_supported() raises an error I found it harder to read. For example, if a cloud doesn't support stopping spot and here the cluster is an on-demand one, we do not want to perform a check on STOP_SPOT_INSTANCE or raise.

Expand Down Expand Up @@ -437,31 +447,46 @@ def autostop(
operation=operation,
)
backend = backend_utils.get_backend_from_handle(handle)

if tpu_utils.is_tpu_vm_pod(handle.launched_resources) and not down:
# Stopping TPU VM Pods is not supported. Example error:
# "StopNode" is not supported on pod nodes: "v2-32"
# Autodown is supported.
# Reference: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long
raise exceptions.NotSupportedError(
f'{operation} cluster {cluster_name!r} with TPU VM Pod '
'is not supported.')

# Check cloud supports stopping spot instances
cloud = handle.launched_resources.cloud
assert cloud is not None, handle
supports_stop_spot = True
try:
cloud.check_features_are_supported(
{clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE})
except exceptions.NotSupportedError:
supports_stop_spot = False

if not isinstance(backend, backends.CloudVmRayBackend):
raise exceptions.NotSupportedError(
f'{operation} cluster {cluster_name!r} with backend '
f'{backend.__class__.__name__!r} is not supported.')
elif handle.launched_resources.use_spot and not down and not is_cancel:
elif (handle.launched_resources.use_spot and not down and not is_cancel and
not supports_stop_spot):
# Disable spot instances to be autostopped.
# TODO(ewzeng): allow autostop for spot when stopping is supported.
#
# Exception: Allow GCP spot to be stopped since it preserves disk:
# https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long
raise exceptions.NotSupportedError(
f'{colorama.Fore.YELLOW}Scheduling autostop on cluster '
f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
' Stopping spot instances is not supported as the attached '
'disks will be lost.')

if tpu_utils.is_tpu_vm_pod(handle.launched_resources):
# Reference:
# https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long
raise exceptions.NotSupportedError(
f'{operation} cluster {cluster_name!r} with TPU VM Pod '
'is not supported.')

# Check autostop is implemented for cloud
cloud = handle.launched_resources.cloud
if not down and idle_minutes >= 0:
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
cloud.check_features_are_supported(
{clouds.CloudImplementationFeatures.AUTOSTOP})
{clouds.CloudImplementationFeatures.STOP})

usage_lib.record_cluster_name_for_current_operation(cluster_name)
backend.set_autostop(handle, idle_minutes, down)
Expand Down
17 changes: 6 additions & 11 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,23 +255,18 @@ def execute(
f'{colorama.Style.RESET_ALL}')
idle_minutes_to_autostop = 1
stages.remove(Stage.DOWN)

if not down:
requested_features.add(
clouds.CloudImplementationFeatures.AUTOSTOP)
# TODO(ewzeng): allow autostop for spot when stopping is
# supported.
if task.use_spot:
with ux_utils.print_exception_no_traceback():
raise ValueError(
'Autostop is not supported for spot instances.')
requested_features.add(clouds.CloudImplementationFeatures.STOP)
# NOTE: in general we may not have sufficiently specified info
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
# the backend.

elif idle_minutes_to_autostop is not None:
# TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend.
with ux_utils.print_exception_no_traceback():
raise ValueError(
f'Backend {backend.NAME} does not support autostop, please try '
f'{backends.CloudVmRayBackend.NAME}')
f'Backend {backend.NAME} does not support autostop, please try'
f' {backends.CloudVmRayBackend.NAME}')

if Stage.CLONE_DISK in stages:
task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name,
Expand Down
29 changes: 29 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -1866,6 +1866,35 @@ def test_use_spot(generic_cloud: str):
run_one_test(test)


@pytest.mark.gcp
def test_stop_gcp_spot():
    """Smoke test: a GCP spot cluster can be stopped, autostopped, restarted."""
    name = _get_cluster_name()

    # Commands that recur in the sequence below; build each one once.
    restart = f'sky start {name} -y'
    # `myfile` is created by the initial launch command, so listing it after a
    # restart looks for the file on the restarted cluster.
    list_file = f'sky exec {name} -- ls myfile'
    wait_for_autostop = 'sleep 90'
    expect_stopped = f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED'

    commands = [
        f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile',
        # An explicit stop should go through:
        f'sky stop {name} -y',
        restart,
        list_file,
        f'sky logs {name} 2 --status',
        # Autostop scheduled on the running cluster:
        f'sky autostop {name} -i0 -y',
        wait_for_autostop,
        expect_stopped,
        restart,
        list_file,
        f'sky logs {name} 3 --status',
        # The -i option at launch should go through:
        f'sky launch -c {name} -i0 -y',
        wait_for_autostop,
        expect_stopped,
    ]
    teardown = f'sky down -y {name}'

    run_one_test(Test('stop_gcp_spot', commands, teardown))


# ---------- Testing managed spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
Expand Down