From dd76f04535aad433921b75761a9eb71476e29f24 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sat, 16 Dec 2023 10:22:50 -0800 Subject: [PATCH 01/10] GCP: allow stop/autostop for spot VMs. --- sky/core.py | 32 +++++++++++++++++++------------- tests/test_smoke.py | 24 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/sky/core.py b/sky/core.py index 862856abc79..2384d278568 100644 --- a/sky/core.py +++ b/sky/core.py @@ -293,7 +293,8 @@ def stop(cluster_name: str, purge: bool = False) -> None: the instances will stop, while the disks will still be charged. Those disks will be reattached when restarting the cluster. - Currently, spot instance clusters cannot be stopped. + Currently, spot instance clusters cannot be stopped (except for GCP, which + does allow disk contents to be preserved when stopping spot VMs). Args: cluster_name: name of the cluster to stop. @@ -327,9 +328,11 @@ def stop(cluster_name: str, purge: bool = False) -> None: cloud = handle.launched_resources.cloud cloud.check_features_are_supported( {clouds.CloudImplementationFeatures.STOP}) - if handle.launched_resources.use_spot: + # Allow GCP spot to be stopped since it preserves disk: + # https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long + if handle.launched_resources.use_spot and not cloud.is_same_cloud( + clouds.GCP()): # Disable spot instances to be stopped. - # TODO(suquark): enable GCP+spot to be stopped in the future. raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Stopping cluster ' f'{cluster_name!r}... skipped.{colorama.Style.RESET_ALL}\n' @@ -437,26 +440,29 @@ def autostop( operation=operation, ) backend = backend_utils.get_backend_from_handle(handle) + + if tpu_utils.is_tpu_vm_pod(handle.launched_resources): + # Reference: + # https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long + raise exceptions.NotSupportedError( + f'{operation} cluster {cluster_name!r} with TPU VM Pod ' + 'is not supported.') + if not isinstance(backend, backends.CloudVmRayBackend): raise exceptions.NotSupportedError( f'{operation} cluster {cluster_name!r} with backend ' f'{backend.__class__.__name__!r} is not supported.') - elif handle.launched_resources.use_spot and not down and not is_cancel: + elif (handle.launched_resources.use_spot and not down and not is_cancel and + not handle.launched_resources.cloud.is_same_cloud(clouds.GCP())): # Disable spot instances to be autostopped. - # TODO(ewzeng): allow autostop for spot when stopping is supported. + # + # Exception: Allow GCP spot to be stopped since it preserves disk: + # https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Scheduling autostop on cluster ' f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n' ' Stopping spot instances is not supported as the attached ' 'disks will be lost.') - - if tpu_utils.is_tpu_vm_pod(handle.launched_resources): - # Reference: - # https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long - raise exceptions.NotSupportedError( - f'{operation} cluster {cluster_name!r} with TPU VM Pod ' - 'is not supported.') - # Check autostop is implemented for cloud cloud = handle.launched_resources.cloud if not down and idle_minutes >= 0: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1963b32f119..281880385fc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1866,6 +1866,30 @@ def test_use_spot(generic_cloud: str): run_one_test(test) +@pytest.mark.gcp +def test_stop_gcp_spot(): + """Test GCP spot can be stopped, autostopped, restarted.""" + name = _get_cluster_name() + test = Test( + 'stop_gcp_spot', + [ + f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', + f'sky stop {name} -y', + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 2 --status', + f'sky autostop {name} -i0 -y', + 'sleep 90', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + f'sky start {name} -y', + f'sky exec {name} -- ls myfile', + f'sky logs {name} 3 --status', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + # ---------- Testing managed spot ---------- @pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances From 64a4f9b997baf00fc0a3d7d65b44c9d722927770 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Mon, 18 Dec 2023 21:11:00 -0800 Subject: [PATCH 02/10] CloudImplementationFeatures.STOP_SPOT_INSTANCE --- sky/clouds/aws.py | 6 +++++- sky/clouds/azure.py | 5 ++++- sky/clouds/cloud.py | 9 +++++---- sky/clouds/ibm.py | 12 ++++++++---- sky/clouds/kubernetes.py | 3 +++ sky/clouds/lambda_cloud.py | 9 ++++++--- sky/clouds/oci.py | 9 ++++++--- sky/clouds/scp.py | 9 ++++++--- 8 files changed, 43 insertions(+), 19 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index b524c3a9d96..a0457d2fd0f 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -116,7 +116,11 @@ class AWS(clouds.Cloud): @classmethod def _cloud_unsupported_features( cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return dict() + return { + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {cls._REPR}.'), + } @classmethod def max_cluster_name_length(cls) -> Optional[int]: diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 6046d4b3805..2d0d6267f1e 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -65,7 +65,10 @@ def _cloud_unsupported_features( cls) -> Dict[clouds.CloudImplementationFeatures, str]: return { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: - (f'Migrating disk is not supported in {cls._REPR}.'), + (f'Migrating disk is currently not supported on {cls._REPR}.'), + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {cls._REPR}.'), } @classmethod diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 98f0048b873..7c2337fd6ce 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -30,7 +30,7 @@ class CloudImplementationFeatures(enum.Enum): Used by Cloud.check_features_are_supported(). - Note: If any new feature is added, please check and update + NOTE: If any new feature is added, please check and update _cloud_unsupported_features in all clouds to make sure the check_features_are_supported() works as expected. """ @@ -40,6 +40,7 @@ class CloudImplementationFeatures(enum.Enum): CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster' DOCKER_IMAGE = 'docker_image' SPOT_INSTANCE = 'spot_instance' + STOP_SPOT_INSTANCE = 'stop_spot_instance' CUSTOM_DISK_TIER = 'custom_disk_tier' OPEN_PORTS = 'open_ports' @@ -476,9 +477,9 @@ def check_features_are_supported( (str(cls._REPR).lower(), 'ssh_proxy_command'), None) is not None: unsupported_features2reason.update({ CloudImplementationFeatures.DOCKER_IMAGE: ( - f'Docker image is not supported in {cls._REPR} when proxy ' - 'command is set. Please remove proxy command in the config.' - ), + f'Docker image is currently not supported on {cls._REPR} ' + 'when proxy command is set. Please remove proxy command in ' + 'the config.'), }) unsupported_features = set(unsupported_features2reason.keys()) diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index 8b3376784e7..29e26e7e989 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -38,15 +38,19 @@ def _cloud_unsupported_features( cls) -> Dict[clouds.CloudImplementationFeatures, str]: return { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: - (f'Migrating disk is not supported in {cls._REPR}.'), + (f'Migrating disk is currently not supported on {cls._REPR}.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: - (f'Docker image is not supported in {cls._REPR}. ' + (f'Docker image is currently not supported on {cls._REPR}. ' 'You can try running docker command inside the ' '`run` section in task.yaml.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: - (f'Custom disk tier is not supported in {cls._REPR}.'), + (f'Custom disk tier is currently not supported on {cls._REPR}.' + ), clouds.CloudImplementationFeatures.OPEN_PORTS: - (f'Opening ports is not supported in {cls._REPR}.'), + (f'Opening ports is currently not supported on {cls._REPR}.'), + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {cls._REPR}.'), } @classmethod diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index fede03c9044..841c8d7fdbc 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -59,6 +59,9 @@ class Kubernetes(clouds.Cloud): clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are ' 'not supported in ' 'Kubernetes.', + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on Kubernetes.' + ), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: 'Custom disk ' 'tiers are not ' 'supported in ' diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index fbbcd8e8b5f..41420afe93b 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -37,14 +37,17 @@ class Lambda(clouds.Cloud): _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.', clouds.CloudImplementationFeatures.AUTOSTOP: 'Lambda cloud does not support stopping VMs.', - clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is not supported in {_REPR}.', + clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.', clouds.CloudImplementationFeatures.DOCKER_IMAGE: ( - f'Docker image is not supported in {_REPR}. ' + f'Docker image is currently not supported on {_REPR}. ' 'You can try running docker command inside the `run` section in task.yaml.' ), clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.', + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {_REPR}.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.', - clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is not supported in {_REPR}.', + clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is currently not supported on {_REPR}.', } @classmethod diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index ed1b129d86e..a8978d26bac 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -46,13 +46,16 @@ def _cloud_unsupported_features( cls) -> Dict[clouds.CloudImplementationFeatures, str]: return { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: - (f'Migrating disk is not supported in {cls._REPR}.'), + (f'Migrating disk is currently not supported on {cls._REPR}.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: - (f'Docker image is not supported in {cls._REPR}. ' + (f'Docker image is currently not supported on {cls._REPR}. ' 'You can try running docker command inside the ' '`run` section in task.yaml.'), clouds.CloudImplementationFeatures.OPEN_PORTS: - (f'Opening ports is not supported in {cls._REPR}.'), + (f'Opening ports is currently not supported on {cls._REPR}.'), + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {cls._REPR}.'), } @classmethod diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index be792bfe0c1..50e6c7de4bb 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -44,17 +44,20 @@ class SCP(clouds.Cloud): _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.MULTI_NODE: _MULTI_NODE, clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: - (f'Migrating disk is not supported in {_REPR}.'), + (f'Migrating disk is currently not supported on {_REPR}.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: - (f'Docker image is not supported in {_REPR}. ' + (f'Docker image is currently not supported on {_REPR}. ' 'You can try running docker command inside the ' '`run` section in task.yaml.'), clouds.CloudImplementationFeatures.SPOT_INSTANCE: (f'Spot instances are not supported in {_REPR}.'), + clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: + ('Stopping spot instances is currently not supported on' + f' {_REPR}.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: (f'Custom disk tiers are not supported in {_REPR}.'), clouds.CloudImplementationFeatures.OPEN_PORTS: - (f'Opening ports is not supported in {_REPR}.'), + (f'Opening ports is currently not supported on {_REPR}.'), } _INDENT_PREFIX = ' ' From 6eb7ed80ed0f1393ff04c7feeea733c661051525 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Mon, 18 Dec 2023 21:11:07 -0800 Subject: [PATCH 03/10] update core.py --- sky/core.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/sky/core.py b/sky/core.py index 2384d278568..8aab4084c68 100644 --- a/sky/core.py +++ b/sky/core.py @@ -326,12 +326,19 @@ def stop(cluster_name: str, purge: bool = False) -> None: 'is not supported.') # Check cloud supports stopping instances cloud = handle.launched_resources.cloud + assert cloud is not None, handle cloud.check_features_are_supported( {clouds.CloudImplementationFeatures.STOP}) + # Check cloud supports stopping spot instances + supports_stop_spot = True + try: + cloud.check_features_are_supported( + {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) + except exceptions.NotSupportedError: + supports_stop_spot = False # Allow GCP spot to be stopped since it preserves disk: # https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long - if handle.launched_resources.use_spot and not cloud.is_same_cloud( - clouds.GCP()): + if handle.launched_resources.use_spot and not supports_stop_spot: # Disable spot instances to be stopped. raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Stopping cluster ' @@ -448,12 +455,22 @@ def autostop( f'{operation} cluster {cluster_name!r} with TPU VM Pod ' 'is not supported.') + # Check cloud supports stopping spot instances + cloud = handle.launched_resources.cloud + assert cloud is not None, handle + supports_stop_spot = True + try: + cloud.check_features_are_supported( + {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) + except exceptions.NotSupportedError: + supports_stop_spot = False + if not isinstance(backend, backends.CloudVmRayBackend): raise exceptions.NotSupportedError( f'{operation} cluster {cluster_name!r} with backend ' f'{backend.__class__.__name__!r} is not supported.') elif (handle.launched_resources.use_spot and not down and not is_cancel and - not handle.launched_resources.cloud.is_same_cloud(clouds.GCP())): + not supports_stop_spot): # Disable spot instances to be autostopped. # # Exception: Allow GCP spot to be stopped since it preserves disk: From 24669527b409d97289b0f08efd72e1bccc5c9733 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Thu, 21 Dec 2023 22:07:12 -0800 Subject: [PATCH 04/10] changes --- sky/backends/cloud_vm_ray_backend.py | 15 ++++++++++++++- sky/clouds/cloud.py | 7 +++---- sky/clouds/kubernetes.py | 2 -- sky/clouds/lambda_cloud.py | 1 - sky/clouds/local.py | 2 -- sky/core.py | 10 ++++++---- sky/execution.py | 16 ++++++---------- tests/test_smoke.py | 5 +++++ 8 files changed, 34 insertions(+), 24 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 00aa7ce0d70..331e96e5685 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3679,7 +3679,7 @@ def _teardown(self, except filelock.Timeout as e: raise RuntimeError( f'Cluster {cluster_name!r} is locked by {lock_path}. ' - 'Check to see if it is still being launched.') from e + 'Check to see if it is still being launched') from e # --- CloudVMRayBackend Specific APIs --- @@ -4278,6 +4278,19 @@ def set_autostop(self, down: bool = False, stream_logs: bool = True) -> None: if idle_minutes_to_autostop is not None: + + # Check if we're stopping spot + assert (handle.launched_resources is not None and + handle.launched_resources.cloud is not None), handle + if handle.launched_resources.use_spot: + # This can be triggered by, for example: + # sky launch --cloud aws --use-spot --cpus 2+ -i0 -y + # The cluster will be UP, the launch exited with code 1, and + # any stage after PRE_EXEC is not executed. + cloud = handle.launched_resources.cloud + cloud.check_features_are_supported( + {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) + code = autostop_lib.AutostopCodeGen.set_autostop( idle_minutes_to_autostop, self.NAME, down) returncode, _, stderr = self.run_on_head(handle, diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 7c2337fd6ce..dc5c5874929 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -34,8 +34,7 @@ class CloudImplementationFeatures(enum.Enum): _cloud_unsupported_features in all clouds to make sure the check_features_are_supported() works as expected. """ - STOP = 'stop' - AUTOSTOP = 'autostop' + STOP = 'stop' # Includes both stop and autostop. MULTI_NODE = 'multi-node' CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster' DOCKER_IMAGE = 'docker_image' @@ -461,9 +460,9 @@ def check_features_are_supported( cls, requested_features: Set[CloudImplementationFeatures]) -> None: """Errors out if the cloud does not support all requested features. - For instance, Lambda Cloud does not support autostop, so + For instance, Lambda Cloud does not support stop, so Lambda.check_features_are_supported({ - CloudImplementationFeatures.AUTOSTOP + CloudImplementationFeatures.STOP }) raises the exception. Raises: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 841c8d7fdbc..e762bf6edfb 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -54,8 +54,6 @@ class Kubernetes(clouds.Cloud): # https://kubernetes.io/blog/2022/12/05/forensic-container-checkpointing-alpha/ # pylint: disable=line-too-long clouds.CloudImplementationFeatures.STOP: 'Kubernetes does not ' 'support stopping VMs.', - clouds.CloudImplementationFeatures.AUTOSTOP: 'Kubernetes does not ' - 'support stopping VMs.', clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are ' 'not supported in ' 'Kubernetes.', diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 41420afe93b..c339259f6c7 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -36,7 +36,6 @@ class Lambda(clouds.Cloud): # STOP/AUTOSTOP: The Lambda cloud provider does not support stopping VMs. _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.', - clouds.CloudImplementationFeatures.AUTOSTOP: 'Lambda cloud does not support stopping VMs.', clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.', clouds.CloudImplementationFeatures.DOCKER_IMAGE: ( f'Docker image is currently not supported on {_REPR}. ' diff --git a/sky/clouds/local.py b/sky/clouds/local.py index 4cd5ab02639..218d1483b74 100644 --- a/sky/clouds/local.py +++ b/sky/clouds/local.py @@ -30,8 +30,6 @@ class Local(clouds.Cloud): _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.STOP: ('Local cloud does not support stopping instances.'), - clouds.CloudImplementationFeatures.AUTOSTOP: - ('Local cloud does not support stopping instances.'), clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: ('Migrating disk is not supported for Local.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: diff --git a/sky/core.py b/sky/core.py index 8aab4084c68..bf8d32d82bd 100644 --- a/sky/core.py +++ b/sky/core.py @@ -448,9 +448,11 @@ def autostop( ) backend = backend_utils.get_backend_from_handle(handle) - if tpu_utils.is_tpu_vm_pod(handle.launched_resources): - # Reference: - # https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long + if tpu_utils.is_tpu_vm_pod(handle.launched_resources) and not down: + # Stopping TPU VM Pods is not supported. Example error: + # "StopNode" is not supported on pod nodes: "v2-32" + # Autodown is supported. + # Reference: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long raise exceptions.NotSupportedError( f'{operation} cluster {cluster_name!r} with TPU VM Pod ' 'is not supported.') @@ -484,7 +486,7 @@ def autostop( cloud = handle.launched_resources.cloud if not down and idle_minutes >= 0: cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.AUTOSTOP}) + {clouds.CloudImplementationFeatures.STOP}) usage_lib.record_cluster_name_for_current_operation(cluster_name) backend.set_autostop(handle, idle_minutes, down) diff --git a/sky/execution.py b/sky/execution.py index a7088aa4dd6..3b3e1ef6dcc 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -255,23 +255,19 @@ def execute( f'{colorama.Style.RESET_ALL}') idle_minutes_to_autostop = 1 stages.remove(Stage.DOWN) - if not down: requested_features.add( - clouds.CloudImplementationFeatures.AUTOSTOP) - # TODO(ewzeng): allow autostop for spot when stopping is - # supported. - if task.use_spot: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Autostop is not supported for spot instances.') + clouds.CloudImplementationFeatures.STOP) + # NOTE: in general we may not have sufficiently specified info + # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in + # the backend. elif idle_minutes_to_autostop is not None: # TODO(zhwu): Autostop is not supported for non-CloudVmRayBackend. with ux_utils.print_exception_no_traceback(): raise ValueError( - f'Backend {backend.NAME} does not support autostop, please try ' - f'{backends.CloudVmRayBackend.NAME}') + f'Backend {backend.NAME} does not support autostop, please try' + f' {backends.CloudVmRayBackend.NAME}') if Stage.CLONE_DISK in stages: task = _maybe_clone_disk_from_cluster(clone_disk_from, cluster_name, diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 281880385fc..29a5a3a2e9b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1874,6 +1874,7 @@ def test_stop_gcp_spot(): 'stop_gcp_spot', [ f'sky launch -c {name} --cloud gcp --use-spot --cpus 2+ -y -- touch myfile', + # stop should go through: f'sky stop {name} -y', f'sky start {name} -y', f'sky exec {name} -- ls myfile', @@ -1884,6 +1885,10 @@ def test_stop_gcp_spot(): f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', + # -i option at launch should go through: + f'sky launch -c {name} -i0 -y', + 'sleep 90', + f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', ], f'sky down -y {name}', ) From 43613a5d09e8f17352846b38b9cbc4672aca22d1 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Thu, 21 Dec 2023 22:38:43 -0800 Subject: [PATCH 05/10] format --- sky/execution.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 3b3e1ef6dcc..2179eba2bff 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -256,8 +256,7 @@ def execute( idle_minutes_to_autostop = 1 stages.remove(Stage.DOWN) if not down: - requested_features.add( - clouds.CloudImplementationFeatures.STOP) + requested_features.add(clouds.CloudImplementationFeatures.STOP) # NOTE: in general we may not have sufficiently specified info # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in # the backend. From 390ad731c2617445844e18b99a1c5667ed25179a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 27 Dec 2023 04:00:15 +0000 Subject: [PATCH 06/10] refactoring for feature supported check --- sky/backends/backend_utils.py | 1 + sky/backends/cloud_vm_ray_backend.py | 5 +-- sky/clouds/aws.py | 17 ++++++---- sky/clouds/azure.py | 15 +++++---- sky/clouds/cloud.py | 40 ++++++++++++---------- sky/clouds/gcp.py | 16 +++++---- sky/clouds/ibm.py | 15 +++++---- sky/clouds/kubernetes.py | 8 ++--- sky/clouds/lambda_cloud.py | 9 +++-- sky/clouds/local.py | 5 +-- sky/clouds/oci.py | 15 +++++---- sky/clouds/scp.py | 15 +++++---- sky/core.py | 50 ++++++++++------------------ sky/resources.py | 4 +-- 14 files changed, 111 insertions(+), 104 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 851e8e8436d..c5142b2dd57 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1941,6 +1941,7 @@ def check_can_clone_disk_and_override_task( new_task_resources = [] original_cloud = handle.launched_resources.cloud original_cloud.check_features_are_supported( + handle.launched_resources, {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER}) assert original_cloud is not None, handle.launched_resources diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 331e96e5685..1f0d9c23452 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2205,7 +2205,7 @@ def provision_with_retries( cloud_user = to_provision.cloud.get_current_user_identity() # Skip if to_provision.cloud does not support requested features to_provision.cloud.check_features_are_supported( - self._requested_features) + to_provision, self._requested_features) config_dict = self._retry_zones( to_provision, @@ -4289,7 +4289,8 @@ def set_autostop(self, # any stage after PRE_EXEC is not executed. cloud = handle.launched_resources.cloud cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) + handle.launched_resources, + {clouds.CloudImplementationFeatures.STOP}) code = autostop_lib.AutostopCodeGen.set_autostop( idle_minutes_to_autostop, self.NAME, down) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index a0457d2fd0f..ff6764aeb99 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -114,13 +114,16 @@ class AWS(clouds.Cloud): ) @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return { - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {cls._REPR}.'), - } + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + if resources.use_spot: + return { + clouds.CloudImplementationFeatures.STOP: + ('Stopping spot instances is currently not supported on' + f' {cls._REPR}.'), + } + return {} @classmethod def max_cluster_name_length(cls) -> Optional[int]: diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 2d0d6267f1e..b89c39ff126 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -61,15 +61,18 @@ class Azure(clouds.Cloud): _INDENT_PREFIX = ' ' * 4 @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return { + def _unsupported_features_for_resources( + cls, resources: 'resources.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + features = { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: (f'Migrating disk is currently not supported on {cls._REPR}.'), - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {cls._REPR}.'), } + if resources.use_spot: + features[clouds.CloudImplementationFeatures.STOP] = ( + 'Stopping spot instances is currently not supported on' + f' {cls._REPR}.') + return features @classmethod def max_cluster_name_length(cls) -> int: diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index dc5c5874929..e59b8c04ba7 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -39,7 +39,6 @@ class CloudImplementationFeatures(enum.Enum): CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster' DOCKER_IMAGE = 'docker_image' SPOT_INSTANCE = 'spot_instance' - STOP_SPOT_INSTANCE = 'stop_spot_instance' CUSTOM_DISK_TIER = 'custom_disk_tier' OPEN_PORTS = 'open_ports' @@ -68,20 +67,6 @@ class Cloud: _REPR = '' _DEFAULT_DISK_TIER = 'medium' - @classmethod - def _cloud_unsupported_features( - cls) -> Dict[CloudImplementationFeatures, str]: - """The features not supported by the cloud implementation. - - This method is used by check_features_are_supported() to check if the - cloud implementation supports all the requested features. - - Returns: - A dict of {feature: reason} for the features not supported by the - cloud implementation. - """ - raise NotImplementedError - @classmethod def max_cluster_name_length(cls) -> Optional[int]: """Returns the maximum length limit of a cluster name. @@ -304,7 +289,8 @@ def get_feasible_launchable_resources( CloudImplementationFeatures.MULTI_NODE) try: - self.check_features_are_supported(resources_required_features) + self.check_features_are_supported(resources, + resources_required_features) except exceptions.NotSupportedError: # TODO(zhwu): The resources are now silently filtered out. We # should have some logging telling the user why the resources @@ -457,7 +443,8 @@ def need_cleanup_after_preemption( @classmethod def check_features_are_supported( - cls, requested_features: Set[CloudImplementationFeatures]) -> None: + cls, resources: 'resources_lib.Resources', + requested_features: Set[CloudImplementationFeatures]) -> None: """Errors out if the cloud does not support all requested features. For instance, Lambda Cloud does not support stop, so @@ -469,7 +456,8 @@ def check_features_are_supported( exceptions.NotSupportedError: If the cloud does not support all the requested features. """ - unsupported_features2reason = cls._cloud_unsupported_features() + unsupported_features2reason = cls._unsupported_features_for_resources( + resources) # Docker image is not compatible with ssh proxy command. if skypilot_config.get_nested( @@ -494,6 +482,22 @@ def check_features_are_supported( f'The following features are not supported by {cls._REPR}:' '\n\t' + table.get_string().replace('\n', '\n\t')) + @classmethod + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[CloudImplementationFeatures, str]: + """The features not supported based on the resources provided. + + This method is used by check_features_are_supported() to check if the + cloud implementation supports all the requested features. + + Returns: + A dict of {feature: reason} for the features not supported by the + cloud implementation. + """ + del resources + raise NotImplementedError + @classmethod def check_cluster_name_is_valid(cls, cluster_name: str) -> None: """Errors out on invalid cluster names not supported by cloud providers. diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index a869468700c..4a83a7cba35 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -20,6 +20,7 @@ from sky.skylet import log_lib from sky.utils import common_utils from sky.utils import subprocess_utils +from sky.utils import tpu_utils from sky.utils import ux_utils if typing.TYPE_CHECKING: @@ -162,8 +163,15 @@ class GCP(clouds.Cloud): ) @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: + def _unsupported_features_for_resources( + cls, resources: 'resources.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + if tpu_utils.is_tpu_vm_pod(resources): + return { + clouds.CloudImplementationFeatures.STOP: ( + 'TPU VM pods cannot be stopped. Please refer to: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources' + ) + } return {} @classmethod @@ -809,8 +817,6 @@ def need_cleanup_after_preemption(self, # you must delete it and create a new one ..." # See: https://cloud.google.com/tpu/docs/preemptible#tpu-vm - # pylint: disable=import-outside-toplevel - from sky.utils import tpu_utils return tpu_utils.is_tpu_vm(resources) @classmethod @@ -942,8 +948,6 @@ def query_status(cls, name: str, tag_filters: Dict[str, str], """Query the status of a cluster.""" del region # unused - # pylint: disable=import-outside-toplevel - from sky.utils import tpu_utils use_tpu_vm = kwargs.pop('use_tpu_vm', False) label_filter_str = cls._label_filter_str(tag_filters) diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index 29e26e7e989..97950af87cb 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -34,9 +34,10 @@ class IBM(clouds.Cloud): _regions: List[clouds.Region] = [] @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return { + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + features = { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: (f'Migrating disk is currently not supported on {cls._REPR}.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: @@ -48,10 +49,12 @@ def _cloud_unsupported_features( ), clouds.CloudImplementationFeatures.OPEN_PORTS: (f'Opening ports is currently not supported on {cls._REPR}.'), - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {cls._REPR}.'), } + if resources.use_spot: + features[clouds.CloudImplementationFeatures.STOP] = ( + 'Stopping spot instances is currently not supported on' + f' {cls._REPR}.') + return features @classmethod def max_cluster_name_length(cls) -> Optional[int]: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index e762bf6edfb..3f57e2d369f 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -57,9 +57,6 @@ class Kubernetes(clouds.Cloud): clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are ' 'not supported in ' 'Kubernetes.', - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on Kubernetes.' - ), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: 'Custom disk ' 'tiers are not ' 'supported in ' @@ -76,8 +73,9 @@ class Kubernetes(clouds.Cloud): IMAGE_GPU = 'skypilot:gpu-ubuntu-2004' @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: return cls._CLOUD_UNSUPPORTED_FEATURES @classmethod diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index c339259f6c7..d5c1e2c62c9 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -42,16 +42,15 @@ class Lambda(clouds.Cloud): 'You can try running docker command inside the `run` section in task.yaml.' ), clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.', - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {_REPR}.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.', clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is currently not supported on {_REPR}.', } @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + del resources # unused return cls._CLOUD_UNSUPPORTED_FEATURES @classmethod diff --git a/sky/clouds/local.py b/sky/clouds/local.py index 218d1483b74..531c0b8a19f 100644 --- a/sky/clouds/local.py +++ b/sky/clouds/local.py @@ -41,8 +41,9 @@ class Local(clouds.Cloud): } @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: return cls._CLOUD_UNSUPPORTED_FEATURES @classmethod diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index a8978d26bac..d655ec5cda5 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -42,9 +42,10 @@ class OCI(clouds.Cloud): _INDENT_PREFIX = ' ' @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return { + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + features = { clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: (f'Migrating disk is currently not supported on {cls._REPR}.'), clouds.CloudImplementationFeatures.DOCKER_IMAGE: @@ -53,10 +54,12 @@ def _cloud_unsupported_features( '`run` section in task.yaml.'), clouds.CloudImplementationFeatures.OPEN_PORTS: (f'Opening ports is currently not supported on {cls._REPR}.'), - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {cls._REPR}.'), } + if resources.use_spot: + features[clouds.CloudImplementationFeatures.STOP] = ( + f'Stopping spot instances is currently not supported on ' + f'{cls._REPR}.') + return features @classmethod def max_cluster_name_length(cls) -> Optional[int]: diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index 50e6c7de4bb..bfdcf0264b9 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -51,9 +51,6 @@ class SCP(clouds.Cloud): '`run` section in task.yaml.'), clouds.CloudImplementationFeatures.SPOT_INSTANCE: (f'Spot instances are not supported in {_REPR}.'), - clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE: - ('Stopping spot instances is currently not supported on' - f' {_REPR}.'), clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: (f'Custom disk tiers are not supported in {_REPR}.'), clouds.CloudImplementationFeatures.OPEN_PORTS: @@ -63,9 +60,15 @@ class SCP(clouds.Cloud): _INDENT_PREFIX = ' ' @classmethod - def _cloud_unsupported_features( - cls) -> Dict[clouds.CloudImplementationFeatures, str]: - return cls._CLOUD_UNSUPPORTED_FEATURES + def _unsupported_features_for_resources( + cls, resources: 'resources_lib.Resources' + ) -> Dict[clouds.CloudImplementationFeatures, str]: + features = cls._CLOUD_UNSUPPORTED_FEATURES + if resources.use_spot: + features[clouds.CloudImplementationFeatures.STOP] = ( + 'Stopping spot instances is currently not supported on' + f' {cls._REPR}.') + return features @classmethod def max_cluster_name_length(cls) -> Optional[int]: diff --git a/sky/core.py b/sky/core.py index bf8d32d82bd..df4bb6f6dc5 100644 --- a/sky/core.py +++ b/sky/core.py @@ -327,26 +327,19 @@ def stop(cluster_name: str, purge: bool = False) -> None: # Check cloud supports stopping instances cloud = handle.launched_resources.cloud assert cloud is not None, handle - cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.STOP}) - # Check cloud supports stopping spot instances - supports_stop_spot = True try: cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) - except exceptions.NotSupportedError: - supports_stop_spot = False - # Allow GCP spot to be stopped since it preserves disk: - # https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long - if handle.launched_resources.use_spot and not supports_stop_spot: - # Disable spot instances to be stopped. + handle.launched_resources, + {clouds.CloudImplementationFeatures.STOP}) + except exceptions.NotSupportedError as e: raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Stopping cluster ' f'{cluster_name!r}... skipped.{colorama.Style.RESET_ALL}\n' - ' Stopping spot instances is not supported as the attached ' - 'disks will be lost.\n' + ' Stopping instances is not supported for ' + f'{handle.launched_resources}.\n' ' To terminate the cluster instead, run: ' - f'{colorama.Style.BRIGHT}sky down {cluster_name}') + f'{colorama.Style.BRIGHT}sky down {cluster_name}') from e + usage_lib.record_cluster_name_for_current_operation(cluster_name) backend.teardown(handle, terminate=False, purge=purge) @@ -460,33 +453,24 @@ def autostop( # Check cloud supports stopping spot instances cloud = handle.launched_resources.cloud assert cloud is not None, handle - supports_stop_spot = True - try: - cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.STOP_SPOT_INSTANCE}) - except exceptions.NotSupportedError: - supports_stop_spot = False if not isinstance(backend, backends.CloudVmRayBackend): raise exceptions.NotSupportedError( f'{operation} cluster {cluster_name!r} with backend ' f'{backend.__class__.__name__!r} is not supported.') - elif (handle.launched_resources.use_spot and not down and not is_cancel and - not supports_stop_spot): - # Disable spot instances to be autostopped. - # - # Exception: Allow GCP spot to be stopped since it preserves disk: - # https://cloud.google.com/compute/docs/instances/preemptible#preemption-process # pylint: disable=line-too-long - raise exceptions.NotSupportedError( - f'{colorama.Fore.YELLOW}Scheduling autostop on cluster ' - f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n' - ' Stopping spot instances is not supported as the attached ' - 'disks will be lost.') # Check autostop is implemented for cloud cloud = handle.launched_resources.cloud if not down and idle_minutes >= 0: - cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.STOP}) + try: + cloud.check_features_are_supported( + handle.launched_resources, + {clouds.CloudImplementationFeatures.STOP}) + except exceptions.NotSupportedError as e: + raise exceptions.NotSupportedError( + f'{colorama.Fore.YELLOW}Scheduling autostop on cluster ' + f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n' + ' Stopping instances is not supported for ' + f'{handle.launched_resources}.') from e usage_lib.record_cluster_name_for_current_operation(cluster_name) backend.set_autostop(handle, idle_minutes, down) diff --git a/sky/resources.py b/sky/resources.py index e90b6b6700c..3fbe1e21395 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -784,7 +784,7 @@ def _try_validate_image_id(self) -> None: 'Docker image is not supported for TPU VM.') if self.cloud is not None: self.cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.DOCKER_IMAGE}) + self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE}) return if self.cloud is None: @@ -867,7 +867,7 @@ def _try_validate_ports(self) -> None: 'specified.') if self.cloud is not None: self.cloud.check_features_are_supported( - {clouds.CloudImplementationFeatures.OPEN_PORTS}) + self, {clouds.CloudImplementationFeatures.OPEN_PORTS}) # We don't need to check the ports format since we already done it # in resources_utils.simplify_ports From 39751096fa273066a4c190aac899d8dc17fc7da1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 27 Dec 2023 04:09:47 +0000 Subject: [PATCH 07/10] remove special handling for TPU VM pod --- sky/core.py | 15 --------------- tests/test_smoke.py | 4 ++-- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/sky/core.py b/sky/core.py index df4bb6f6dc5..9891309db10 100644 --- a/sky/core.py +++ b/sky/core.py @@ -318,12 +318,6 @@ def stop(cluster_name: str, purge: bool = False) -> None: if isinstance(backend, backends.CloudVmRayBackend): assert isinstance(handle, backends.CloudVmRayResourceHandle), handle - if tpu_utils.is_tpu_vm_pod(handle.launched_resources): - # Reference: - # https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long - raise exceptions.NotSupportedError( - f'Stopping cluster {cluster_name!r} with TPU VM Pod ' - 'is not supported.') # Check cloud supports stopping instances cloud = handle.launched_resources.cloud assert cloud is not None, handle @@ -441,15 +435,6 @@ def autostop( ) backend = backend_utils.get_backend_from_handle(handle) - if tpu_utils.is_tpu_vm_pod(handle.launched_resources) and not down: - # Stopping TPU VM Pods is not supported. Example error: - # "StopNode" is not supported on pod nodes: "v2-32" - # Autodown is supported. - # Reference: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_a_with_gcloud # pylint: disable=line-too-long - raise exceptions.NotSupportedError( - f'{operation} cluster {cluster_name!r} with TPU VM Pod ' - 'is not supported.') - # Check cloud supports stopping spot instances cloud = handle.launched_resources.cloud assert cloud is not None, handle diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 89968f2199f..d4e5e61de1e 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1644,10 +1644,10 @@ def test_autostop(generic_cloud: str): # Test restarting the idleness timer via cancel + reset: f'sky autostop -y {name} -i 1', # Idleness starts counting. - 'sleep 45', # Almost reached the threshold. + 'sleep 30', # Almost reached the threshold. f'sky autostop -y {name} --cancel', f'sky autostop -y {name} -i 1', # Should restart the timer. - 'sleep 45', + 'sleep 30', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', f'sleep {autostop_timeout}', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', From 11ec2b7a251c9de8c72ee24568aa5eb3963263b6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 27 Dec 2023 04:14:48 +0000 Subject: [PATCH 08/10] format --- sky/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/core.py b/sky/core.py index 9891309db10..16113720b71 100644 --- a/sky/core.py +++ b/sky/core.py @@ -22,7 +22,6 @@ from sky.utils import controller_utils from sky.utils import rich_utils from sky.utils import subprocess_utils -from sky.utils import tpu_utils from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) From e247a75eee7f96a227117ef392bb7d3ec57cff08 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 27 Dec 2023 08:39:36 -0800 Subject: [PATCH 09/10] Message --- sky/core.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sky/core.py b/sky/core.py index 16113720b71..76faefbe8bf 100644 --- a/sky/core.py +++ b/sky/core.py @@ -1,6 +1,7 @@ """SDK functions for cluster/job management.""" import getpass import sys +import typing from typing import Any, Dict, List, Optional, Union import colorama @@ -24,6 +25,9 @@ from sky.utils import subprocess_utils from sky.utils import ux_utils +if typing.TYPE_CHECKING: + from sky import resources as resources_lib + logger = sky_logging.init_logger(__name__) # ====================== @@ -283,6 +287,15 @@ def start( force=force) +def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str: + if resources.use_spot: + message = ('Stopping spot instances is currently not supported on ' + f'{resources.cloud}') + else: + message = f'Stopping is currently not supported for {resources}' + return message + + @usage_lib.entrypoint def stop(cluster_name: str, purge: bool = False) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. @@ -328,8 +341,7 @@ def stop(cluster_name: str, purge: bool = False) -> None: raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Stopping cluster ' f'{cluster_name!r}... skipped.{colorama.Style.RESET_ALL}\n' - ' Stopping instances is not supported for ' - f'{handle.launched_resources}.\n' + f' {_stop_not_supported_message(handle.launched_resources)}.\n' ' To terminate the cluster instead, run: ' f'{colorama.Style.BRIGHT}sky down {cluster_name}') from e @@ -453,8 +465,8 @@ def autostop( raise exceptions.NotSupportedError( f'{colorama.Fore.YELLOW}Scheduling autostop on cluster ' f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n' - ' Stopping instances is not supported for ' - f'{handle.launched_resources}.') from e + f' {_stop_not_supported_message(handle.launched_resources)}.' + ) from e usage_lib.record_cluster_name_for_current_operation(cluster_name) backend.set_autostop(handle, idle_minutes, down) From 1688099593a5a819b7aa5fb024ea02ae03023895 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 28 Dec 2023 13:11:44 +0000 Subject: [PATCH 10/10] add docstr --- sky/clouds/cloud.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index e59b8c04ba7..126c3fd5ddb 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -448,10 +448,15 @@ def check_features_are_supported( """Errors out if the cloud does not support all requested features. For instance, Lambda Cloud does not support stop, so - Lambda.check_features_are_supported({ + Lambda.check_features_are_supported(to_provision, { CloudImplementationFeatures.STOP }) raises the exception. + Resources are also passed as some features may depend on the resources + requested. For example, some clouds support stopping normal instances, + but not spot instances, e.g., AWS; or, GCP supports stopping TPU VMs but + not TPU VM pods. + Raises: exceptions.NotSupportedError: If the cloud does not support all the requested features.