[UX] default to minimal logging (no module/line number/timestamp). #3980

Merged (7 commits) on Sep 26, 2024
91 changes: 49 additions & 42 deletions docs/source/examples/auto-failover.rst
@@ -60,18 +60,22 @@ provisioner handles such a request:
.. code-block:: console

$ sky launch -c gpu --gpus V100
- ... # optimizer output
- I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})].
- I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
- I 02-11 21:17:43 cloud_vm_ray_backend.py:614] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log
- I 02-11 21:17:43 cloud_vm_ray_backend.py:624]
- I 02-11 21:17:43 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-a)
- W 02-11 21:17:56 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
+ ...
+ Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})].
+ Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
+ To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log
+
+ Launching on GCP us-central1 (us-central1-a)
+ Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
+ ...
+
+ Launching on GCP us-central1 (us-central1-f)
+ Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
+ ...
+
+ Launching on GCP us-west1 (us-west1-a)
+ ...
- I 02-11 21:18:24 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-f)
- W 02-11 21:18:38 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
- I 02-11 21:18:38 cloud_vm_ray_backend.py:624]
- I 02-11 21:18:38 cloud_vm_ray_backend.py:624] Launching on GCP us-west1 (us-west1-a)
Successfully connected to 35.230.120.87.

GCP was chosen as the best cloud to run the task. There was no capacity in any of the regions in US Central, so the auto-failover provisioner moved to US West instead, allowing for our instance to be successfully provisioned.
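For reference, the Tip in the output above mentions the Python API. A minimal sketch of the equivalent of `sky launch -c gpu --gpus V100` (the `run` command here is only a placeholder):

```python
import sky

# Placeholder task; in practice this carries your own setup/run commands.
task = sky.Task(run='nvidia-smi')
task.set_resources(sky.Resources(accelerators='V100'))

# Equivalent of `sky launch -c gpu --gpus V100`. The same auto-failover
# behavior shown above applies when launching through the Python API.
sky.launch(task, cluster_name='gpu')
```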
@@ -88,21 +88,24 @@ AWS, where it succeeded after two regions:
.. code-block:: console

$ sky launch -c v100-8 --gpus V100:8
- ... # optimizer output
- I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})].
- I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
- I 02-23 16:39:59 cloud_vm_ray_backend.py:658] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log
- I 02-23 16:39:59 cloud_vm_ray_backend.py:668]
- I 02-23 16:39:59 cloud_vm_ray_backend.py:668] Launching on GCP us-central1 (us-central1-a)
- W 02-23 16:40:17 cloud_vm_ray_backend.py:403] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
  ...
- I 02-23 16:42:15 cloud_vm_ray_backend.py:668] Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c)
- W 02-23 16:42:26 cloud_vm_ray_backend.py:477] Got error(s) in all zones of us-east-2:
- W 02-23 16:42:26 cloud_vm_ray_backend.py:479] create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying.
+ Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})].
+ Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
+ To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log
+
+ Launching on GCP us-central1 (us-central1-a)
+ Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.)
+ ...
+
+ Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c)
+ Got error(s) in all zones of us-east-2:
+ create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying.
+ ...
- I 02-23 16:42:26 cloud_vm_ray_backend.py:668]
- I 02-23 16:42:26 cloud_vm_ray_backend.py:668] Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d)
- I 02-23 16:47:04 cloud_vm_ray_backend.py:740] Successfully provisioned or found existing VM. Setup completed.
+
+ Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d)
+ ...
+ Successfully provisioned or found existing VM. Setup completed.


Multiple Candidate GPUs
@@ -125,13 +132,13 @@ A10, L4, and A10g GPUs, using :code:`sky launch task.yaml`.

$ sky launch task.yaml
...
- I 11-19 08:07:45 optimizer.py:910] -----------------------------------------------------------------------------------------------------
- I 11-19 08:07:45 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN
- I 11-19 08:07:45 optimizer.py:910] -----------------------------------------------------------------------------------------------------
- I 11-19 08:07:45 optimizer.py:910] Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔
- I 11-19 08:07:45 optimizer.py:910] GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70
- I 11-19 08:07:45 optimizer.py:910] AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01
- I 11-19 08:07:45 optimizer.py:910] -----------------------------------------------------------------------------------------------------
+ -----------------------------------------------------------------------------------------------------
+ CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN
+ -----------------------------------------------------------------------------------------------------
+ Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔
+ GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70
+ AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01
+ -----------------------------------------------------------------------------------------------------



@@ -212,15 +219,15 @@ This will generate the following output:

$ sky launch -c mycluster task.yaml
...
- I 12-20 23:55:56 optimizer.py:717]
- I 12-20 23:55:56 optimizer.py:840] Considered resources (1 node):
- I 12-20 23:55:56 optimizer.py:910] ---------------------------------------------------------------------------------------------
- I 12-20 23:55:56 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN
- I 12-20 23:55:56 optimizer.py:910] ---------------------------------------------------------------------------------------------
- I 12-20 23:55:56 optimizer.py:910] GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔
- I 12-20 23:55:56 optimizer.py:910] AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29
- I 12-20 23:55:56 optimizer.py:910] GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39
- I 12-20 23:55:56 optimizer.py:910] AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77
- I 12-20 23:55:56 optimizer.py:910] ---------------------------------------------------------------------------------------------
- I 12-20 23:55:56 optimizer.py:910]

+ Considered resources (1 node):
+ ---------------------------------------------------------------------------------------------
+ CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN
+ ---------------------------------------------------------------------------------------------
+ GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔
+ AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29
+ GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39
+ AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77
+ ---------------------------------------------------------------------------------------------

Launching a new cluster 'mycluster'. Proceed? [Y/n]:
2 changes: 1 addition & 1 deletion sky/execution.py
@@ -334,7 +334,7 @@ def _execute(
#
# Disable the usage collection for this status command.
env = dict(os.environ,
-                    **{env_options.Options.DISABLE_LOGGING.value: '1'})
+                    **{str(env_options.Options.DISABLE_LOGGING): '1'})
subprocess_utils.run(
'sky status --no-show-managed-jobs --no-show-services', env=env)
print()
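The `dict(os.environ, **{...})` pattern above builds a copy of the current environment with one variable overridden for the child process only. A rough stand-alone sketch with plain `subprocess` (assuming SkyPilot's `subprocess_utils.run` wrapper behaves similarly for this purpose):

```python
import os
import subprocess

# Copy the parent environment and override a single variable for the child
# process; the parent's own os.environ is left untouched.
env = dict(os.environ, SKYPILOT_DISABLE_USAGE_COLLECTION='1')
subprocess.run('sky status --no-show-managed-jobs --no-show-services',
               shell=True, env=env, check=False)
```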
7 changes: 3 additions & 4 deletions sky/optimizer.py
@@ -965,10 +965,10 @@ def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
f'Multiple {cloud} instances satisfy '
f'{acc_name}:{int(acc_count)}. '
f'The cheapest {candidate_list[0]!r} is considered '
-                         f'among:\n{instance_list}.\n')
+                         f'among:\n{instance_list}.')
if is_multi_instances:
logger.info(
-                         f'To list more details, run \'sky show-gpus {acc_name}\'.')
+                         f'To list more details, run: sky show-gpus {acc_name}\n')

@staticmethod
def _optimize_dag(
@@ -1101,8 +1101,7 @@ def ordinal_number(n):
Optimizer.print_optimized_plan(graph, topo_order, best_plan,
total_time, total_cost,
node_to_cost_map, minimize_cost)
-        if not env_options.Options.MINIMIZE_LOGGING.get():
-            Optimizer._print_candidates(local_node_to_candidate_map)
+        Optimizer._print_candidates(local_node_to_candidate_map)
return best_plan


9 changes: 5 additions & 4 deletions sky/sky_logging.py
@@ -10,10 +10,11 @@
from sky.utils import env_options
from sky.utils import rich_utils

- # If the SKYPILOT_MINIMIZE_LOGGING environment variable is set to True,
- # remove logging prefixes and unnecessary information in optimizer
- _FORMAT = (None if env_options.Options.MINIMIZE_LOGGING.get() else
-            '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
+ # UX: Should we show logging prefixes and some extra information in optimizer?
+ _show_logging_prefix = (env_options.Options.SHOW_DEBUG_INFO.get() or
+                         not env_options.Options.MINIMIZE_LOGGING.get())
+ _FORMAT = ('%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+            if _show_logging_prefix else None)
_DATE_FORMAT = '%m-%d %H:%M:%S'


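To see what the two `_FORMAT` settings above look like in practice, here is a small stand-alone `logging` sketch. It uses a plain `logging.Formatter` rather than SkyPilot's own handler setup; with a `None` format, `Formatter` falls back to printing just the message:

```python
import logging
import sys
from typing import Optional

PREFIX_FORMAT = '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FORMAT = '%m-%d %H:%M:%S'


def make_logger(name: str, fmt: Optional[str]) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    # With fmt=None, logging.Formatter defaults to '%(message)s'.
    handler.setFormatter(logging.Formatter(fmt, datefmt=DATE_FORMAT))
    logger.addHandler(handler)
    return logger


make_logger('verbose', PREFIX_FORMAT).info('Launching on GCP us-central1 (us-central1-a)')
# e.g. I 09-26 10:00:00 demo.py:21] Launching on GCP us-central1 (us-central1-a)
make_logger('minimal', None).info('Launching on GCP us-central1 (us-central1-a)')
# Launching on GCP us-central1 (us-central1-a)
```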
4 changes: 2 additions & 2 deletions sky/utils/controller_utils.py
@@ -362,15 +362,15 @@ def shared_controller_vars_to_fill(
'sky_python_cmd': constants.SKY_PYTHON_CMD,
}
env_vars: Dict[str, str] = {
-         env.value: '1' for env in env_options.Options if env.get()
+         str(env): str(int(env.get())) for env in env_options.Options
}
env_vars.update({
# Should not use $USER here, as that env var can be empty when
# running in a container.
constants.USER_ENV_VAR: getpass.getuser(),
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
# Skip cloud identity check to avoid the overhead.
-         env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1',
+         str(env_options.Options.SKIP_CLOUD_IDENTITY_CHECK): '1',
})
if skypilot_config.loaded():
# Only set the SKYPILOT_CONFIG env var if the user has a config file.
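The effect of the comprehension change above, sketched with a toy mapping in place of `env_options.Options` (the dictionary name and entries are illustrative): previously only currently-enabled options were forwarded to the controller; now every option is forwarded with an explicit '0'/'1' value.

```python
# Toy stand-in for env_options.Options: env var name -> current boolean value.
options = {
    'SKYPILOT_DEBUG': False,
    'SKYPILOT_MINIMIZE_LOGGING': True,
    'SKYPILOT_DISABLE_USAGE_COLLECTION': False,
}

# Old behavior: forward only the options that are enabled right now.
old_env_vars = {name: '1' for name, enabled in options.items() if enabled}
# {'SKYPILOT_MINIMIZE_LOGGING': '1'}

# New behavior: forward every option with an explicit '0'/'1' value.
new_env_vars = {name: str(int(enabled)) for name, enabled in options.items()}
# {'SKYPILOT_DEBUG': '0', 'SKYPILOT_MINIMIZE_LOGGING': '1',
#  'SKYPILOT_DISABLE_USAGE_COLLECTION': '0'}
```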
24 changes: 17 additions & 7 deletions sky/utils/env_options.py
@@ -5,17 +5,27 @@

class Options(enum.Enum):
"""Environment variables for SkyPilot."""
-     IS_DEVELOPER = 'SKYPILOT_DEV'
-     SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG'
-     DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION'
-     MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING'
+     # (env var name, default value)
+     IS_DEVELOPER = ('SKYPILOT_DEV', False)
+     SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)
+     DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False)
+     MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
Review comment from Michaelvll (Collaborator), Sep 25, 2024:
    We should still show the stamps and file name/lines when SKYPILOT_DEBUG is set.

The PR author replied:
    Good catch, done. PTAL.

# Internal: this is used to skip the cloud user identity check, which is
# used to protect cluster operations in a multi-identity scenario.
# Currently, this is only used in the job and serve controller, as there
# will not be multiple identities, and skipping the check can increase
# robustness.
-     SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK'
+     SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False)

+     def __init__(self, env_var: str, default: bool) -> None:
+         self.env_var = env_var
+         self.default = default
+
+     def __repr__(self) -> str:
+         return self.env_var
+
-     def get(self):
+     def get(self) -> bool:
          """Check if an environment variable is set to True."""
-         return os.getenv(self.value, 'False').lower() in ('true', '1')
+         return os.getenv(self.env_var,
+                          str(self.default)).lower() in ('true', '1')
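The enum change above relies on the standard-library behavior that a tuple member value is unpacked into `__init__`. A self-contained sketch of the pattern (the class name and `get` helper here mirror the diff for illustration, not SkyPilot's exact class):

```python
import enum
import os


class EnvOption(enum.Enum):
    """(env var name, default value) pairs, unpacked into __init__."""
    MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
    SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)

    def __init__(self, env_var: str, default: bool) -> None:
        self.env_var = env_var
        self.default = default

    def get(self) -> bool:
        """True if the env var is set to a truthy value, else the default."""
        return os.getenv(self.env_var,
                         str(self.default)).lower() in ('true', '1')


print(EnvOption.MINIMIZE_LOGGING.env_var)  # SKYPILOT_MINIMIZE_LOGGING
print(EnvOption.MINIMIZE_LOGGING.get())    # True (default when unset)
os.environ['SKYPILOT_MINIMIZE_LOGGING'] = '0'
print(EnvOption.MINIMIZE_LOGGING.get())    # False
```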