diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst index 90baf837f73..ef566b1abac 100644 --- a/docs/source/examples/spot-jobs.rst +++ b/docs/source/examples/spot-jobs.rst @@ -241,3 +241,31 @@ you can still tear it down manually with .. note:: Tearing down the spot controller will lose all logs and status information for the spot jobs and can cause resource leakage when there are still in-progress spot jobs. + +Customizing spot controller resources +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may customize the resources of the spot controller for the following reasons: + +1. Forcing the spot controller to run in a specific location. (Default: cheapest location) +2. Changing the maximum number of spot jobs that can be run concurrently. (Default: 16) +3. Changing the disk_size of the spot controller to store more logs. (Default: 50GB) + +To achieve the above, you can specify custom configs in :code:`~/.sky/config.yaml` with the following fields (the :code:`resources` field has the same spec as a normal SkyPilot job; see `here <https://skypilot.readthedocs.io/en/latest/reference/yaml-spec.html>`_): + +.. code-block:: yaml + + spot: + controller: + resources: + # All the configs below are optional + # 1. Specify the location of the spot controller. + cloud: gcp + region: us-central1 + # 2. Specify the maximum number of spot jobs that can be run concurrently. + cpus: 4+ # number of vCPUs, max # spot jobs = 2 * cpus + # 3. Specify the disk_size of the spot controller. + disk_size: 100 + + + diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 2ba88bc05bd..63b646dc3a3 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2652,6 +2652,11 @@ def stop_handler(signum, frame): def validate_schema(obj, schema, err_msg_prefix=''): + """Validates an object against a JSON schema. + + Raises: + ValueError: if the object does not match the schema. 
+ """ err_msg = None try: validator.SchemaValidator(schema).validate(obj) diff --git a/sky/execution.py b/sky/execution.py index 362a66d4156..99d2bafe4cf 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -606,6 +606,8 @@ def spot_launch( # running in a container. 'user': getpass.getuser(), } + controller_resources_config = copy.copy( + spot.constants.CONTROLLER_RESOURCES) if skypilot_config.loaded(): # Look up the contents of the already loaded configs via the # 'skypilot_config' module. Don't simply read the on-disk file as @@ -660,12 +662,41 @@ def spot_launch( skypilot_config.ENV_VAR_SKYPILOT_CONFIG, }) + # Override the controller resources with the ones specified in the + # config. + custom_controller_resources_config = skypilot_config.get_nested( + ('spot', 'controller', 'resources'), None) + if custom_controller_resources_config is not None: + controller_resources_config.update( + custom_controller_resources_config) + try: + controller_resources = sky.Resources.from_yaml_config( + controller_resources_config) + except ValueError as e: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Spot controller resources is not valid, please check ' + '~/.sky/config.yaml file and make sure ' + 'spot.controller.resources is a valid resources spec. ' + 'Details:\n' + f' {common_utils.format_exception(e, use_bracket=True)}' + ) from e + yaml_path = os.path.join(spot.SPOT_CONTROLLER_YAML_PREFIX, f'{name}-{task_uuid}.yaml') backend_utils.fill_template(spot.SPOT_CONTROLLER_TEMPLATE, vars_to_fill, output_path=yaml_path) controller_task = task_lib.Task.from_yaml(yaml_path) + assert len(controller_task.resources) == 1, controller_task + # Backward compatibility: if the user changed the + # spot-controller.yaml.j2 to customize the controller resources, + # we should use it. 
+ controller_task_resources = list(controller_task.resources)[0] + if not controller_task_resources.is_empty(): + controller_resources = controller_task_resources + controller_task.set_resources(controller_resources) + controller_task.spot_task = task assert len(controller_task.resources) == 1 diff --git a/sky/resources.py b/sky/resources.py index b4e1783033c..ed78590b396 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -831,6 +831,8 @@ def is_empty(self) -> bool: self.accelerators is None, self.accelerator_args is None, not self._use_spot_specified, + self.disk_size == _DEFAULT_DISK_SIZE_GB, + self._image_id is None, ]) def copy(self, **override) -> 'Resources': diff --git a/sky/spot/constants.py b/sky/spot/constants.py index 33d069327b1..605330691d4 100644 --- a/sky/spot/constants.py +++ b/sky/spot/constants.py @@ -12,3 +12,10 @@ SPOT_FM_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}' SPOT_FM_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}' SPOT_FM_REMOTE_TMP_DIR = '/tmp/sky-spot-filemounts-files' + +# Resources as a dict for the spot controller. +# Use default CPU instance type for spot controller, i.e. +# m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB) +# for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP. +# We use 50 GB disk size to reduce the cost. +CONTROLLER_RESOURCES = {'disk_size': 50} diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index a507cfc59e3..a9789f88f97 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -2,14 +2,6 @@ name: {{task_name}} -resources: - disk_size: 50 -# It is now using default CPU instance type hard-coded in code for spot controller, -# i.e. m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB) for Azure, and n1-highmem-8 (8 vCPUs, 52 GB) for GCP. -# This allows users without the credits for some of the clouds available to use managed spot instances. 
-# TODO(zhwu): Fix this to be able to failvoer across clouds with cheaper instance. -# instance_type: t3.xlarge - file_mounts: {{remote_user_yaml_prefix}}/{{task_name}}-{{uuid}}.yaml: {{user_yaml_path}} {% if user_config_path is not none %}