Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Spot] Make the controller resources configurable #2040

Merged
merged 11 commits into from
Jun 7, 2023
5 changes: 5 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2652,6 +2652,11 @@ def stop_handler(signum, frame):


def validate_schema(obj, schema, err_msg_prefix=''):
"""Validates an object against a JSON schema.

Raises:
ValueError: if the object does not match the schema.
"""
err_msg = None
try:
validator.SchemaValidator(schema).validate(obj)
Expand Down
27 changes: 27 additions & 0 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,8 @@ def spot_launch(
# running in a container.
'user': getpass.getuser(),
}
controller_resources_config = copy.copy(
spot.constants.CONTROLLER_RESOURCES)
if skypilot_config.loaded():
# Look up the contents of the already loaded configs via the
# 'skypilot_config' module. Don't simply read the on-disk file as
Expand Down Expand Up @@ -660,12 +662,37 @@ def spot_launch(
skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
})

# Override the controller resources with the ones specified in the
# config.
custom_controller_resources_config = skypilot_config.get_nested(
('spot', 'controller', 'resources'), None)
if custom_controller_resources_config is not None:
controller_resources_config.update(
custom_controller_resources_config)
try:
controller_resources = sky.Resources.from_yaml_config(
controller_resources_config)
except ValueError as e:
raise ValueError(
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
'Spot controller resources is not valid, please check '
'~/.sky/skypilot_config.yaml file. Details:\n'
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
f' {common_utils.format_exception(e, use_bracket=True)}'
) from e

yaml_path = os.path.join(spot.SPOT_CONTROLLER_YAML_PREFIX,
f'{name}-{task_uuid}.yaml')
backend_utils.fill_template(spot.SPOT_CONTROLLER_TEMPLATE,
vars_to_fill,
output_path=yaml_path)
controller_task = task_lib.Task.from_yaml(yaml_path)
assert len(controller_task.resources) == 1, controller_task
# Backward compatibility: if the user changed the spot-controller.yaml.j2
# to customize the controller resources, we should use it.
controller_task_resources = list(controller_task.resources)[0]
if not controller_task_resources.is_same_resources(sky.Resources()):
concretevitamin marked this conversation as resolved.
Show resolved Hide resolved
controller_resources = controller_task_resources
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These may not be necessary, but some of our users should have changed the spot-controller.yaml.j2 to make their jobs work.

controller_task.set_resources(controller_resources)

controller_task.spot_task = task
assert len(controller_task.resources) == 1

Expand Down
3 changes: 3 additions & 0 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,9 @@ def less_demanding_than(self,
# self <= other
return True

def is_same_resources(self, other: 'Resources') -> bool:
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
return self.to_yaml_config() == other.to_yaml_config()

def should_be_blocked_by(self, blocked: 'Resources') -> bool:
"""Whether this Resources matches the blocked Resources.

Expand Down
2 changes: 2 additions & 0 deletions sky/spot/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
SPOT_FM_FILE_ONLY_BUCKET_NAME = 'skypilot-filemounts-files-{username}-{id}'
SPOT_FM_LOCAL_TMP_DIR = 'skypilot-filemounts-files-{id}'
SPOT_FM_REMOTE_TMP_DIR = '/tmp/sky-spot-filemounts-files'

CONTROLLER_RESOURCES = {'disk_size': 50}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc?

8 changes: 0 additions & 8 deletions sky/templates/spot-controller.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,6 @@

name: {{task_name}}

resources:
disk_size: 50
# It is now using default CPU instance type hard-coded in code for spot controller,
# i.e. m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB) for Azure, and n1-highmem-8 (8 vCPUs, 52 GB) for GCP.
# This allows users without the credits for some of the clouds available to use managed spot instances.
# TODO(zhwu): Fix this to be able to failvoer across clouds with cheaper instance.
# instance_type: t3.xlarge

file_mounts:
{{remote_user_yaml_prefix}}/{{task_name}}-{{uuid}}.yaml: {{user_yaml_path}}
{% if user_config_path is not none %}
Expand Down