diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 596d0bec043..9d8c8366f8e 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -887,6 +887,13 @@ def write_cluster_config( # Conda setup 'conda_installation_commands': constants.CONDA_INSTALLATION_COMMANDS, + # We should not use `.format`, as it contains '{}' as the bash + # syntax. + 'ray_skypilot_installation_commands': + (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace( + '{sky_wheel_hash}', + wheel_hash).replace('{cloud}', + str(cloud).lower())), # Port of Ray (GCS server). # Ray's default port 6379 is conflicted with Redis. diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a43c4d73746..7015953a40f 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -285,7 +285,6 @@ def get_or_fail(futures, pg) -> List[int]: # next job can be scheduled on the released resources immediately. ray_util.remove_placement_group(pg) sys.stdout.flush() - sys.stderr.flush() return returncodes run_fn = None @@ -372,14 +371,12 @@ def add_gang_scheduling_placement_group_and_setup( message = {_CTRL_C_TIP_MESSAGE!r} + '\\n' message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.' print(message, - file=sys.stderr, flush=True) # FIXME: This will print the error message from autoscaler if # it is waiting for other task to finish. We should hide the # error message. ray.get(pg.ready()) print('INFO: All task resources reserved.', - file=sys.stderr, flush=True) """) ] @@ -427,7 +424,6 @@ def add_gang_scheduling_placement_group_and_setup( print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with ' 'return code list:{colorama.Style.RESET_ALL}', setup_returncodes, - file=sys.stderr, flush=True) # Need this to set the job status in ray job to be FAILED. sys.exit(1) @@ -623,7 +619,6 @@ def add_epilogue(self) -> None: 'return code list:{colorama.Style.RESET_ALL}', returncodes, reason, - file=sys.stderr, flush=True) # Need this to set the job status in ray job to be FAILED. sys.exit(1) @@ -3139,7 +3134,8 @@ def _exec_code_on_head( f'{cd} && ray job submit ' '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT ' f'--submission-id {job_id}-$(whoami) --no-wait ' - f'"{executable} -u {script_path} > {remote_log_path} 2>&1"') + # Redirect stderr to /dev/null to avoid distracting error from ray. + f'"{executable} -u {script_path} > {remote_log_path} 2> /dev/null"') mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && ' f'touch {remote_log_path}') diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 37fa140207e..bdf4b6d1280 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -146,8 +146,15 @@ def get_default_instance_type( # exactly the requested resources. instance_cpus = float( cpus.strip('+')) if cpus is not None else cls._DEFAULT_NUM_VCPUS - instance_mem = float(memory.strip('+')) if memory is not None else \ - instance_cpus * cls._DEFAULT_MEMORY_CPU_RATIO + if memory is not None: + if memory.endswith('+'): + instance_mem = float(memory[:-1]) + elif memory.endswith('x'): + instance_mem = float(memory[:-1]) * instance_cpus + else: + instance_mem = float(memory) + else: + instance_mem = instance_cpus * cls._DEFAULT_MEMORY_CPU_RATIO virtual_instance_type = kubernetes_utils.KubernetesInstanceType( instance_cpus, instance_mem).name return virtual_instance_type diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 28596a5a69c..9e3bdaf13ce 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -_tenancy_prefix = None +_tenancy_prefix: Optional[str] = None @clouds.CLOUD_REGISTRY.register diff --git a/sky/design_docs/onprem-design.md b/sky/design_docs/onprem-design.md deleted file mode 100644 index 394543ea191..00000000000 --- a/sky/design_docs/onprem-design.md +++ /dev/null @@ -1,45 +0,0 @@ -# SkyPilot On-prem - -## Multi-tenancy -- Every user has their own job queue. -- Every user will start their own skylet (whenever `sky launch` is first called). - -## Heterogeneous accelerator support -- Supports different types of accelerators across nodes (internode). -- Does not support different types of accelerators within the same node (intranode). - -## Installing Ray and SkyPilot -- Admin installs Ray==2.4.0 and SkyPilot globally on all machines. It is assumed that the admin regularly keeps SkyPilot updated on the cluster. -- Python >= 3.7 for all users. -- When a regular user runs `sky launch`, a local version of SkyPilot will be installed on the machine for each user. The local installation of Ray is specified in `sky/templates/local-ray.yml.j2`. - -## Registering clusters as a regular user -- Registering clusters can be done in two steps: - - Creating a cluster config in `~/.sky/local/`. This cluster is uninitialized, as SkyPilot has not registered the cluster into its database. - - Running `sky launch -c [LOCAL_CLUSTER_NAME] ''` for the first time. This will intialize the cluster and register it into SkyPilot's database. -- `sky status` shows both initialized and uninitialized local clusters. - -## Job submission pipeline -- Any `sky launch/exec` job is submitted via the Ray job submission interface. -- As the Ray cluster is launched by the admin user, any Ray remote functions will be run under the admin user by default. To see this, run the following snippet as a normal user: - -``` -def whoami(): - import subprocess - subprocess.call(['whoami']) - -# Should print current user -whoami() - -# Should print root user that started the Ray cluster -ray.get(ray.remote(f).remote()) -``` - -- Therefore, SkyPilot On-prem transparently includes user-switching so that SkyPilot tasks are still run as the calling, unprivileged user. This user-switching (`sudo -H su --login [USER]` in appropriate places) works as follows: - - In `sky/backends/cloud_vm_ray_backend.py::_setup_and_create_job_cmd_on_local_head`, switching between users is called during Ray job submission. The command `ray job submit --address=http://127.0.0.1:8266 --submission-id {ray_job_id} -- sudo -H su --login [SSH_USER] -c \"[JOB_COMMAND]\"` switches job submission execution from admin back to the original user `SSH_USER`. The `JOB_COMMAND` argument runs a bash script with the user's run commands. - - In `sky/skylet/log_lib.py::run_bash_command_with_log`, there is also another `sudo -H su` command to switch users. The function `run_bash_command_with_log` is part of the `RayCodeGen` job execution script uploaded to remote for job submission (located in `~/.sky/sky_app/sky_app_[JOB_ID].py`). This program initially runs under the calling user, but it executes the function `run_bash_command_with_log` from the context of the admin, as the function is executed within the Ray cluster as a Ray remote function (see above for why all Ray remote functions are run under admin). -- SkyPilot ensures Ray-related environment variables (that are critical for execution) are preserved across switching users (check with `examples/env_check.yaml`). - -## Miscellaneous -- `sky start/stop/autostop` is not supported. -- `sky down` is supported. The command `sky down` does not terminate the cluster, but it "disconnects" this user by killing the user's jobs in the cluster and removing the local cluster from `sky status`. Other users' jobs are not affected. diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index fa90ef0ea69..50d4a62eb95 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -2,6 +2,7 @@ from concurrent import futures import functools import hashlib +import json import os import resource import time @@ -13,6 +14,7 @@ from sky.provision import logging as provision_logging from sky.provision import metadata_utils from sky.skylet import constants +from sky.utils import accelerator_registry from sky.utils import command_runner from sky.utils import common_utils from sky.utils import subprocess_utils @@ -51,8 +53,7 @@ # Command that waits for the ray status to be initialized. Otherwise, a later # `sky status -r` may fail due to the ray cluster not being ready. RAY_HEAD_WAIT_INITIALIZED_COMMAND = ( - f'while `RAY_ADDRESS=127.0.0.1:{constants.SKY_REMOTE_RAY_PORT} ' - 'ray status | grep -q "No cluster status."`; do ' + f'while `{constants.RAY_STATUS} | grep -q "No cluster status."`; do ' 'sleep 0.5; ' 'echo "Waiting ray cluster to be initialized"; ' 'done;') @@ -214,6 +215,22 @@ def _setup_node(runner: command_runner.SSHCommandRunner, ssh_credentials=ssh_credentials) +def _ray_gpu_options(custom_resource: str) -> str: + """Returns GPU options for the ray start command. + + For some cases (e.g., within docker container), we need to explicitly set + --num-gpus to have ray clusters recognize the schedulable GPUs. + """ + acc_dict = json.loads(custom_resource) + assert len(acc_dict) == 1, acc_dict + acc_name, acc_count = list(acc_dict.items())[0] + if accelerator_registry.is_schedulable_non_gpu_accelerator(acc_name): + return '' + # We need to manually set the number of GPUs, as it may not automatically + # detect the GPUs within the container. + return f' --num-gpus={acc_count}' + + @_log_start_end @_auto_retry def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], @@ -239,6 +256,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], f'--temp-dir={constants.SKY_REMOTE_RAY_TEMPDIR}') if custom_resource: ray_options += f' --resources=\'{custom_resource}\'' + ray_options += _ray_gpu_options(custom_resource) if cluster_info.custom_ray_options: if 'use_external_ip' in cluster_info.custom_ray_options: @@ -313,6 +331,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool, if custom_resource: ray_options += f' --resources=\'{custom_resource}\'' + ray_options += _ray_gpu_options(custom_resource) if cluster_info.custom_ray_options: for key, value in cluster_info.custom_ray_options.items(): diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index cfd70f1664d..c5c57673e3c 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -169,7 +169,7 @@ def parse_readme(readme: str) -> str: # click/grpcio/protobuf. # Excluded 2.6.0 as it has a bug in the cluster launcher: # https://github.com/ray-project/ray/releases/tag/ray-2.6.1 - 'ray[default] >= 2.2.0, <= 2.6.3, != 2.6.0', + 'ray[default] >= 2.2.0, <= 2.9.3, != 2.6.0', ] remote = [ @@ -183,13 +183,11 @@ def parse_readme(readme: str) -> str: "grpcio >= 1.32.0, <= 1.51.3, != 1.48.0; python_version < '3.10' and sys_platform != 'darwin'", # noqa:E501 "grpcio >= 1.42.0, <= 1.51.3, != 1.48.0; python_version >= '3.10' and sys_platform != 'darwin'", # noqa:E501 # Adopted from ray's setup.py: - # https://github.com/ray-project/ray/blob/86fab1764e618215d8131e8e5068f0d493c77023/python/setup.py#L326 + # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343 'protobuf >= 3.15.3, != 3.19.5', - # Ray job has an issue with pydantic>2.0.0, due to API changes of pydantic. See - # https://github.com/ray-project/ray/issues/36990 - # >=1.10.8 is needed for ray>=2.6. See - # https://github.com/ray-project/ray/issues/35661 - 'pydantic <2.0, >=1.10.8', + # Some pydantic versions are not compatible with ray. Adopted from ray's + # setup.py: https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254 + 'pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3', ] # NOTE: Change the templates/spot-controller.yaml.j2 file if any of the diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index fa284022329..09bbac65836 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -1,4 +1,7 @@ """Constants for SkyPilot.""" +from packaging import version + +import sky SKY_LOGS_DIRECTORY = '~/sky_logs' SKY_REMOTE_WORKDIR = '~/sky_workdir' @@ -18,7 +21,7 @@ # i.e. the PORT_DICT_STR above. SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json' SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot' -SKY_REMOTE_RAY_VERSION = '2.4.0' +SKY_REMOTE_RAY_VERSION = '2.9.3' # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the @@ -66,19 +69,64 @@ } # Install conda on the remote cluster if it is not already installed. -# We do not install the latest conda with python 3.11 because ray has not -# officially supported it yet. +# We use conda with python 3.10 to be consistent across multiple clouds with +# best effort. # https://github.com/ray-project/ray/issues/31606 # We use python 3.10 to be consistent with the python version of the # AWS's Deep Learning AMI's default conda environment. CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' - '(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long + '(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true); ' 'grep "# >>> conda initialize >>>" ~/.bashrc || conda init;') +_sky_version = str(version.parse(sky.__version__)) +RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} ray status' +# Install ray and skypilot on the remote cluster if they are not already +# installed. {var} will be replaced with the actual value in +# backend_utils.write_cluster_config. +RAY_SKYPILOT_INSTALLATION_COMMANDS = ( + '(type -a python | grep -q python3) || ' + 'echo \'alias python=python3\' >> ~/.bashrc;' + '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' + 'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;' + 'source ~/.bashrc;' + # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the + # ray cluster is already running, to avoid the ray cluster being restarted. + # + # We do this guard to avoid any Ray client-server version mismatch. + # Specifically: If existing ray cluster is an older version say 2.4, and we + # pip install new version say 2.9 wheels here, then subsequent sky exec + # (ray job submit) will have v2.9 vs. 2.4 mismatch, similarly this problem + # exists for sky status -r (ray status). + # + # NOTE: RAY_STATUS will only work for the cluster with ray cluster on our + # latest ray port 6380, but those existing cluster launched before #1790 + # that has ray cluster on the default port 6379 will be upgraded and + # restarted. + f'pip3 list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' + f'|| {RAY_STATUS} || ' + f'pip3 install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long + # END ray package check and installation + '{ pip3 list | grep "skypilot " && ' + '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long + '{ pip3 uninstall skypilot -y; ' + 'pip3 install "$(echo ~/.sky/wheels/{sky_wheel_hash}/' + f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && ' + 'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || ' + 'exit 1; }; ' + # END SkyPilot package check and installation + + # Only patch ray when the ray version is the same as the expected version. + # The ray installation above can be skipped due to the existing ray cluster + # for backward compatibility. In this case, we should not patch the ray + # files. + f'pip3 list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' + '&& { python3 -c "from sky.skylet.ray_patches import patch; patch()" ' + '|| exit 1; };') + # The name for the environment variable that stores SkyPilot user hash, which # is mainly used to make sure sky commands runs on a VM launched by SkyPilot # will be recognized as the same user (e.g., spot controller or sky serve diff --git a/sky/skylet/ray_patches/__init__.py b/sky/skylet/ray_patches/__init__.py index 8c6f8567b35..d7cef2b76aa 100644 --- a/sky/skylet/ray_patches/__init__.py +++ b/sky/skylet/ray_patches/__init__.py @@ -27,8 +27,6 @@ import os import subprocess -import pkg_resources - from sky.skylet import constants @@ -81,6 +79,3 @@ def patch() -> None: from ray.autoscaler._private import updater _run_patch(updater.__file__, _to_absolute('updater.py.patch')) - - from ray.dashboard.modules.job import job_head - _run_patch(job_head.__file__, _to_absolute('job_head.py.patch')) diff --git a/sky/skylet/ray_patches/autoscaler.py.patch b/sky/skylet/ray_patches/autoscaler.py.patch index 732c27282b2..5854d754d09 100644 --- a/sky/skylet/ray_patches/autoscaler.py.patch +++ b/sky/skylet/ray_patches/autoscaler.py.patch @@ -1,9 +1,8 @@ -0a1,4 -> # From https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/autoscaler/_private/autoscaler.py +0a1,3 +> # From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py > # Sky patch changes: > # - enable upscaling_speed to be 0.0 -> -1068c1072 +1074c1077 < if upscaling_speed: --- > if upscaling_speed is not None: # NOTE(sky): enable 0.0 diff --git a/sky/skylet/ray_patches/cli.py.patch b/sky/skylet/ray_patches/cli.py.patch index 62313082e53..14deca1d90e 100644 --- a/sky/skylet/ray_patches/cli.py.patch +++ b/sky/skylet/ray_patches/cli.py.patch @@ -1,11 +1,9 @@ 0a1,4 -> # Adapted from https://github.com/ray-project/ray/blob/ray-2.4.0/dashboard/modules/job/cli.py +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py > # Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514 > # Otherwise, the output redirection ">" will not work. > -4d7 -< from subprocess import list2cmdline -212c215 +273c277 < entrypoint=list2cmdline(entrypoint), --- > entrypoint=" ".join(entrypoint), diff --git a/sky/skylet/ray_patches/command_runner.py.patch b/sky/skylet/ray_patches/command_runner.py.patch index afa525bb5e7..c0063390856 100644 --- a/sky/skylet/ray_patches/command_runner.py.patch +++ b/sky/skylet/ray_patches/command_runner.py.patch @@ -1,5 +1,5 @@ 0a1,2 -> # From https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/autoscaler/_private/command_runner.py +> # From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py > 140c142 < "ControlPersist": "10s", diff --git a/sky/skylet/ray_patches/job_head.py.patch b/sky/skylet/ray_patches/job_head.py.patch deleted file mode 100644 index e9883460de0..00000000000 --- a/sky/skylet/ray_patches/job_head.py.patch +++ /dev/null @@ -1,8 +0,0 @@ -0a1,3 -> # From https://github.com/ray-project/ray/blob/ray-2.4.0/dashboard/modules/job/job_head.py -> # Fix the issue for python 3.11: https://github.com/ray-project/ray/commit/ee64dbc03d6b21dcfefb893150ee5c7f1ebb705e -> -211c214 -< node_id = sample(set(agent_infos), 1)[0] ---- -> node_id = sample(sorted(agent_infos), 1)[0] diff --git a/sky/skylet/ray_patches/log_monitor.py.patch b/sky/skylet/ray_patches/log_monitor.py.patch index 5743f94f538..4a9dba47d69 100644 --- a/sky/skylet/ray_patches/log_monitor.py.patch +++ b/sky/skylet/ray_patches/log_monitor.py.patch @@ -1,10 +1,10 @@ 0a1,4 -> # Original file https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/_private/log_monitor.py +> # Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py > # Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar. > # We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300 > -354c358,359 +377c381,382 < next_line = next_line.rstrip("\r\n") --- -> if next_line[-1] == "\n": +> if next_line.endswith("\n"): > next_line = next_line[:-1] diff --git a/sky/skylet/ray_patches/resource_demand_scheduler.py.patch b/sky/skylet/ray_patches/resource_demand_scheduler.py.patch index 08c609f0449..64098f6ce3b 100644 --- a/sky/skylet/ray_patches/resource_demand_scheduler.py.patch +++ b/sky/skylet/ray_patches/resource_demand_scheduler.py.patch @@ -1,17 +1,17 @@ 0a1,5 -> # From https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/autoscaler/_private/resource_demand_scheduler.py +> # From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py > # Sky patch changes: > # - no new nodes are allowed to be launched launched when the upscaling_speed is 0 > # - comment out "assert not unfulfilled": this seems a buggy assert > -450c455,458 +451c456,459 < if upper_bound > 0: --- > # NOTE(sky): do not autoscale when upsclaing speed is 0. > if self.upscaling_speed == 0: > upper_bound = 0 > if upper_bound >= 0: -594c602 +595c603 < assert not unfulfilled --- > # assert not unfulfilled # NOTE(sky): buggy assert. diff --git a/sky/skylet/ray_patches/updater.py.patch b/sky/skylet/ray_patches/updater.py.patch index c4c865002c8..f25208f3afc 100644 --- a/sky/skylet/ray_patches/updater.py.patch +++ b/sky/skylet/ray_patches/updater.py.patch @@ -1,7 +1,7 @@ 0a1,4 -> # From https://github.com/ray-project/ray/blob/releases/2.4.0/python/ray/autoscaler/_private/updater.py +> # From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py > # Sky patch changes: > # - Ensure the node state is refreshed before checking the node is terminated. > -318a323 +327a332 > self.provider.non_terminated_nodes({}) diff --git a/sky/skylet/ray_patches/worker.py.patch b/sky/skylet/ray_patches/worker.py.patch index e75673c37c4..7834ac4ee8f 100644 --- a/sky/skylet/ray_patches/worker.py.patch +++ b/sky/skylet/ray_patches/worker.py.patch @@ -1,10 +1,9 @@ 0a1,4 -> # Adapted from https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/_private/worker.py +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py > # Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233 > # Tracked in PR https://github.com/ray-project/ray/pull/21977/files. > -1872a1877,1884 -> +2022a2027,2034 > def end_for(line: str) -> str: > if sys.platform == "win32": > return "\n" @@ -12,7 +11,8 @@ > return "" > return "\n" > -1896a1909 +> +2037a2050 > end=end_for(line), -1914a1928 +2054a2068 > end=end_for(line), diff --git a/sky/task.py b/sky/task.py index 77c44250ff8..0f9cfe01053 100644 --- a/sky/task.py +++ b/sky/task.py @@ -255,6 +255,7 @@ def __init__( self.event_callback = event_callback # Ignore type error due to a mypy bug. # https://github.com/python/mypy/issues/3004 + self._num_nodes = 1 self.num_nodes = num_nodes # type: ignore self.inputs: Optional[str] = None diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index e834ee1d0c8..a4e09a90ad0 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -149,19 +149,13 @@ setup_commands: # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com"; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}); - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true; + {{ ray_skypilot_installation_commands }} sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; {%- if docker_image is none %} sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; # Command to start ray clusters are now placed in `sky.provision.instance_setup`. diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 9ffe2a7958e..c922471f3bb 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -141,12 +141,8 @@ setup_commands: # Line 'sudo mv /etc/nccl.conf /etc/nccl.conf.bak' removes the default nccl.conf which is wrongly configured on many multi-GPU Azure VM, causing failure for multi-GPU workloads using NCCL. - mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}); - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; {%- if docker_image is none %} sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; @@ -156,7 +152,6 @@ setup_commands: sudo systemctl disable jupyterhub > /dev/null 2>&1 || true; {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true; diff --git a/sky/templates/cudo-ray.yml.j2 b/sky/templates/cudo-ray.yml.j2 index 53b0423fd60..f8f5c1cdc59 100644 --- a/sky/templates/cudo-ray.yml.j2 +++ b/sky/templates/cudo-ray.yml.j2 @@ -63,13 +63,9 @@ setup_commands: sudo dpkg --configure -a; mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[cudo,remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); diff --git a/sky/templates/fluidstack-ray.yml.j2 b/sky/templates/fluidstack-ray.yml.j2 index baeb114c1a6..a0f952a443f 100644 --- a/sky/templates/fluidstack-ray.yml.j2 +++ b/sky/templates/fluidstack-ray.yml.j2 @@ -65,14 +65,10 @@ setup_commands: {{ cuda_installation_commands }} mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index c406ea32367..586649e5ef1 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -188,8 +188,6 @@ setup_commands: sudo dpkg --configure --force-overwrite -a; mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; source ~/.bashrc; {%- if tpu_vm %} test -f ~/miniconda3/etc/profile.d/conda.sh && source ~/miniconda3/etc/profile.d/conda.sh && conda activate base || true; @@ -198,9 +196,7 @@ setup_commands: {%- if tpu_node_name %} grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc; {%- endif %} - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; - pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}; - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; {%- if docker_image is none %} sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; @@ -208,7 +204,6 @@ setup_commands: sudo systemctl disable jupyter > /dev/null 2>&1 || true; {%- endif %} mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # Command to start ray clusters are now placed in `sky.provision.instance_setup`. diff --git a/sky/templates/ibm-ray.yml.j2 b/sky/templates/ibm-ray.yml.j2 index ee4e1490d9b..f455f400a61 100644 --- a/sky/templates/ibm-ray.yml.j2 +++ b/sky/templates/ibm-ray.yml.j2 @@ -100,16 +100,10 @@ setup_commands: sudo dpkg --configure -a; mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}); - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[ibm, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 28d03230645..893d5c8565a 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -210,16 +210,11 @@ setup_commands: - sudo DEBIAN_FRONTEND=noninteractive apt install gcc patch pciutils rsync fuse curl -y; mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && sudo touch ~/.sudo_as_admin_successful; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}); - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + sudo touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; # Format: `REMOTE_PATH : LOCAL_PATH` diff --git a/sky/templates/lambda-ray.yml.j2 b/sky/templates/lambda-ray.yml.j2 index cb290edab46..8f6f3580d1a 100644 --- a/sky/templates/lambda-ray.yml.j2 +++ b/sky/templates/lambda-ray.yml.j2 @@ -74,16 +74,11 @@ setup_commands: mkdir -p ~/.ssh; touch ~/.ssh/config; rm ~/.local/bin/pip ~/.local/bin/pip3 ~/.local/bin/pip3.8 ~/.local/bin/pip3.10; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}); - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[lambda, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # Command to start ray on the head node. You don't need to change this. diff --git a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index 143e9590ea7..f3fceedd042 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -95,16 +95,11 @@ setup_commands: ([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true); mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}); - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[oci, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); sudo iptables -I INPUT -i ens3 -m state --state ESTABLISHED,RELATED,NEW -j ACCEPT; diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index fa3598e429e..62206d1a85c 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -61,15 +61,11 @@ setup_commands: sudo dpkg --configure -a; mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[runpod,remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} + touch ~/.sudo_as_admin_successful; sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # Command to start ray clusters are now placed in `sky.provision.instance_setup`. diff --git a/sky/templates/scp-ray.yml.j2 b/sky/templates/scp-ray.yml.j2 index c0593e737a2..f8e07c5283c 100644 --- a/sky/templates/scp-ray.yml.j2 +++ b/sky/templates/scp-ray.yml.j2 @@ -71,16 +71,10 @@ setup_commands: # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}); - (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[scp, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`; # Command to start ray on the head node. You don't need to change this. diff --git a/sky/templates/vsphere-ray.yml.j2 b/sky/templates/vsphere-ray.yml.j2 index a224d8b1d1a..7fc4cd9d01c 100644 --- a/sky/templates/vsphere-ray.yml.j2 +++ b/sky/templates/vsphere-ray.yml.j2 @@ -61,13 +61,8 @@ setup_commands: mkdir -p ~/.ssh; touch ~/.ssh/config; pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc); {{ conda_installation_commands }} - (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; - source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[vsphere, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + {{ ray_skypilot_installation_commands }} sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; - python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 47381294afe..073bd5cf743 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -51,8 +51,11 @@ if [ "$start_from" -le 1 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true which sky +# Job 1 sky launch --cloud ${CLOUD} -y --cpus 2 -c ${CLUSTER_NAME} examples/minimal.yaml sky autostop -i 10 -y ${CLUSTER_NAME} +# Job 2 +sky exec -d --cloud ${CLOUD} ${CLUSTER_NAME} sleep 100 conda activate sky-back-compat-current sky status -r ${CLUSTER_NAME} | grep ${CLUSTER_NAME} | grep UP @@ -60,12 +63,20 @@ rm -r ~/.sky/wheels || true if [ "$need_launch" -eq "1" ]; then sky launch --cloud ${CLOUD} -y -c ${CLUSTER_NAME} fi -sky exec --cloud ${CLOUD} ${CLUSTER_NAME} examples/minimal.yaml +# Job 3 +sky exec -d --cloud ${CLOUD} ${CLUSTER_NAME} sleep 50 +q=$(sky queue ${CLUSTER_NAME}) +echo "$q" +echo "$q" | grep "RUNNING" | wc -l | grep 2 || exit 1 +# Job 4 s=$(sky launch --cloud ${CLOUD} -d -c ${CLUSTER_NAME} examples/minimal.yaml) -echo $s +sky logs ${CLUSTER_NAME} 2 --status | grep RUNNING || exit 1 # remove color and find the job id -echo $s | sed -r "s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g" | grep "Job ID: 3" || exit 1 -sky queue ${CLUSTER_NAME} +echo "$s" | sed -r "s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g" | grep "Job ID: 4" || exit 1 +sleep 45 +q=$(sky queue ${CLUSTER_NAME}) +echo "$q" +echo "$q" | grep "SUCCEEDED" | wc -l | grep 4 || exit 1 fi # sky stop + sky start + sky exec @@ -147,4 +158,32 @@ sky logs ${CLUSTER_NAME}-6 2 --status sky logs ${CLUSTER_NAME}-6 2 fi +# Test spot jobs to make sure existing jobs and new job can run correctly, after +# the spot controller is updated. +if [ "$start_from" -le 7 ]; then +conda activate sky-back-compat-master +rm -r ~/.sky/wheels || true +sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-0 "echo hi; sleep 1000" +sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-1 "echo hi; sleep 300" +conda activate sky-back-compat-current +rm -r ~/.sky/wheels || true +s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-1) +echo "$s" +echo "$s" | grep " hi" || exit 1 +sky spot launch -d --cloud ${CLOUD} -y -n ${CLUSTER_NAME}-7-2 "echo hi; sleep 10" +s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-2) +echo "$s" +echo "$s" | grep " hi" || exit 1 +s=$(sky spot queue | grep ${CLUSTER_NAME}-7) +echo "$s" +echo "$s" | grep "RUNNING" | wc -l | grep 3 || exit 1 +sky spot cancel -y -n ${CLUSTER_NAME}-7-0 +sleep 200 +s=$(sky spot queue | grep ${CLUSTER_NAME}-7) +echo "$s" +echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1 +echo "$s" | grep "CANCELLED" | wc -l | grep 1 || exit 1 +fi + sky down ${CLUSTER_NAME}* -y +sky spot cancel -n ${CLUSTER_NAME}* -y diff --git a/tests/test_smoke.py b/tests/test_smoke.py index c6024d5b0d2..3cde4219a73 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1093,15 +1093,23 @@ def test_job_queue(generic_cloud: str): @pytest.mark.no_scp # Doesn't support SCP for now @pytest.mark.no_oci # Doesn't support OCI for now @pytest.mark.no_kubernetes # Doesn't support Kubernetes for now -def test_job_queue_with_docker(generic_cloud: str): - name = _get_cluster_name() +@pytest.mark.parametrize( + "image_id", + [ + "docker:nvidia/cuda:11.8.0-devel-ubuntu18.04", + "docker:ubuntu:18.04", + # Test image with python 3.11 installed by default. + "docker:continuumio/miniconda3", + ]) +def test_job_queue_with_docker(generic_cloud: str, image_id: str): + name = _get_cluster_name() + image_id[len('docker:'):][:4] test = Test( 'job_queue_with_docker', [ - f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d examples/job_queue/job_docker.yaml', + f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', + f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml', + f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml', + f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', @@ -2656,10 +2664,14 @@ def test_aws_custom_image(): @pytest.mark.kubernetes -@pytest.mark.parametrize("image_id", [ - "docker:nvidia/cuda:11.8.0-devel-ubuntu18.04", - "docker:ubuntu:18.04", -]) +@pytest.mark.parametrize( + "image_id", + [ + "docker:nvidia/cuda:11.8.0-devel-ubuntu18.04", + "docker:ubuntu:18.04", + # Test image with python 3.11 installed by default. + "docker:continuumio/miniconda3", + ]) def test_kubernetes_custom_image(image_id): """Test Kubernetes custom image""" name = _get_cluster_name() @@ -3065,7 +3077,7 @@ def _check_ondemand_not_in_status(name: str) -> str: # 2 on-demand (provisioning) + 2 Spot (provisioning). f'output=$(sky serve status {name});' - 'echo "$output" | grep -q "0/4" && break;', + 'echo "$output" | grep -q "0/4" || exit 1', f'sleep 20', _check_two_spot_in_status(name), _check_two_ondemand_in_status(name), @@ -3079,7 +3091,7 @@ def _check_ondemand_not_in_status(name: str) -> str: # 1 on-demand (provisioning) + 1 Spot (ready) + 1 spot (provisioning). f'output=$(sky serve status {name});' - 'echo "$output" | grep -q "1/3";', + 'echo "$output" | grep -q "1/3"', _check_two_spot_in_status(name), _check_one_ondemand_in_status(name),