[Core] Upgrade ray to 2.9.3 to support python 3.11 #3248
Changes from 29 commits
@@ -285,7 +285,6 @@ def get_or_fail(futures, pg) -> List[int]:
         # next job can be scheduled on the released resources immediately.
         ray_util.remove_placement_group(pg)
         sys.stdout.flush()
-        sys.stderr.flush()
         return returncodes

     run_fn = None
@@ -372,14 +371,12 @@ def add_gang_scheduling_placement_group_and_setup(
                 message = {_CTRL_C_TIP_MESSAGE!r} + '\\n'
                 message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.'
                 print(message,
-                      file=sys.stderr,
                       flush=True)
                 # FIXME: This will print the error message from autoscaler if
                 # it is waiting for other task to finish. We should hide the
                 # error message.
                 ray.get(pg.ready())
                 print('INFO: All task resources reserved.',
-                      file=sys.stderr,
                       flush=True)
             """)
         ]
@@ -427,7 +424,6 @@ def add_gang_scheduling_placement_group_and_setup(
             print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
                   'return code list:{colorama.Style.RESET_ALL}',
                   setup_returncodes,
-                  file=sys.stderr,
                   flush=True)
             # Need this to set the job status in ray job to be FAILED.
             sys.exit(1)
@@ -623,7 +619,6 @@ def add_epilogue(self) -> None:
                   'return code list:{colorama.Style.RESET_ALL}',
                   returncodes,
                   reason,
-                  file=sys.stderr,
                   flush=True)
             # Need this to set the job status in ray job to be FAILED.
             sys.exit(1)
@@ -3139,7 +3134,8 @@ def _exec_code_on_head(
             f'{cd} && ray job submit '
             '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
             f'--submission-id {job_id}-$(whoami) --no-wait '
-            f'"{executable} -u {script_path} > {remote_log_path} 2>&1"')
+            # Redirect stderr to /dev/null to avoid distracting error from ray.
+            f'"{executable} -u {script_path} > {remote_log_path} 2> /dev/null"')

         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
Review thread on the stderr redirection:

Reviewer: Do you mean [...]? What if there are genuine errors being printed to stderr?

Author: Yes, we are redirecting stderr to get rid of those messages.
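The effect of the changed redirection can be sketched outside SkyPilot. This is a minimal illustration, not SkyPilot's actual code; the script name and log path are made up:

import subprocess

# Old behavior: '2>&1' merges the job's stderr into the same log file as
# stdout, so ray's noisy cancellation messages end up in the user's log.
subprocess.run('python -u script.py > run.log 2>&1', shell=True)

# New behavior: stderr is discarded, so only stdout reaches the log file.
subprocess.run('python -u script.py > run.log 2> /dev/null', shell=True)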
This file was deleted.
@@ -1,4 +1,7 @@
 """Constants for SkyPilot."""
+from packaging import version
+
+import sky

 SKY_LOGS_DIRECTORY = '~/sky_logs'
 SKY_REMOTE_WORKDIR = '~/sky_workdir'
@@ -18,7 +21,7 @@
 # i.e. the PORT_DICT_STR above.
 SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
 SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
-SKY_REMOTE_RAY_VERSION = '2.4.0'
+SKY_REMOTE_RAY_VERSION = '2.9.3'

 # The name for the environment variable that stores the unique ID of the
 # current task. This will stay the same across multiple recoveries of the
@@ -66,19 +69,48 @@
 }

 # Install conda on the remote cluster if it is not already installed.
-# We do not install the latest conda with python 3.11 because ray has not
-# officially supported it yet.
+# We use conda with python 3.10 to be consistent across multiple clouds with
+# best effort.
 # https://github.com/ray-project/ray/issues/31606
 # We use python 3.10 to be consistent with the python version of the
 # AWS's Deep Learning AMI's default conda environment.
 CONDA_INSTALLATION_COMMANDS = (
     'which conda > /dev/null 2>&1 || '
-    '(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && '  # pylint: disable=line-too-long
+    '(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && '  # pylint: disable=line-too-long
     'bash Miniconda3-Linux-x86_64.sh -b && '
     'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
     'conda config --set auto_activate_base true); '
     'grep "# >>> conda initialize >>>" ~/.bashrc || conda init;')

+_sky_version = str(version.parse(sky.__version__))
+RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} ray status'
+# Install ray and skypilot on the remote cluster if they are not already
+# installed. {var} will be replaced with the actual value in
+# backend_utils.write_cluster_config.
+RAY_SKYPILOT_INSTALLATION_COMMANDS = (
+    '(type -a python | grep -q python3) || '
+    'echo \'alias python=python3\' >> ~/.bashrc;'
+    '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;'
+    'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
+    'source ~/.bashrc;'
+    # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
+    # ray cluster is already running, to avoid the ray cluster being restarted.
+    # NOTE: this will only work for clusters with the ray cluster on our latest
+    # ray port 6380, but existing clusters launched before #1790 that have the
+    # ray cluster on the default port 6379 will be upgraded and restarted.
+    f'{RAY_STATUS} || {{ pip3 list | grep "ray " | '
+    f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null || '
+    f'pip3 install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; }};'  # pylint: disable=line-too-long
+    '{ pip3 list | grep "skypilot " && '
+    '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || '  # pylint: disable=line-too-long
+    '{ pip3 uninstall skypilot -y; '
+    'pip3 install "$(echo ~/.sky/wheels/{sky_wheel_hash}/'
+    f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && '
+    'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || '
+    'exit 1; }; '

Review thread on the `{RAY_STATUS} ||` guard above:

Reviewer: Why do we need to do this conditional update now, but not before? Looking at the previous .j2, it seems like we always pip install the new ray -- but those new module files are not necessarily picked up by existing, live ray cluster processes? (My understanding of this previous behavior is fuzzy; please correct if wrong.)

Author: Because we keep [...]. Now, since some users may have an existing cluster running the ray cluster with [...].

Reviewer: We need some comments here like, "We do this guard to avoid any Ray client-server version mismatch. Specifically: if the existing ray cluster is an older version, say 2.4, and we pip install a new version, say 2.9, wheels here, then subsequent [...]"

Author: Added the comment! Thanks!
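To make the guard's control flow easier to follow, here is a hedged Python sketch of the same logic. It is a simplification, not SkyPilot's actual code: the real command is the shell one-liner above, and the port and version values are taken from the constants in this file:

import subprocess


def ensure_ray(desired: str = '2.9.3', port: int = 6380) -> None:
    # If a ray cluster is already live, leave ray alone entirely: upgrading
    # the pip package under a running 2.4 cluster would create a client
    # (2.9) / server (2.4) version mismatch, and restarting the cluster
    # would kill running jobs.
    status = subprocess.run(f'RAY_ADDRESS=127.0.0.1:{port} ray status',
                            shell=True, capture_output=True)
    if status.returncode == 0:
        return
    # Otherwise install the pinned version, but only if it is not already
    # the installed version.
    installed = subprocess.run(f'pip3 list | grep "ray " | grep {desired}',
                               shell=True, capture_output=True)
    if installed.returncode != 0:
        subprocess.run(
            f'pip3 install --exists-action w -U "ray[default]=={desired}"',
            shell=True, check=True)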
Review thread on the `{RAY_STATUS} ||` check before the `patch()` call:

Reviewer: Do L93-97 apply to this conditional check as well? Same question as above on why the conditional check now.

Reviewer: Add a pointer to those comments too?
@@ -1,9 +1,8 @@
-0a1,4
+0a1,3
-> # From https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/autoscaler/_private/autoscaler.py
+> # From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
 > # Sky patch changes:
 > # - enable upscaling_speed to be 0.0
->
-1068c1072
+1074c1077
 < if upscaling_speed:
 ---
 > if upscaling_speed is not None:  # NOTE(sky): enable 0.0
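Note that these .patch files are classic "normal" diff output rather than unified diffs: `0a1,3` appends new-file lines 1-3 after original line 0 (the top of the file), `1074c1077` replaces original line 1074 with new-file line 1077, `<` marks original lines, and `>` marks replacement lines. As a hedged sketch of how such a patch can be applied with the standard patch tool (both paths below are illustrative placeholders; SkyPilot's actual mechanism is the `sky.skylet.ray_patches.patch()` call installed above):

import subprocess

# Apply a normal-format diff to the installed ray source file in place.
subprocess.run(
    ['patch',
     '/path/to/site-packages/ray/autoscaler/_private/autoscaler.py',
     '/path/to/sky/skylet/ray_patches/autoscaler.py.patch'],
    check=True)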
@@ -1,11 +1,9 @@
 0a1,4
-> # Adapted from https://github.com/ray-project/ray/blob/ray-2.4.0/dashboard/modules/job/cli.py
+> # Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
 > # Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
 > # Otherwise, the output redirection ">" will not work.
 >
-4d7
-< from subprocess import list2cmdline
-212c215
+273c277
 < entrypoint=list2cmdline(entrypoint),
 ---
 > entrypoint=" ".join(entrypoint),
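The list2cmdline change is the fix for the ">" redirection problem mentioned in the patch header. A small illustration (the entrypoint value is made up): list2cmdline applies Windows-style quoting, so an entrypoint string containing spaces gets wrapped in double quotes and the shell no longer sees the redirection operator, whereas a plain join leaves it intact.

from subprocess import list2cmdline

entrypoint = ('echo hi > out.log',)  # hypothetical single-string entrypoint

print(list2cmdline(entrypoint))  # "echo hi > out.log" -- quoted as one word,
                                 # so the shell treats '>' as literal text
print(" ".join(entrypoint))      # echo hi > out.log -- '>' reaches the shell
                                 # and the redirection works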
This file was deleted.
@@ -1,10 +1,10 @@
 0a1,4
-> # Original file https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/_private/log_monitor.py
+> # Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
 > # Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
 > # We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
 >
-354c358,359
+377c381,382
 < next_line = next_line.rstrip("\r\n")
 ---
-> if next_line[-1] == "\n":
+> if next_line.endswith("\n"):
 > next_line = next_line[:-1]
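The behavioral difference is easy to see in isolation: rstrip("\r\n") strips every trailing \r and \n, destroying the carriage return that progress bars rely on to redraw in place, while the endswith check removes only the final newline. The sample line below is made up:

line = "downloading:  42%\r\n"

# Upstream ray 2.9.3: strips all trailing '\r' and '\n' characters,
# so the carriage return used by progress bars is lost.
print(repr(line.rstrip("\r\n")))   # 'downloading:  42%'

# SkyPilot patch (ray 1.10.0 behavior): drop only the final newline,
# preserving '\r' so the next line can overwrite this one in place.
if line.endswith("\n"):
    line = line[:-1]
print(repr(line))                  # 'downloading:  42%\r'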
@@ -1,17 +1,17 @@
 0a1,5
-> # From https://github.com/ray-project/ray/blob/ray-2.4.0/python/ray/autoscaler/_private/resource_demand_scheduler.py
+> # From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
 > # Sky patch changes:
 > # - no new nodes are allowed to be launched when the upscaling_speed is 0
 > # - comment out "assert not unfulfilled": this seems a buggy assert
 >
-450c455,458
+451c456,459
 < if upper_bound > 0:
 ---
 > # NOTE(sky): do not autoscale when upscaling speed is 0.
 > if self.upscaling_speed == 0:
 > upper_bound = 0
 > if upper_bound >= 0:
-594c602
+595c603
 < assert not unfulfilled
 ---
 > # assert not unfulfilled  # NOTE(sky): buggy assert.
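Both autoscaler patches exist so that a SkyPilot cluster can set upscaling_speed to 0.0 to disable autoscaling. The subtlety is that 0.0 is falsy in Python, so upstream's truthiness check silently treats an explicit 0.0 as "unset". A minimal illustration:

upscaling_speed = 0.0  # SkyPilot's intent: "never add nodes"

# Upstream check: 0.0 is falsy, so an explicit 0.0 is ignored as if unset.
if upscaling_speed:
    print("upscaling enabled")          # not reached

# Patched check: only None means "unset"; 0.0 is honored as a real value.
if upscaling_speed is not None:
    print("use the configured speed")   # reached, even for 0.0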
Review thread on moving the job error messages from stderr to stdout:

Reviewer: Q: What's the reason we previously printed these error logs to stderr but now print them to stdout?

Author: The reason we print to stdout is that the latest ray prints a bunch of useless output to stderr when a job is being cancelled, as shown in the PR description; we now redirect stderr to /dev/null to get rid of those outputs. I don't think we had a particular reason to print to stderr; it was just printing the errors to stderr.