From 40000bff7a18a1986ffb955473a53c5a93622bba Mon Sep 17 00:00:00 2001
From: SangBin Cho <rkooo567@gmail.com>
Date: Wed, 23 Aug 2023 22:05:22 +0900
Subject: [PATCH] [runtime env] Fix Ray hangs when nonexistent conda
 environment is specified #28105 (#34956)

When a conda name is given to the runtime env, we assume the env already exits. However, there are times the env doesn't exist, and if that happens, it hangs forever. This fixes the issue by always checking conda env list before creating a conda runtime env.

Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
---
 python/ray/_private/runtime_env/conda.py      | 44 ++++++++++++++-----
 .../ray/_private/runtime_env/conda_utils.py   |  1 +
 python/ray/tests/test_client_proxy.py         |  9 +++-
 .../ray/tests/test_runtime_env_complicated.py | 31 +++++++++++++
 .../tests/test_runtime_env_conda_and_pip.py   | 18 ++++++++
 5 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/python/ray/_private/runtime_env/conda.py b/python/ray/_private/runtime_env/conda.py
index f428163602dc..a28cc02cb046 100644
--- a/python/ray/_private/runtime_env/conda.py
+++ b/python/ray/_private/runtime_env/conda.py
@@ -18,10 +18,12 @@
     create_conda_env_if_needed,
     delete_conda_env,
     get_conda_activate_commands,
+    get_conda_env_list,
 )
 from ray._private.runtime_env.context import RuntimeEnvContext
 from ray._private.runtime_env.packaging import Protocol, parse_uri
 from ray._private.runtime_env.plugin import RuntimeEnvPlugin
+from ray._private.runtime_env.validation import parse_and_validate_conda
 from ray._private.utils import (
     get_directory_size_bytes,
     get_master_wheel_url,
@@ -222,7 +224,7 @@ def get_uri(runtime_env: Dict) -> Optional[str]:
             # we don't track them with URIs.
             uri = None
         elif isinstance(conda, dict):
-            uri = "conda://" + _get_conda_env_hash(conda_dict=conda)
+            uri = f"conda://{_get_conda_env_hash(conda_dict=conda)}"
         else:
             raise TypeError(
                 "conda field received by RuntimeEnvAgent must be "
@@ -269,6 +271,12 @@ def __init__(self, resources_dir: str):
         self._installs_and_deletions_file_lock = os.path.join(
             self._resources_dir, "ray-conda-installs-and-deletions.lock"
         )
+        # A set of named conda environments (instead of yaml or dict)
+        # that are validated to exist.
+        # NOTE: It has to be only used within the same thread, which
+        # is an event loop.
+        # Also, we don't need to GC this field because it is pretty small.
+        self._validated_named_conda_env = set()
 
     def _get_path_from_hash(self, hash: str) -> str:
         """Generate a path from the hash of a conda or pip spec.
@@ -319,18 +327,34 @@ async def create(
         context: RuntimeEnvContext,
         logger: logging.Logger = default_logger,
     ) -> int:
-        if uri is None:
-            # The "conda" field is the name of an existing conda env, so no
-            # need to create one.
-            # TODO(architkulkarni): Try "conda activate" here to see if the
-            # env exists, and raise an exception if it doesn't.
+        if not runtime_env.has_conda():
             return 0
 
-        # Currently create method is still a sync process, to avoid blocking
-        # the loop, need to run this function in another thread.
-        # TODO(Catch-Bull): Refactor method create into an async process, and
-        # make this method running in current loop.
         def _create():
+            result = parse_and_validate_conda(runtime_env.get("conda"))
+
+            if isinstance(result, str):
+                # The conda env name is given.
+                # In this case, we only verify if the given
+                # conda env exists.
+
+                # If the env is already validated, do nothing.
+                if result in self._validated_named_conda_env:
+                    return 0
+
+                conda_env_list = get_conda_env_list()
+                envs = [Path(env).name for env in conda_env_list]
+                if result not in envs:
+                    raise ValueError(
+                        f"The given conda environment '{result}' "
+                        f"from the runtime env {runtime_env} doesn't "
+                        "exist from the output of `conda env list --json`. "
+                        "You can only specify an env that already exists. "
+                        f"Please make sure to create an env {result} "
+                    )
+                self._validated_named_conda_env.add(result)
+                return 0
+
             logger.debug(
                 "Setting up conda for runtime_env: " f"{runtime_env.serialize()}"
             )
diff --git a/python/ray/_private/runtime_env/conda_utils.py b/python/ray/_private/runtime_env/conda_utils.py
index 291f20c2f407..757ca4a998dd 100644
--- a/python/ray/_private/runtime_env/conda_utils.py
+++ b/python/ray/_private/runtime_env/conda_utils.py
@@ -100,6 +100,7 @@ def create_conda_env_if_needed(
     """
     if logger is None:
         logger = logging.getLogger(__name__)
+
     conda_path = get_conda_bin_executable("conda")
     try:
         exec_cmd([conda_path, "--help"], throw_on_error=False)
diff --git a/python/ray/tests/test_client_proxy.py b/python/ray/tests/test_client_proxy.py
index 16211fe01ab4..5faa3b00243d 100644
--- a/python/ray/tests/test_client_proxy.py
+++ b/python/ray/tests/test_client_proxy.py
@@ -88,11 +88,16 @@ def test_proxy_manager_bad_startup(shutdown_only):
     """
     pm, free_ports = start_ray_and_proxy_manager(n_ports=2)
     client = "client1"
+    ctx = ray.init(ignore_reinit_error=True)
+    port_to_conflict = ctx.dashboard_url.split(":")[1]
 
     pm.create_specific_server(client)
-    assert not pm.start_specific_server(
+    # Intentionally bind to the wrong port so that the
+    # server will crash.
+    pm._get_server_for_client(client).port = port_to_conflict
+    pm.start_specific_server(
         client,
-        JobConfig(runtime_env={"conda": "conda-env-that-sadly-does-not-exist"}),
+        JobConfig(),
     )
     # Wait for reconcile loop
     time.sleep(2)
diff --git a/python/ray/tests/test_runtime_env_complicated.py b/python/ray/tests/test_runtime_env_complicated.py
index 5a7361a135de..56463c79fc3b 100644
--- a/python/ray/tests/test_runtime_env_complicated.py
+++ b/python/ray/tests/test_runtime_env_complicated.py
@@ -213,6 +213,37 @@ def wrapped_version(self):
         assert ray.get(actor.wrapped_version.remote()) == package_version
 
 
+@pytest.mark.skipif(
+    os.environ.get("CONDA_DEFAULT_ENV") is None,
+    reason="must be run from within a conda environment",
+)
+def test_task_conda_env_validation_cached(conda_envs, shutdown_only):
+    """Verify that when a task is running with the same conda env
+    it doesn't validate if env exists.
+    """
+    # The first run would be slower because we need to validate
+    # if the package exists.
+    ray.init()
+    version = EMOJI_VERSIONS[0]
+    runtime_env = {"conda": f"package-{version}"}
+    task = get_emoji_version.options(runtime_env=runtime_env)
+    s = time.time()
+    ray.get(task.remote())
+    first_run = time.time() - s
+    # Typically takes 1~2 seconds.
+    print("First run took", first_run)
+
+    # We should verify this doesn't happen
+    # from the second run.
+    s = time.time()
+    for _ in range(10):
+        ray.get(task.remote())
+    second_10_runs = time.time() - s
+    # Typicall takes less than 100ms.
+    print("second 10 runs took", second_10_runs)
+    assert second_10_runs < first_run
+
+
 @pytest.mark.skipif(
     os.environ.get("CONDA_DEFAULT_ENV") is None,
     reason="must be run from within a conda environment",
diff --git a/python/ray/tests/test_runtime_env_conda_and_pip.py b/python/ray/tests/test_runtime_env_conda_and_pip.py
index 62cbdf2c7e75..fb8323ff11b9 100644
--- a/python/ray/tests/test_runtime_env_conda_and_pip.py
+++ b/python/ray/tests/test_runtime_env_conda_and_pip.py
@@ -212,6 +212,24 @@ def f():
     assert ray.get(f.remote()) == 0
 
 
+def test_runtime_env_conda_not_exists_not_hang(shutdown_only):
+    """Verify when the conda env doesn't exist, it doesn't hang Ray."""
+    ray.init(runtime_env={"conda": "env_which_does_not_exist"})
+
+    @ray.remote
+    def f():
+        return 1
+
+    refs = [f.remote() for _ in range(5)]
+
+    for ref in refs:
+        with pytest.raises(ray.exceptions.RuntimeEnvSetupError) as exc_info:
+            ray.get(ref)
+        assert "doesn't exist from the output of `conda env list --json`" in str(
+            exc_info.value
+        )  # noqa
+
+
 def test_get_requirements_file():
     """Unit test for _PathHelper.get_requirements_file."""
     with tempfile.TemporaryDirectory() as tmpdir: