[Core] [runtime env] [logging] Confine runtime env logs to `runtime_e…

…nv*.log` (ray-project#29985) User's runtime envs might contain sensitive info like secret environment variables. The goal is to confine all sensitive info to certain log files, and have other system level log files that are guaranteed not to have user info. This PR ensures all runtime env logs are in runtime_env_agent.log or runtime_env_setup-*.log. In the C++ code, this PR confines all logs that contain the runtime env content to DEBUG level logs. In the Python code, this PR removes all runtime_env logs from dashboard_agent.log by introducing a separate logger for the agent.py process. (Previously it was using the root logger, so all agent logs would go to dashboard_agent.py.). To prevent changing logging behavior in non-runtime-env agents, I've manually added this logger to those agents. The runtime_env_agent uses its own logger which logs to the new file runtime_env_agent.log. Logs from individual runtime env installations are still in runtime_env_setup-*.log. Signed-off-by: tmynn <hovhannes.tamoyan@gmail.com>
ju2ez · Jan 25, 2023 · d9ab277 · d9ab277
1 parent c908f5b
commit d9ab277
Show file tree

Hide file tree

Showing 13 changed files with 203 additions and 26 deletions.
diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml
@@ -568,7 +568,7 @@
 
 - label: ":octopus: Tune multinode tests"
   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
-  instance_size: small
+  instance_size: medium
   commands:
     - LINUX_WHEELS=1 ./ci/ci.sh build
     - mkdir -p ~/.docker/cli-plugins/ && curl -SL https://github.com/docker/compose/releases/download/v2.0.1/docker-compose-linux-x86_64 -o ~/.docker/cli-plugins/docker-compose && chmod +x ~/.docker/cli-plugins/docker-compose

diff --git a/ci/ci.sh b/ci/ci.sh
@@ -786,9 +786,9 @@ run_minimal_test() {
   # shellcheck disable=SC2086
   bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_ray_minimal
   # shellcheck disable=SC2086
-  bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env
+  bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env
   # shellcheck disable=SC2086
-  bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_2
+  bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_2
   # shellcheck disable=SC2086
   bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_utils
 

diff --git a/dashboard/http_server_agent.py b/dashboard/http_server_agent.py
@@ -84,3 +84,4 @@ async def start(self, modules):
     async def cleanup(self):
         # Wait for finish signal.
         await self.runner.cleanup()
+        await self.http_session.close()
diff --git a/dashboard/modules/runtime_env/runtime_env_agent.py b/dashboard/modules/runtime_env/runtime_env_agent.py
@@ -7,7 +7,9 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Set, Tuple
-from ray._private.ray_constants import DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS
+from ray._private.ray_constants import (
+    DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS,
+)
 
 import ray.dashboard.consts as dashboard_consts
 import ray.dashboard.modules.runtime_env.runtime_env_consts as runtime_env_consts
@@ -167,6 +169,8 @@ class RuntimeEnvAgent(
         dashboard_agent: The DashboardAgent object contains global config.
     """
 
+    LOG_FILENAME = "runtime_env_agent.log"
+
     def __init__(self, dashboard_agent):
         super().__init__(dashboard_agent)
         self._runtime_env_dir = dashboard_agent.runtime_env_dir
@@ -214,6 +218,14 @@ def __init__(self, dashboard_agent):
         )
 
         self._logger = default_logger
+        self._logging_params.update(filename=self.LOG_FILENAME)
+        self._logger = setup_component_logger(
+            logger_name=default_logger.name, **self._logging_params
+        )
+        # Don't propagate logs to the root logger, because these logs
+        # might contain sensitive information. Instead, these logs should
+        # be confined to the runtime env agent log file `self.LOG_FILENAME`.
+        self._logger.propagate = False
 
     def uris_parser(self, runtime_env):
         result = list()
@@ -250,6 +262,7 @@ def get_or_create_logger(self, job_id: bytes):
             params = self._logging_params.copy()
             params["filename"] = f"runtime_env_setup-{job_id}.log"
             params["logger_name"] = f"runtime_env_{job_id}"
+            params["propagate"] = False
             per_job_logger = setup_component_logger(**params)
             self._per_job_logger_cache[job_id] = per_job_logger
         return self._per_job_logger_cache[job_id]

diff --git a/doc/source/ray-observability/ray-logging.rst b/doc/source/ray-observability/ray-logging.rst
@@ -166,6 +166,9 @@ Here's a Ray log directory structure. Note that ``.out`` is logs from stdout/std
 - ``raylet.[out|err]``: A log file of raylets.
 - ``redis-shard_[shard_index].[out|err]``: Redis shard log files.
 - ``redis.[out|err]``: Redis log files.
+- ``runtime_env_agent.log``: Every Ray node has one agent that manages :ref:`runtime environment <runtime-environments>` creation, deletion and caching.
+  This is the log file of the agent containing logs of create/delete requests and cache hits and misses.
+  For the logs of the actual installations (including e.g. ``pip install`` logs), see the ``runtime_env_setup-[job_id].log`` file (see below).
 - ``runtime_env_setup-[job_id].log``: Logs from installing :ref:`runtime environments <runtime-environments>` for a task, actor or job.  This file will only be present if a runtime environment is installed.
 - ``runtime_env_setup-ray_client_server_[port].log``: Logs from installing :ref:`runtime environments <runtime-environments>` for a job when connecting via :ref:`Ray Client <ray-client-ref>`.
 - ``worker-[worker_id]-[job_id]-[pid].[out|err]``: Python/Java part of Ray drivers and workers. All of stdout and stderr from tasks/actors are streamed here. Note that job_id is an id of the driver.- 

diff --git a/python/ray/_private/ray_logging.py b/python/ray/_private/ray_logging.py
@@ -39,6 +39,7 @@ def setup_component_logger(
     max_bytes,
     backup_count,
     logger_name=None,
+    propagate=True,
 ):
     """Configure the root logger that is used for Ray's python components.
 
@@ -54,8 +55,9 @@ def setup_component_logger(
             to stderr.
         max_bytes: Same argument as RotatingFileHandler's maxBytes.
         backup_count: Same argument as RotatingFileHandler's backupCount.
-        logger_name: used to create or get the correspoding
+        logger_name: Used to create or get the correspoding
             logger in getLogger call. It will get the root logger by default.
+        propagate: Whether to propagate the log to the parent logger.
     Returns:
         the created or modified logger.
     """
@@ -74,6 +76,7 @@ def setup_component_logger(
     logger.setLevel(logging_level)
     handler.setFormatter(logging.Formatter(logging_format))
     logger.addHandler(handler)
+    logger.propagate = propagate
     return logger
 
 

diff --git a/python/ray/tests/test_runtime_env_2.py b/python/ray/tests/test_runtime_env_2.py
@@ -1,11 +1,18 @@
 import pytest
 import time
 import sys
+import fnmatch
+import os
+from typing import List
 
 import ray
+from ray.dashboard.modules.job.common import JobStatus
 from ray.exceptions import RuntimeEnvSetupError
 from ray.runtime_env import RuntimeEnv, RuntimeEnvConfig
+from ray._private.test_utils import wait_for_condition
 
+if os.environ.get("RAY_MINIMAL") != "1":
+    from ray.job_submission import JobSubmissionClient
 
 bad_runtime_env_cache_ttl_seconds = 10
 
@@ -125,6 +132,115 @@ def run(runtime_env):
         run(runtime_env)
 
 
+def assert_no_user_info_in_logs(user_info: str, file_whitelist: List[str] = None):
+    """Assert that the user info is not in the logs, except for any file that
+    glob pattern matches a file in the whitelist.
+    """
+    if file_whitelist is None:
+        file_whitelist = []
+
+    log_dir = os.path.join(ray.worker._global_node.get_session_dir_path(), "logs")
+
+    for root, dirs, files in os.walk(log_dir):
+        for file in files:
+            if any(fnmatch.fnmatch(file, pattern) for pattern in file_whitelist):
+                continue
+            # Some lines contain hex IDs, so ignore the UTF decoding errors.
+            with open(os.path.join(root, file), "r", errors="ignore") as f:
+                for line in f:
+                    assert user_info not in line, (file, user_info, line)
+
+
+class TestNoUserInfoInLogs:
+    """Test that no user info (e.g. runtime env env vars) show up in the logs."""
+
+    def test_assert_no_user_info_in_logs(self, shutdown_only):
+        """Test assert_no_user_info_in_logs does not spuriously pass."""
+        ray.init()
+        with pytest.raises(AssertionError):
+            assert_no_user_info_in_logs("ray")
+        assert_no_user_info_in_logs("ray", file_whitelist=["*"])
+
+    def test_basic(self, start_cluster, monkeypatch, tmp_path, shutdown_only):
+        """Test driver with and without Ray Client."""
+
+        cluster, address = start_cluster
+
+        # Runtime env logs may still appear in debug logs. Check the debug flag is off.
+        assert os.getenv("RAY_BACKEND_LOG_LEVEL") != "debug"
+
+        # Reuse the same "secret" for working_dir, pip, env_vars for convenience.
+        USER_SECRET = "pip-install-test"
+        working_dir = tmp_path / USER_SECRET
+        working_dir.mkdir()
+        runtime_env = {
+            "working_dir": str(working_dir),
+            "pip": [USER_SECRET],
+            # Append address to ensure different runtime envs for client and non-client
+            # code paths to force reinstalling the runtime env instead of reusing it.
+            "env_vars": {USER_SECRET: USER_SECRET + str(address)},
+        }
+        ray.init(runtime_env=runtime_env)
+
+        # Run a function to ensure the runtime env is set up.
+        @ray.remote
+        def f():
+            return os.environ.get(USER_SECRET)
+
+        assert USER_SECRET in ray.get(f.remote())
+
+        @ray.remote
+        class Foo:
+            def __init__(self):
+                self.x = os.environ.get(USER_SECRET)
+
+            def get_x(self):
+                return self.x
+
+        foo = Foo.remote()
+        assert USER_SECRET in ray.get(foo.get_x.remote())
+
+        # Generate runtime env failure logs too.
+        bad_runtime_env = {
+            "pip": ["pkg-which-sadly-does-not-exist"],
+            "env_vars": {USER_SECRET: USER_SECRET},
+        }
+        with pytest.raises(Exception):
+            ray.get(f.options(runtime_env=bad_runtime_env).remote())
+        with pytest.raises(Exception):
+            foo2 = Foo.options(runtime_env=bad_runtime_env).remote()
+            ray.get(foo2.get_x.remote())
+
+        using_ray_client = address.startswith("ray://")
+
+        # Test Ray Jobs API codepath. Skip for ray_minimal because Ray Jobs API
+        # requires ray[default].
+        if not using_ray_client and os.environ.get("RAY_MINIMAL") != "1":
+            client = JobSubmissionClient()
+            job_id_good_runtime_env = client.submit_job(
+                entrypoint="echo 'hello world'", runtime_env=runtime_env
+            )
+            job_id_bad_runtime_env = client.submit_job(
+                entrypoint="echo 'hello world'", runtime_env=bad_runtime_env
+            )
+
+            def job_succeeded(job_id):
+                job_status = client.get_job_status(job_id)
+                return job_status == JobStatus.SUCCEEDED
+
+            def job_failed(job_id):
+                job_status = client.get_job_status(job_id)
+                return job_status == JobStatus.FAILED
+
+            wait_for_condition(lambda: job_succeeded(job_id_good_runtime_env))
+            wait_for_condition(lambda: job_failed(job_id_bad_runtime_env), timeout=30)
+
+        with pytest.raises(AssertionError):
+            assert_no_user_info_in_logs(USER_SECRET)
+
+        assert_no_user_info_in_logs(USER_SECRET, file_whitelist=["runtime_env*.log"])
+
+
 if __name__ == "__main__":
     import os
     import sys

diff --git a/python/ray/tests/test_runtime_env_plugin.py b/python/ray/tests/test_runtime_env_plugin.py
@@ -395,10 +395,15 @@ def f():
 
     # Check that the warning is logged.
     session_dir = ray._private.worker.global_worker.node.address_info["session_dir"]
-    dashboard_agent_log_path = Path(session_dir) / "logs" / "dashboard_agent.log"
-    wait_for_condition(lambda: dashboard_agent_log_path.exists())
-    with open(dashboard_agent_log_path, "r") as f:
-        wait_for_condition(lambda: "unexpected_field is not recognized" in f.read())
+    log_path = Path(session_dir) / "logs"
+
+    # Check that a warning appears in some "runtime_env_setup*.log"
+    wait_for_condition(
+        lambda: any(
+            "unexpected_field is not recognized" in open(f).read()
+            for f in log_path.glob("runtime_env_setup*.log")
+        )
+    )
 
 
 URI_CACHING_TEST_PLUGIN_CLASS_PATH = (

diff --git a/src/ray/common/task/task_spec.cc b/src/ray/common/task/task_spec.cc
@@ -477,10 +477,26 @@ std::string TaskSpecification::DebugString() const {
     stream << ", max_retries=" << MaxRetries();
   }
 
-  // Print runtime env.
+  // Print non-sensitive runtime env info.
   if (HasRuntimeEnv()) {
     const auto &runtime_env_info = RuntimeEnvInfo();
-    stream << ", serialized_runtime_env=" << SerializedRuntimeEnv();
+    stream << ", runtime_env_hash=" << GetRuntimeEnvHash();
+    if (runtime_env_info.has_runtime_env_config()) {
+      stream << ", eager_install="
+             << runtime_env_info.runtime_env_config().eager_install();
+      stream << ", setup_timeout_seconds="
+             << runtime_env_info.runtime_env_config().setup_timeout_seconds();
+    }
+  }
+
+  return stream.str();
+}
+
+std::string TaskSpecification::RuntimeEnvDebugString() const {
+  std::ostringstream stream;
+  if (HasRuntimeEnv()) {
+    const auto &runtime_env_info = RuntimeEnvInfo();
+    stream << "serialized_runtime_env=" << SerializedRuntimeEnv();
     const auto &uris = runtime_env_info.uris();
     if (!uris.working_dir_uri().empty() || uris.py_modules_uris().size() > 0) {
       stream << ", runtime_env_uris=";
@@ -493,14 +509,14 @@ std::string TaskSpecification::DebugString() const {
       // Erase the last ":"
       stream.seekp(-1, std::ios_base::end);
     }
+    stream << ", runtime_env_hash=" << GetRuntimeEnvHash();
     if (runtime_env_info.has_runtime_env_config()) {
       stream << ", eager_install="
              << runtime_env_info.runtime_env_config().eager_install();
       stream << ", setup_timeout_seconds="
              << runtime_env_info.runtime_env_config().setup_timeout_seconds();
     }
   }
-
   return stream.str();
 }
 

diff --git a/src/ray/common/task/task_spec.h b/src/ray/common/task/task_spec.h
@@ -378,6 +378,10 @@ class TaskSpecification : public MessageWrapper<rpc::TaskSpec> {
 
   std::string DebugString() const;
 
+  // A one-line summary of the runtime environment for the task. May contain sensitive
+  // information such as user-specified environment variables.
+  std::string RuntimeEnvDebugString() const;
+
   // A one-word summary of the task func as a call site (e.g., __main__.foo).
   std::string CallSiteString() const;
 

diff --git a/src/ray/core_worker/task_manager.cc b/src/ray/core_worker/task_manager.cc
@@ -521,6 +521,8 @@ void TaskManager::FailPendingTask(const TaskID &task_id,
       } else {
         RAY_LOG(INFO) << "Task failed: " << spec.DebugString();
       }
+      RAY_LOG(DEBUG) << "Runtime env for task " << spec.TaskId() << " is "
+                     << spec.RuntimeEnvDebugString();
     }
   }