diff --git a/.bazelrc b/.bazelrc index 03442e8ca113..90df1aa8e1f0 100644 --- a/.bazelrc +++ b/.bazelrc @@ -210,6 +210,8 @@ build:ubsan --linkopt -fsanitize=undefined build:ubsan --linkopt -fno-sanitize-recover=all build:ubsan --per_file_copt="-external/com_github_grpc_grpc/.*@-fsanitize=undefined" +build:cgroup --sandbox_writable_path=/sys/fs/cgroup --config=llvm + # Import local specific llvm config options, which can be generated by # ci/env/install-llvm-dependencies.sh try-import %workspace%/.llvm-local.bazelrc diff --git a/.buildkite/cicd.rayci.yml b/.buildkite/cicd.rayci.yml index 394cd758ad44..8bbd8006ce93 100644 --- a/.buildkite/cicd.rayci.yml +++ b/.buildkite/cicd.rayci.yml @@ -13,3 +13,15 @@ steps: depends_on: - oss-ci-base_test - forge + - label: ":coral: reef: privileged container tests" + commands: + - bazel run //ci/ray_ci:test_in_docker -- + //ci/ray_ci:test_privileged ci + --cache-test-results + --build-name oss-ci-base_test + --build-type cgroup + --privileged + instance_type: small + depends_on: + - oss-ci-base_test + - forge diff --git a/ci/ray_ci/BUILD.bazel b/ci/ray_ci/BUILD.bazel index 8ef0a90129a1..4c0617ff24a3 100644 --- a/ci/ray_ci/BUILD.bazel +++ b/ci/ray_ci/BUILD.bazel @@ -208,3 +208,16 @@ py_test( ci_require("pytest"), ], ) + +# This test is only run on linux machines +# with docker containers that have --privileged +# enabled. 
+py_test( + name = "test_privileged", + size = "small", + srcs = ["test_privileged.py"], + tags = [ + "team:ci" + ], + deps = [ci_require("pytest")], +) diff --git a/ci/ray_ci/container.py b/ci/ray_ci/container.py index 19a7346a961f..5b44899b3734 100644 --- a/ci/ray_ci/container.py +++ b/ci/ray_ci/container.py @@ -5,6 +5,7 @@ from typing import List, Tuple, Optional + _CUDA_COPYRIGHT = """ ========== == CUDA == diff --git a/ci/ray_ci/linux_container.py b/ci/ray_ci/linux_container.py index 2bfc91f368de..1e865269d25c 100644 --- a/ci/ray_ci/linux_container.py +++ b/ci/ray_ci/linux_container.py @@ -19,6 +19,7 @@ def __init__( volumes: Optional[List[str]] = None, envs: Optional[List[str]] = None, tmp_filesystem: Optional[str] = None, + privileged: bool = False, ) -> None: super().__init__(docker_tag, volumes, envs) @@ -26,6 +27,7 @@ def __init__( if tmp_filesystem != "tmpfs": raise ValueError("Only tmpfs is supported for tmp filesystem") self.tmp_filesystem = tmp_filesystem + self.privileged = privileged def install_ray( self, build_type: Optional[str] = None, mask: Optional[str] = None @@ -78,8 +80,11 @@ def get_run_command_extra_args( "--mount", f"type={self.tmp_filesystem},destination=/tmp", ] - for cap in _DOCKER_CAP_ADD: - extra_args += ["--cap-add", cap] + if self.privileged: + extra_args += ["--privileged"] + else: + for cap in _DOCKER_CAP_ADD: + extra_args += ["--cap-add", cap] if gpu_ids: extra_args += ["--gpus", f'"device={",".join(map(str, gpu_ids))}"'] extra_args += [ @@ -87,7 +92,6 @@ def get_run_command_extra_args( "/rayci", "--shm-size=2.5gb", ] - return extra_args def get_artifact_mount(self) -> Tuple[str, str]: diff --git a/ci/ray_ci/linux_tester_container.py b/ci/ray_ci/linux_tester_container.py index 8aac8aadb387..126c35c4c001 100644 --- a/ci/ray_ci/linux_tester_container.py +++ b/ci/ray_ci/linux_tester_container.py @@ -18,6 +18,7 @@ def __init__( build_type: Optional[str] = None, install_mask: Optional[str] = None, tmp_filesystem: Optional[str] = 
None,
+        privileged: bool = False,
     ) -> None:
         LinuxContainer.__init__(
             self,
@@ -28,6 +29,7 @@ def __init__(
                 "/var/run/docker.sock:/var/run/docker.sock",
             ],
             tmp_filesystem=tmp_filesystem,
+            privileged=privileged,
         )
         TesterContainer.__init__(
             self,
diff --git a/ci/ray_ci/test_privileged.py b/ci/ray_ci/test_privileged.py
new file mode 100644
index 000000000000..c1f71dfe7056
--- /dev/null
+++ b/ci/ray_ci/test_privileged.py
@@ -0,0 +1,64 @@
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+# In privileged containers, we expect the following
+# cgroupv1 is disabled
+# cgroupv2 is enabled and mounted on /sys/fs/cgroup
+# the user running tests has read and write access to the cgroup subtree
+# memory and cpu controllers are enabled
+
+_MOUNT_FILE_PATH = "/proc/mounts"
+_CGROUP2_PATH = "/sys/fs/cgroup"
+_CTRL_FILE = "cgroup.controllers"
+_EXPECTED_CTRLS = ["memory", "cpu"]
+
+
+# mount file format:
+# cgroup /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime 0 0
+def test_only_cgroupv2_mounted_rw():
+    """Check that cgroup2 is mounted read-write and no cgroup v1 mount exists."""
+    found_cgroupv2 = False
+    found_cgroupv1 = False
+    with open(Path(_MOUNT_FILE_PATH)) as f:
+        for line in f:
+            fields = line.split()
+            if len(fields) < 4:
+                # Not a complete mount entry; there is nothing to inspect.
+                continue
+            mount_point, fs_type, options = fields[1], fields[2], fields[3]
+            # Match "rw" as a whole option, not as a substring of another one.
+            if (
+                fs_type == "cgroup2"
+                and mount_point == _CGROUP2_PATH
+                and "rw" in options.split(",")
+            ):
+                found_cgroupv2 = True
+            if fs_type == "cgroup":
+                found_cgroupv1 = True
+    assert found_cgroupv2, f"cgroup2 is not mounted read-write on {_CGROUP2_PATH}"
+    assert not found_cgroupv1, "Found a cgroup v1 mount; only cgroup v2 is expected"
+
+
+def test_cgroupv2_rw_for_test_user():
+    """Check that the test user can read and write the cgroup2 mount point."""
+    assert os.access(_CGROUP2_PATH, os.R_OK), f"{_CGROUP2_PATH} is not readable"
+    assert os.access(_CGROUP2_PATH, os.W_OK), f"{_CGROUP2_PATH} is not writable"
+
+
+def test_cgroupv2_controllers_enabled():
+    """Check that every expected controller is listed in cgroup.controllers."""
+    with open(os.path.join(_CGROUP2_PATH, _CTRL_FILE)) as f:
+        enabled = f.readlines()
+    assert len(enabled) == 1, f"Expected one line in {_CTRL_FILE}, got {enabled}"
+    enabled_ctrls = enabled[0].split()
+    for expected_ctrl in _EXPECTED_CTRLS:
+        assert (
+            expected_ctrl in enabled_ctrls
+        ), f"Expected {expected_ctrl} to be enabled for cgroups2, but it is not"
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))
diff --git a/ci/ray_ci/test_tester.py b/ci/ray_ci/test_tester.py
index
f45ce4e22e78..98835c94af7c 100644 --- a/ci/ray_ci/test_tester.py +++ b/ci/ray_ci/test_tester.py @@ -44,6 +44,28 @@ def test_get_tag_matcher() -> None: ) +def test_linux_privileged() -> None: + with mock.patch( + "ci.ray_ci.linux_tester_container.LinuxTesterContainer.install_ray", + return_value=None, + ): + container = _get_container( + team="core", + operating_system="linux", + workers=3, + worker_id=1, + parallelism_per_worker=2, + network=None, + gpus=0, + tmp_filesystem=None, + privileged=True, + ) + assert ( + container.privileged + and "--privileged" in container.get_run_command_extra_args() + ) + + def test_get_container() -> None: with mock.patch( "ci.ray_ci.linux_tester_container.LinuxTesterContainer.install_ray", diff --git a/ci/ray_ci/tester.py b/ci/ray_ci/tester.py index d33f5f58db31..090ee5bc74c1 100644 --- a/ci/ray_ci/tester.py +++ b/ci/ray_ci/tester.py @@ -159,6 +159,7 @@ "asan-clang", "ubsan", "tsan-clang", + "cgroup", # java build types "java", # do not build ray @@ -188,6 +189,13 @@ type=str, help=("Filesystem to use for /tmp"), ) +@click.option( + "--privileged", + is_flag=True, + show_default=True, + default=False, + help="Run the test in a privileged Docker container", +) def main( targets: List[str], team: str, @@ -212,6 +220,7 @@ def main( install_mask: Optional[str], bisect_run_test_target: Optional[str], tmp_filesystem: Optional[str], + privileged: bool, ) -> None: if not bazel_workspace_dir: raise Exception("Please use `bazelisk run //ci/ray_ci`") @@ -241,6 +250,7 @@ def main( build_type=build_type, skip_ray_installation=skip_ray_installation, install_mask=install_mask, + privileged=privileged, ) if build_only: sys.exit(0) @@ -291,6 +301,7 @@ def _get_container( build_type: Optional[str] = None, install_mask: Optional[str] = None, skip_ray_installation: bool = False, + privileged: bool = False, ) -> TesterContainer: shard_count = workers * parallelism_per_worker shard_start = worker_id * parallelism_per_worker @@ -312,6 +323,7 @@ def 
_get_container( build_type=build_type, tmp_filesystem=tmp_filesystem, install_mask=install_mask, + privileged=privileged, ) if operating_system == "windows": diff --git a/ci/ray_ci/tester_container.py b/ci/ray_ci/tester_container.py index 8db4d24c1dab..19be0e3c2fd3 100644 --- a/ci/ray_ci/tester_container.py +++ b/ci/ray_ci/tester_container.py @@ -252,6 +252,8 @@ def _run_tests_in_docker( test_cmd += "--config=ubsan " if self.build_type == "tsan-clang": test_cmd += "--config=tsan-clang " + if self.build_type == "cgroup": + test_cmd += "--config=cgroup " for env in test_envs: test_cmd += f"--test_env {env} " if test_arg: diff --git a/ci/ray_ci/tests.env.Dockerfile b/ci/ray_ci/tests.env.Dockerfile index 70cd39a879ca..ab3f6c052c1b 100644 --- a/ci/ray_ci/tests.env.Dockerfile +++ b/ci/ray_ci/tests.env.Dockerfile @@ -29,7 +29,7 @@ if [[ "$BUILD_TYPE" == "skip" || "${BUILD_TYPE}" == "ubsan" ]]; then exit 0 fi -if [[ "$BUILD_TYPE" == "clang" || "$BUILD_TYPE" == "asan-clang" || "$BUILD_TYPE" == "tsan-clang" ]]; then +if [[ "$BUILD_TYPE" == "clang" || "$BUILD_TYPE" == "asan-clang" || "$BUILD_TYPE" == "tsan-clang" || "$BUILD_TYPE" == "cgroup" ]]; then echo "--- Install LLVM dependencies (and skip building ray package)" bash ci/env/install-llvm-binaries.sh exit 0