diff --git a/debian/paasta-tools.links b/debian/paasta-tools.links index 9c9720a1ff..367a423316 100644 --- a/debian/paasta-tools.links +++ b/debian/paasta-tools.links @@ -13,7 +13,6 @@ opt/venvs/paasta-tools/bin/generate_deployments_for_service.py usr/bin/generate_ opt/venvs/paasta-tools/bin/generate_services_file.py usr/bin/generate_services_file opt/venvs/paasta-tools/bin/generate_services_yaml.py usr/bin/generate_services_yaml opt/venvs/paasta-tools/bin/generate_authenticating_services.py usr/bin/generate_authenticating_services -opt/venvs/paasta-tools/bin/kill_orphaned_docker_containers.py usr/bin/kill_orphaned_docker_containers opt/venvs/paasta-tools/bin/kubernetes_remove_evicted_pods.py usr/bin/kubernetes_remove_evicted_pods opt/venvs/paasta-tools/bin/paasta-api usr/bin/paasta-api opt/venvs/paasta-tools/bin/paasta-fsm usr/bin/paasta-fsm diff --git a/docs/source/generated/paasta_tools.monitoring.kill_orphaned_docker_containers.rst b/docs/source/generated/paasta_tools.monitoring.kill_orphaned_docker_containers.rst deleted file mode 100644 index 6bac87c0f6..0000000000 --- a/docs/source/generated/paasta_tools.monitoring.kill_orphaned_docker_containers.rst +++ /dev/null @@ -1,7 +0,0 @@ -paasta\_tools.monitoring.kill\_orphaned\_docker\_containers module -================================================================== - -.. automodule:: paasta_tools.monitoring.kill_orphaned_docker_containers - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/generated/paasta_tools.monitoring.rst b/docs/source/generated/paasta_tools.monitoring.rst index ce4749e5d8..3efa3afd1d 100644 --- a/docs/source/generated/paasta_tools.monitoring.rst +++ b/docs/source/generated/paasta_tools.monitoring.rst @@ -7,7 +7,6 @@ Submodules .. toctree:: paasta_tools.monitoring.check_k8s_api_performance - paasta_tools.monitoring.kill_orphaned_docker_containers Module contents --------------- diff --git a/paasta_tools/monitoring/kill_orphaned_docker_containers.py b/paasta_tools/monitoring/kill_orphaned_docker_containers.py deleted file mode 100755 index 3a2d06e00c..0000000000 --- a/paasta_tools/monitoring/kill_orphaned_docker_containers.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python -import argparse -import sys - -import a_sync - -from paasta_tools import mesos_tools -from paasta_tools.utils import get_docker_client -from paasta_tools.utils import get_running_mesos_docker_containers - - -def parse_args(): - parser = argparse.ArgumentParser( - description=( - "Cross references running containers with task ids from the mesos slave" - " and optionally kills them." - ) - ) - parser.add_argument( - "-f", - "--force", - action="store_true", - help="Actually kill the containers. (defaults to dry-run)", - ) - args = parser.parse_args() - return args - - -@a_sync.to_blocking -async def main(): - args = parse_args() - docker_client = get_docker_client() - - running_mesos_task_ids = [ - task["id"] - for task in mesos_tools.filter_running_tasks( - await mesos_tools.get_running_tasks_from_frameworks("") - ) - ] - running_mesos_docker_containers = get_running_mesos_docker_containers() - - orphaned_containers = [] - for container in running_mesos_docker_containers: - mesos_task_id = mesos_tools.get_mesos_id_from_container( - container=container, client=docker_client - ) - if mesos_task_id not in running_mesos_task_ids: - orphaned_containers.append( - (container["Names"][0].strip("/"), mesos_task_id) - ) - - if orphaned_containers: - print( - "CRIT: Docker containers are orphaned: {}{}".format( - ", ".join( - f"{container_name} ({mesos_task_id})" - for container_name, mesos_task_id in orphaned_containers - ), - " and will be killed" if args.force else "", - ) - ) - if args.force: - for container_name, mesos_task_id in orphaned_containers: - docker_client.kill(container_name) - sys.exit(1) - else: - print("OK: All mesos task IDs accounted for") - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/paasta_tools/oom_logger.py b/paasta_tools/oom_logger.py index ecd4d7c8f2..ee93381f5a 100644 --- a/paasta_tools/oom_logger.py +++ b/paasta_tools/oom_logger.py @@ -33,10 +33,16 @@ destination(paasta_oom_logger); }; """ +import argparse +import json import re import sys from collections import namedtuple +from typing import Dict +import grpc +from containerd.services.containers.v1 import containers_pb2 +from containerd.services.containers.v1 import containers_pb2_grpc from docker.errors import APIError from paasta_tools.cli.utils import get_instance_config @@ -76,6 +82,16 @@ ) +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="paasta_oom_logger") + parser.add_argument( + "--containerd", + action="store_true", + help="Use containerd to inspect containers, otherwise use docker", + ) + return parser.parse_args() + + def capture_oom_events_from_stdin(): process_name_regex = re.compile( r"^\d+\s[a-zA-Z0-9\-]+\s.*\]\s(.+)\sinvoked\soom-killer:" @@ -93,6 +109,25 @@ def capture_oom_events_from_stdin(): """, re.VERBOSE, ) + oom_regex_kubernetes_containerd_systemd_cgroup = re.compile( + r""" + ^(\d+)\s # timestamp + ([a-zA-Z0-9\-]+) # hostname + \s.*oom-kill:.*task_memcg=/.*\.slice/.* # loosely match systemd slice and containerid + cri-containerd:(\w{64}).*$ # containerid + """, + re.VERBOSE, + ) + + oom_regex_kubernetes_containerd_systemd_cgroup_structured = re.compile( + r""" + ^(\d+)\s # timestamp + ([a-zA-Z0-9\-]+) # hostname + \s.*oom-kill:.*task_memcg=/kubepods\.slice/.* # match systemd slice and containerid + cri-containerd-(\w{64}).*$ # containerid + """, + re.VERBOSE, + ) oom_regex_kubernetes_structured = re.compile( r""" ^(\d+)\s # timestamp @@ -115,6 +150,8 @@ def capture_oom_events_from_stdin(): oom_regex_kubernetes, oom_regex_kubernetes_structured, oom_regex_kubernetes_systemd_cgroup, + oom_regex_kubernetes_containerd_systemd_cgroup, + oom_regex_kubernetes_containerd_systemd_cgroup_structured, ] process_name = "" @@ -136,11 +173,18 @@ def capture_oom_events_from_stdin(): break -def get_container_env_as_dict(docker_inspect): +def get_container_env_as_dict( + is_cri_containerd: bool, container_inspect: dict +) -> Dict[str, str]: env_vars = {} - config = docker_inspect.get("Config") + if is_cri_containerd: + config = container_inspect.get("process") + env_key = "env" + else: + config = container_inspect.get("Config") + env_key = "Env" if config is not None: - env = config.get("Env", []) + env = config.get(env_key, []) for i in env: name, _, value = i.partition("=") env_vars[name] = value @@ -209,18 +253,26 @@ def send_sfx_event(service, instance, cluster): counter.count() +def get_containerd_container(container_id: str) -> containers_pb2.Container: + with grpc.insecure_channel("unix:///run/containerd/containerd.sock") as channel: + containersv1 = containers_pb2_grpc.ContainersStub(channel) + return containersv1.Get( + containers_pb2.GetContainerRequest(id=container_id), + metadata=(("containerd-namespace", "k8s.io"),), + ).container + + def main(): if clog is None: print("CLog logger unavailable, exiting.", file=sys.stderr) sys.exit(1) - + args = parse_args() clog.config.configure( scribe_host="169.254.255.254", scribe_port=1463, monk_disable=False, scribe_disable=False, ) - cluster = load_system_paasta_config().get_cluster() client = get_docker_client() for ( @@ -229,11 +281,22 @@ def main(): container_id, process_name, ) in capture_oom_events_from_stdin(): - try: - docker_inspect = client.inspect_container(resource_id=container_id) - except (APIError): - continue - env_vars = get_container_env_as_dict(docker_inspect) + if args.containerd: + # then we're using containerd to inspect containers + try: + container_info = get_containerd_container(container_id) + except grpc.RpcError as e: + print("An error occurred while getting the container:", e) + continue + container_spec_raw = container_info.spec.value.decode("utf-8") + container_inspect = json.loads(container_spec_raw) + else: + # we're using docker to inspect containers + try: + container_inspect = client.inspect_container(resource_id=container_id) + except (APIError): + continue + env_vars = get_container_env_as_dict(args.containerd, container_inspect) service = env_vars.get("PAASTA_SERVICE", "unknown") instance = env_vars.get("PAASTA_INSTANCE", "unknown") mesos_container_id = env_vars.get("MESOS_CONTAINER_NAME", "mesos-null") diff --git a/requirements-minimal.txt b/requirements-minimal.txt index 7808826a6e..482bf6b087 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -8,12 +8,14 @@ botocore bravado >= 10.2.0 certifi choice >= 0.1 +containerd cookiecutter >= 1.4.0 croniter docker dulwich >= 0.17.3 ephemeral-port-reserve >= 1.0.1 graphviz +grpcio gunicorn humanfriendly humanize >= 0.5.1 diff --git a/requirements.txt b/requirements.txt index 4de5d38b12..0223381055 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ certifi==2017.11.5 chardet==3.0.4 choice==0.1 click==6.6 +containerd==1.5.3 cookiecutter==1.4.0 croniter==1.3.4 decorator==4.1.2 @@ -27,6 +28,7 @@ ephemeral-port-reserve==1.1.0 future==0.16.0 google-auth==1.2.0 graphviz==0.8.2 +grpcio==1.62.2 gunicorn==19.8.1 http-parser==0.9.0 humanfriendly==4.18 diff --git a/setup.py b/setup.py index ad397bba21..a11ae18f56 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,6 @@ def get_install_requires(): "paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py", "paasta_tools/kubernetes/bin/paasta_secrets_sync.py", "paasta_tools/log_task_lifecycle_events.py", - "paasta_tools/monitoring/kill_orphaned_docker_containers.py", "paasta_tools/paasta_deploy_tron_jobs", "paasta_tools/paasta_execute_docker_command.py", "paasta_tools/paasta_remote_run.py", diff --git a/tests/test_oom_logger.py b/tests/test_oom_logger.py index 2a3c4489a8..16821a528a 100644 --- a/tests/test_oom_logger.py +++ b/tests/test_oom_logger.py @@ -14,6 +14,7 @@ import json import pytest +from mock import MagicMock from mock import Mock from mock import patch @@ -120,6 +121,39 @@ def sys_stdin_kubernetes_structured_burstable_systemd_cgroup(): ] +@pytest.fixture +def sys_stdin_kubernetes_containerd_systemd_cgroup_structured(): + return [ + "some random line1\n", + "1720128512 dev37-devc [ 7195.442797] python3 invoked oom-killer: " + "gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=999\n", + "some random line2\n", + "1720128512 dev37-devc [ 7195.442928] oom-kill:constraint=CONSTRAINT_MEMCG," + "cpuset=cri-containerd-e216d2f1e6c625d363c71edb6b3cbab5a9e1b447641b61028d0b94b077adf27c.scope," + "mems_allowed=0,oom_memcg=/kubepods.slice/kubepods-burstable.slice/" + "kubepods-burstable-pod08768c36_163c_40e5_8e49_09cf42ff5046.slice," + "task_memcg=/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod08768c36_163c_40e5_8e49_09cf42ff5046.slice/" + "cri-containerd-e216d2f1e6c625d363c71edb6b3cbab5a9e1b447641b61028d0b94b077adf27c.scope,task=python3,pid=485850,uid=33\n", + ] + + +@pytest.fixture +def sys_stdin_kubernetes_containerd_systemd_cgroup(): + return [ + "some random line1\n", + "1720128512 dev208-uswest1adevc [42201.484624] python3 invoked oom-killer: " + "gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=999\n", + "some random line2\n", + "1720128512 dev208-uswest1adevc [42201.484749] oom-kill:constraint=CONSTRAINT_MEMCG," + "nodemask=(null),cpuset=kubepods-burstable-pod73331cbb_9b96_4a62_9702_46a56ad49dd0.slice:" + "cri-containerd:52f9ece9bcf929a08951aa3b4312fbec50890d82b58988f91a0aa9dc96ebc199," + "mems_allowed=0,oom_memcg=/system.slice/kubepods-burstable-pod73331cbb_9b96_4a62_9702_46a56ad49dd0.slice:" + "cri-containerd:52f9ece9bcf929a08951aa3b4312fbec50890d82b58988f91a0aa9dc96ebc199," + "task_memcg=/system.slice/kubepods-burstable-pod73331cbb_9b96_4a62_9702_46a56ad49dd0.slice:" + "cri-containerd:52f9ece9bcf929a08951aa3b4312fbec50890d82b58988f91a0aa9dc96ebc199,task=python3,pid=4190418,uid=33\n", + ] + + @pytest.fixture def sys_stdin_process_name_with_slashes(): return [ @@ -178,6 +212,19 @@ def docker_inspect(): } +@pytest.fixture +def containerd_inspect(): + return { + "process": { + "env": [ + "PAASTA_SERVICE=fake_service", + "PAASTA_INSTANCE=fake_instance", + "PAASTA_RESOURCE_MEM=512", + ] + } + } + + @pytest.fixture def log_line(): return LogLine( @@ -193,6 +240,21 @@ def log_line(): ) +@pytest.fixture +def log_line_containerd(): + return LogLine( + timestamp=1720128512, + hostname="dev37-devc", + container_id="e216d2f1e6c625d363c71edb6b3cbab5a9e1b447641b61028d0b94b077adf27c", + cluster="fake_cluster", + service="fake_service", + instance="fake_instance", + process_name="python3", + mesos_container_id="mesos-null", + mem_limit="512", + ) + + @patch("paasta_tools.oom_logger.sys.stdin", autospec=True) def test_capture_oom_events_from_stdin(mock_sys_stdin, sys_stdin): mock_sys_stdin.readline.side_effect = sys_stdin @@ -246,6 +308,42 @@ def test_capture_oom_events_from_stdin_kubernetes_structured_burstable_systemd_c assert test_output == [(1500316300, "dev37-devc", "e7ba37bd3708", "apache2")] +@patch("paasta_tools.oom_logger.sys.stdin", autospec=True) +def test_capture_oom_events_from_stdin_kubernetes_containerd_systemd_cgroup( + mock_sys_stdin, + sys_stdin_kubernetes_containerd_systemd_cgroup, +): + mock_sys_stdin.readline.side_effect = sys_stdin_kubernetes_containerd_systemd_cgroup + test_output = [a_tuple for a_tuple in capture_oom_events_from_stdin()] + assert test_output == [ + ( + 1720128512, + "dev208-uswest1adevc", + "52f9ece9bcf929a08951aa3b4312fbec50890d82b58988f91a0aa9dc96ebc199", + "python3", + ) + ] + + +@patch("paasta_tools.oom_logger.sys.stdin", autospec=True) +def test_capture_oom_events_from_stdin_kubernetes_containerd_systemd_cgroup_structured( + mock_sys_stdin, + sys_stdin_kubernetes_containerd_systemd_cgroup_structured, +): + mock_sys_stdin.readline.side_effect = ( + sys_stdin_kubernetes_containerd_systemd_cgroup_structured + ) + test_output = [a_tuple for a_tuple in capture_oom_events_from_stdin()] + assert test_output == [ + ( + 1720128512, + "dev37-devc", + "e216d2f1e6c625d363c71edb6b3cbab5a9e1b447641b61028d0b94b077adf27c", + "python3", + ) + ] + + @patch("paasta_tools.oom_logger.sys.stdin", autospec=True) def test_capture_oom_events_from_stdin_with_slashes( mock_sys_stdin, sys_stdin_process_name_with_slashes @@ -351,7 +449,9 @@ def test_send_sfx_event(mock_get_instance_config): @patch("paasta_tools.oom_logger.log_to_clog", autospec=True) @patch("paasta_tools.oom_logger.log_to_paasta", autospec=True) @patch("paasta_tools.oom_logger.get_docker_client", autospec=True) +@patch("paasta_tools.oom_logger.parse_args", autospec=True) def test_main( + mock_parse_args, mock_get_docker_client, mock_log_to_paasta, mock_log_to_clog, @@ -365,6 +465,7 @@ def test_main( ): mock_sys_stdin.readline.side_effect = sys_stdin + mock_parse_args.return_value.containerd = False docker_client = Mock(inspect_container=Mock(return_value=docker_inspect)) mock_get_docker_client.return_value = docker_client mock_load_system_paasta_config.return_value.get_cluster.return_value = ( @@ -377,3 +478,50 @@ def test_main( mock_send_sfx_event.assert_called_once_with( "fake_service", "fake_instance", "fake_cluster" ) + + +@patch("paasta_tools.oom_logger.sys.stdin", autospec=True) +@patch("paasta_tools.oom_logger.clog", autospec=True) +@patch("paasta_tools.oom_logger.send_sfx_event", autospec=True) +@patch("paasta_tools.oom_logger.load_system_paasta_config", autospec=True) +@patch("paasta_tools.oom_logger.log_to_clog", autospec=True) +@patch("paasta_tools.oom_logger.log_to_paasta", autospec=True) +@patch("paasta_tools.oom_logger.parse_args", autospec=True) +@patch("paasta_tools.oom_logger.get_containerd_container", autospec=True) +@patch("paasta_tools.oom_logger.json.loads", autospec=True) +def test_main_containerd( + mock_json_loads, + mock_get_containerd_container, + mock_parse_args, + mock_log_to_paasta, + mock_log_to_clog, + mock_load_system_paasta_config, + mock_send_sfx_event, + mock_clog, + mock_sys_stdin, + sys_stdin_kubernetes_containerd_systemd_cgroup_structured, + log_line_containerd, + containerd_inspect, +): + + mock_sys_stdin.readline.side_effect = ( + sys_stdin_kubernetes_containerd_systemd_cgroup_structured + ) + mock_parse_args.return_value.containerd = True + + mock_container_info = MagicMock() + mock_container_info.spec.value.decode.return_value = str(containerd_inspect) + + mock_get_containerd_container.return_value = mock_container_info + mock_json_loads.return_value = containerd_inspect + + mock_load_system_paasta_config.return_value.get_cluster.return_value = ( + "fake_cluster" + ) + + main() + mock_log_to_paasta.assert_called_once_with(log_line_containerd) + mock_log_to_clog.assert_called_once_with(log_line_containerd) + mock_send_sfx_event.assert_called_once_with( + "fake_service", "fake_instance", "fake_cluster" + ) diff --git a/tox.ini b/tox.ini index c88425b581..e6700a5a2b 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,7 @@ passenv = SSH_AUTH_SOCK PAASTA_ENV DOCKER_HOST CI setenv = TZ = UTC deps = + --only-binary=grpcio --requirement={toxinidir}/requirements.txt --requirement={toxinidir}/requirements-dev.txt --editable={toxinidir} @@ -27,6 +28,7 @@ commands = envdir = .tox/py38-linux/ passenv = PAASTA_TEST_CLUSTER KUBECONFIG PAASTA_SYSTEM_CONFIG_DIR deps = + --only-binary=grpcio --requirement={toxinidir}/requirements.txt --requirement={toxinidir}/requirements-dev.txt --requirement={toxinidir}/yelp_package/extra_requirements_yelp.txt @@ -43,6 +45,7 @@ setenv = PAASTA_SYSTEM_CONFIG_DIR = ./etc_paasta_playground/ PAASTA_API_SOA_DIR = ./soa_config_playground deps = + --only-binary=grpcio --requirement={toxinidir}/requirements.txt --requirement={toxinidir}/requirements-dev.txt --requirement={toxinidir}/yelp_package/extra_requirements_yelp.txt @@ -62,7 +65,10 @@ commands = [testenv:tests-yelpy] envdir = .tox/py38-linux/ +setenv = + PIP_INDEX_URL = http://169.254.255.254:20641/simple/ deps = + --only-binary=grpcio --requirement={toxinidir}/requirements.txt --requirement={toxinidir}/requirements-dev.txt --requirement={toxinidir}/yelp_package/extra_requirements_yelp.txt @@ -177,7 +183,8 @@ commands = [testenv:install-hooks] basepython = python3.8 -deps = pre-commit +deps = + pre-commit commands = pre-commit install -f --install-hooks [flake8] diff --git a/yelp_package/dockerfiles/itest/api/Dockerfile b/yelp_package/dockerfiles/itest/api/Dockerfile index eb0bc51dee..43b75690b9 100644 --- a/yelp_package/dockerfiles/itest/api/Dockerfile +++ b/yelp_package/dockerfiles/itest/api/Dockerfile @@ -13,18 +13,24 @@ # limitations under the License. ARG DOCKER_REGISTRY=docker-dev.yelpcorp.com/ -FROM ${DOCKER_REGISTRY}ubuntu:bionic +FROM ${DOCKER_REGISTRY}ubuntu:jammy -ARG PIP_INDEX_URL=https://pypi.yelpcorp.com/bionic/simple +ARG PIP_INDEX_URL=https://pypi.yelpcorp.com/jammy/simple ENV PIP_INDEX_URL=$PIP_INDEX_URL +RUN apt-get update -yq && \ + apt-get install -yq \ + # needed to add a ppa + software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa + RUN apt-get update > /dev/null && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - software-properties-common \ gcc \ git \ curl \ python3.8-dev \ + python3.8-distutils \ libffi-dev \ libssl-dev \ libyaml-dev \ @@ -36,7 +42,7 @@ WORKDIR /work ADD requirements.txt /work/ RUN virtualenv /venv -ppython3.8 --no-download ENV PATH=/venv/bin:$PATH -RUN pip install -r requirements.txt +RUN pip install --only-binary=grpcio -r requirements.txt COPY yelp_package/dockerfiles/xenial/mesos-slave-secret /etc/ COPY yelp_package/dockerfiles/itest/api/mesos-cli.json yelp_package/dockerfiles/xenial/mesos-slave-secret /nail/etc/ diff --git a/yelp_package/dockerfiles/itest/k8s/Dockerfile b/yelp_package/dockerfiles/itest/k8s/Dockerfile index 84957612cb..d62956ed49 100644 --- a/yelp_package/dockerfiles/itest/k8s/Dockerfile +++ b/yelp_package/dockerfiles/itest/k8s/Dockerfile @@ -13,14 +13,15 @@ # limitations under the License. ARG DOCKER_REGISTRY=docker-dev.yelpcorp.com/ -FROM ${DOCKER_REGISTRY}ubuntu:bionic +FROM ${DOCKER_REGISTRY}ubuntu:jammy -ARG PIP_INDEX_URL=https://pypi.yelpcorp.com/bionic/simple +ARG PIP_INDEX_URL=https://pypi.yelpcorp.com/jammy/simple ENV PIP_INDEX_URL=$PIP_INDEX_URL -# Need Python 3.7 -RUN apt-get update > /dev/null && \ - apt-get install -y --no-install-recommends curl software-properties-common && \ +RUN apt-get update -yq && \ + apt-get install -yq \ + # needed to add a ppa + software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa RUN apt-get update > /dev/null && \ @@ -29,6 +30,7 @@ RUN apt-get update > /dev/null && \ git \ curl \ python3.8-dev \ + python3.8-distutils \ libffi-dev \ libssl-dev \ libyaml-dev \ @@ -45,7 +47,7 @@ WORKDIR /work ADD requirements.txt /work/ RUN virtualenv /venv -ppython3.8 --no-download ENV PATH=/venv/bin:$PATH -RUN pip install -r requirements.txt +RUN pip install --only-binary=grpcio -r requirements.txt ADD yelp_package/dockerfiles/itest/k8s/wait_paasta_api.sh /venv/bin ADD . /work/