Commit

feat: exclude evicted pod log from support bundle (#292)

Elsie4ever authored Aug 5, 2024
1 parent 8cd8913 commit 64c9d20
Showing 16 changed files with 211 additions and 66 deletions.
3 changes: 3 additions & 0 deletions azext_edge/edge/_help.py
@@ -59,6 +59,9 @@ def load_iotops_help():
- {COMPAT_CLUSTER_CONFIG_APIS.as_str()}
- {COMPAT_DATAFLOW_APIS.as_str()}
Note: logs from evicted pods will not be captured, as they are inaccessible. For details
on why a pod was evicted, please refer to the related pod and node files.
examples:
- name: Basic usage with default options. This form of the command will auto detect IoT Operations APIs and build a suitable bundle
capturing the last 24 hours of container logs. The bundle will be produced in the current working directory.
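For readers following the help note above: eviction details stay available on the pod object itself, so the bundle's pod YAML (and the node files) still explain why a pod was removed. A minimal sketch of reading that status directly with the kubernetes Python client — assuming a configured kubeconfig; the pod and namespace names are placeholders, not values used by the extension:

from kubernetes import client, config

# Illustrative sketch only: inspect an evicted pod's status directly.
# "my-evicted-pod" and "azure-iot-operations" are placeholder values.
config.load_kube_config()
v1 = client.CoreV1Api()
pod = v1.read_namespaced_pod(name="my-evicted-pod", namespace="azure-iot-operations")
status = pod.status
if status and status.phase == "Failed" and (status.reason or "").lower() == "evicted":
    # status.message usually records the eviction cause, e.g. node memory pressure
    print(status.reason, status.message)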
29 changes: 18 additions & 11 deletions azext_edge/edge/providers/support/base.py
@@ -8,7 +8,7 @@
from typing import List, Dict, Optional, Iterable, Tuple, TypeVar, Union
from functools import partial

from azext_edge.edge.common import BundleResourceKind
from azext_edge.edge.common import BundleResourceKind, PodState
from knack.log import get_logger
from kubernetes.client.exceptions import ApiException
from kubernetes.client.models import (
@@ -34,6 +34,7 @@
generic = client.ApiClient()

DAY_IN_SECONDS: int = 60 * 60 * 24
POD_STATUS_FAILED_EVICTED: str = "evicted"

K8sRuntimeResources = TypeVar(
"K8sRuntimeResources",
@@ -137,17 +138,23 @@ def process_v1_pods(
init_pod_containers: List[V1Container] = pod_spec.init_containers
pod_containers.extend(init_pod_containers)

processed.extend(
_capture_pod_container_logs(
directory_path=directory_path,
pod_containers=pod_containers,
pod_name=pod_name,
pod_namespace=pod_namespace,
v1_api=v1_api,
since_seconds=since_seconds,
capture_previous_logs=capture_previous_logs,
# exclude evicted pods from log capture since they are not accessible
pod_status = pod.status
if pod_status and pod_status.phase == PodState.failed.value and\
str(pod_status.reason).lower() == POD_STATUS_FAILED_EVICTED:
logger.info(f"Pod {pod_name} in namespace {pod_namespace} is evicted. Skipping log capture.")
else:
processed.extend(
_capture_pod_container_logs(
directory_path=directory_path,
pod_containers=pod_containers,
pod_name=pod_name,
pod_namespace=pod_namespace,
v1_api=v1_api,
since_seconds=since_seconds,
capture_previous_logs=capture_previous_logs,
)
)
)

if include_metrics:
try:
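In short, the change above wraps the existing _capture_pod_container_logs call in a status guard. Restated as a standalone predicate with the literals inlined (mirroring PodState.failed.value and POD_STATUS_FAILED_EVICTED), the check amounts to the following sketch:

from kubernetes.client.models import V1Pod

def is_evicted(pod: V1Pod) -> bool:
    # "Failed"/"evicted" stand in for PodState.failed.value and POD_STATUS_FAILED_EVICTED.
    status = pod.status
    return bool(
        status
        and status.phase == "Failed"
        and str(status.reason).lower() == "evicted"
    )

# process_v1_pods only calls _capture_pod_container_logs when is_evicted(pod) is False,
# so read_namespaced_pod_log is never attempted for an evicted pod.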
4 changes: 3 additions & 1 deletion azext_edge/tests/edge/checks/int/helpers.py
@@ -74,7 +74,9 @@ def assert_eval_core_service_runtime(
find_extra_or_missing_names(
resource_type="pods",
result_names=results,
expected_names=kubectl_pods.keys()
expected_names=kubectl_pods.keys(),
ignore_extras=True,
ignore_missing=True
)

for pod in kubectl_pods:
22 changes: 18 additions & 4 deletions azext_edge/tests/edge/support/conftest.py
@@ -16,7 +16,7 @@
def add_pod_to_mocked_pods(
mocked_client, expected_pod_map, mock_names: List[str] = None, mock_init_containers: bool = False
):
from kubernetes.client.models import V1PodList, V1Pod, V1PodSpec, V1ObjectMeta, V1Container
from kubernetes.client.models import V1PodList, V1Pod, V1PodSpec, V1PodStatus, V1ObjectMeta, V1Container

current_pods = mocked_client.CoreV1Api().list_pod_for_all_namespaces.return_value
pod_list = current_pods.items
@@ -28,7 +28,8 @@ def add_pod_to_mocked_pods(
for pod_name in mock_names:
container_name = generate_random_string()
spec = V1PodSpec(containers=[V1Container(name=container_name)])
pod = V1Pod(metadata=V1ObjectMeta(namespace=namespace, name=pod_name), spec=spec)
status = V1PodStatus(phase="Running")
pod = V1Pod(metadata=V1ObjectMeta(namespace=namespace, name=pod_name), spec=spec, status=status)

if mock_init_containers:
pod.spec.init_containers = [V1Container(name="mock-init-container")]
@@ -163,7 +164,7 @@ def _handle_resource_call(*args, **kwargs):
# TODO - @digimaun make this more useful / flexible configuration.
@pytest.fixture
def mocked_list_pods(mocked_client):
from kubernetes.client.models import V1PodList, V1Pod, V1PodSpec, V1ObjectMeta, V1Container
from kubernetes.client.models import V1PodList, V1Pod, V1PodSpec, V1PodStatus, V1ObjectMeta, V1Container

expected_pod_map = {}
namespaces = [generate_random_string()]
@@ -175,10 +176,23 @@ def mocked_list_pods(mocked_client):
for pod_name in pod_names:
container_name = generate_random_string()
spec = V1PodSpec(containers=[V1Container(name=container_name)])
pod = V1Pod(metadata=V1ObjectMeta(namespace=namespace, name=pod_name), spec=spec)
status = V1PodStatus(phase="Running")
pod = V1Pod(metadata=V1ObjectMeta(namespace=namespace, name=pod_name), spec=spec, status=status)
pods.append(pod)
expected_pod_map[namespace][pod_name] = {container_name: mock_log}

# add evicted pod for testing
evicted_pod_name = "evicted_pod"
evicted_pod_spec = V1PodSpec(containers=[V1Container(name=generate_random_string())])
evicted_pod_status = V1PodStatus(phase="Failed", reason="Evicted")
evicted_pod = V1Pod(
metadata=V1ObjectMeta(namespace=namespace, name=evicted_pod_name),
spec=evicted_pod_spec,
status=evicted_pod_status
)
pods.append(evicted_pod)
expected_pod_map[namespace][evicted_pod_name] = {evicted_pod.spec.containers[0].name: mock_log}

pods_list = V1PodList(items=pods)
mocked_client.CoreV1Api().list_pod_for_all_namespaces.return_value = pods_list
mocked_client.CoreV1Api().read_namespaced_pod_log.return_value = mock_log
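With the additions above, the mocked_list_pods fixture now seeds one Failed/Evicted pod alongside the running ones. A hypothetical check (not part of this commit) of what the fixture registers on the mocked client:

def test_fixture_seeds_one_evicted_pod(mocked_client, mocked_list_pods):
    # Hypothetical sketch, not part of this commit: confirm the fixture registers
    # exactly one evicted pod on the mocked CoreV1Api pod list.
    pods = mocked_client.CoreV1Api().list_pod_for_all_namespaces.return_value.items
    evicted = [
        p for p in pods
        if p.status and p.status.phase == "Failed" and p.status.reason == "Evicted"
    ]
    assert [p.metadata.name for p in evicted] == ["evicted_pod"]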
33 changes: 29 additions & 4 deletions azext_edge/tests/edge/support/create_bundle_int/helpers.py
@@ -5,7 +5,7 @@
# ----------------------------------------------------------------------------------------------

from knack.log import get_logger
from typing import Dict, List, NamedTuple, Optional, Union
from typing import Dict, List, NamedTuple, Optional, Tuple, Union
from os import path
from zipfile import ZipFile
import pytest
@@ -137,6 +137,7 @@ def check_workload_resource_files(
file_objs: Dict[str, List[Dict[str, str]]],
expected_workload_types: List[str],
prefixes: Union[str, List[str]],
bundle_path: str,
optional_workload_types: Optional[List[str]] = None,
):
if "pod" in expected_workload_types:
@@ -169,7 +170,14 @@
converted_file[file["descriptor"]] = False

expected_pods = get_kubectl_workload_items(prefixes, service_type="pod")
find_extra_or_missing_names("pod", file_pods.keys(), expected_pods.keys())
check_log_for_evicted_pods(bundle_path, file_objs.get("pod", []))
find_extra_or_missing_names(
resource_type="pod",
result_names=file_pods.keys(),
expected_names=expected_pods.keys(),
ignore_extras=True,
ignore_missing=True
)

for name, files in file_pods.items():
for extension, value in files.items():
@@ -193,6 +201,23 @@ def _check_non_pod_files(workload_types: List[str], required: bool = False):
_check_non_pod_files(optional_workload_types, required=False)


def check_log_for_evicted_pods(bundle_dir: str, file_pods: List[Dict[str, str]]):
# open the file using bundle_dir and check for evicted pods
name_extension_pair = list(set([(file["name"], file["extension"]) for file in file_pods]))
# TODO: upcoming fix will get file content earlier
with ZipFile(bundle_dir, 'r') as zip:
file_names = zip.namelist()
for name, extension in name_extension_pair:
if extension == "log":
# find file path in file_names that has name and extension
file_path = next((file for file in file_names if file.endswith(name + ".yaml")), None)
if not file_path:
continue
with zip.open(file_path) as pod_content:
log_content = pod_content.read().decode("utf-8")
assert "Evicted" not in log_content, f"Evicted pod {name} log found in bundle."


def get_file_map(
walk_result: Dict[str, Dict[str, List[str]]],
ops_service: str,
@@ -286,7 +311,7 @@ def _get_namespace_determinating_files(
def run_bundle_command(
command: str,
tracked_files: List[str],
) -> Dict[str, Dict[str, List[str]]]:
) -> Tuple[Dict[str, Dict[str, List[str]]], str]:
result = run(command)
if not result:
pytest.skip("No bundle was created.")
@@ -324,7 +349,7 @@ def run_bundle_command(
# lastly add in the file (with the correct seperators)
walk_result[built_path]["files"].append(file_name)

return walk_result
return walk_result, result["bundlePath"]


def split_name(name: str) -> List[str]:
16 changes: 13 additions & 3 deletions azext_edge/tests/edge/support/create_bundle_int/test_akri_int.py
@@ -7,7 +7,12 @@
from knack.log import get_logger
from azext_edge.edge.common import OpsServiceType
from azext_edge.edge.providers.edge_api import AKRI_API_V0
from .helpers import check_custom_resource_files, check_workload_resource_files, get_file_map, run_bundle_command
from .helpers import (
check_custom_resource_files,
check_workload_resource_files,
get_file_map,
run_bundle_command
)

logger = get_logger(__name__)

@@ -16,7 +21,7 @@ def test_create_bundle_akri(init_setup, tracked_files):
"""Test for ensuring file names and content. ONLY CHECKS AKRI."""
ops_service = OpsServiceType.akri.value
command = f"az iot ops support create-bundle --ops-service {ops_service}"
walk_result = run_bundle_command(command=command, tracked_files=tracked_files)
walk_result, bundle_path = run_bundle_command(command=command, tracked_files=tracked_files)
file_map = get_file_map(walk_result, ops_service)["aio"]

check_custom_resource_files(
@@ -28,4 +33,9 @@
expected_types = set(expected_workload_types).union(AKRI_API_V0.kinds)
assert set(file_map.keys()).issubset(expected_types)

check_workload_resource_files(file_map, expected_workload_types, "aio-akri")
check_workload_resource_files(
file_objs=file_map,
expected_workload_types=expected_workload_types,
prefixes="aio-akri",
bundle_path=bundle_path
)
19 changes: 14 additions & 5 deletions azext_edge/tests/edge/support/create_bundle_int/test_auto_int.py
@@ -41,10 +41,10 @@ def test_create_bundle(init_setup, bundle_dir, mq_traces, ops_service, tracked_f
tracked_files.append(bundle_dir)
except FileExistsError:
pass
walk_result = run_bundle_command(command=command.format(ops_service), tracked_files=tracked_files)
walk_result, _ = run_bundle_command(command=command.format(ops_service), tracked_files=tracked_files)
# generate second bundle as close as possible
if ops_service != OpsServiceType.auto.value:
auto_walk_result = run_bundle_command(
auto_walk_result, _ = run_bundle_command(
command=command.format(OpsServiceType.auto.value),
tracked_files=tracked_files
)
@@ -87,7 +87,11 @@ def test_create_bundle(init_setup, bundle_dir, mq_traces, ops_service, tracked_f
auto_files = sorted(auto_walk_result[directory]["files"])
ser_files = sorted(walk_result[directory]["files"])
find_extra_or_missing_names(
f"auto bundle files not found in {ops_service} bundle", auto_files, ser_files, ignore_extras=True
resource_type=f"auto bundle files not found in {ops_service} bundle",
result_names=auto_files,
expected_names=ser_files,
ignore_extras=True,
ignore_missing=True
)


@@ -96,12 +100,17 @@ def test_create_bundle_otel(init_setup, tracked_files):
# dir for unpacked files
ops_service = OpsServiceType.auto.value
command = f"az iot ops support create-bundle --ops-service {ops_service}"
walk_result = run_bundle_command(command=command, tracked_files=tracked_files)
walk_result, bundle_path = run_bundle_command(command=command, tracked_files=tracked_files)
file_map = get_file_map(walk_result, "otel")["aio"]

expected_workload_types = ["deployment", "pod", "replicaset", "service"]
assert set(file_map.keys()).issubset(set(expected_workload_types))
check_workload_resource_files(file_map, expected_workload_types, "aio-otel")
check_workload_resource_files(
file_objs=file_map,
expected_workload_types=expected_workload_types,
prefixes="aio-otel",
bundle_path=bundle_path
)


def _get_expected_services(
@@ -7,7 +7,12 @@
from knack.log import get_logger
from azext_edge.edge.common import OpsServiceType
from azext_edge.edge.providers.edge_api import CLUSTER_CONFIG_API_V1
from .helpers import check_custom_resource_files, check_workload_resource_files, get_file_map, run_bundle_command
from .helpers import (
check_custom_resource_files,
check_workload_resource_files,
get_file_map,
run_bundle_command
)

logger = get_logger(__name__)

@@ -17,7 +22,7 @@ def test_create_bundle_billing(init_setup, tracked_files):
ops_service = OpsServiceType.billing.value
ops_service = "billing"
command = f"az iot ops support create-bundle --ops-service {ops_service}"
walk_result = run_bundle_command(command=command, tracked_files=tracked_files)
walk_result, bundle_path = run_bundle_command(command=command, tracked_files=tracked_files)
file_map = get_file_map(walk_result, ops_service)

# AIO
@@ -29,7 +34,12 @@
expected_workload_types = ["cronjob", "job", "pod"]
expected_types = set(expected_workload_types).union(CLUSTER_CONFIG_API_V1.kinds)
assert set(file_map["aio"].keys()).issubset(set(expected_types))
check_workload_resource_files(file_map["aio"], expected_workload_types, ["aio-usage"])
check_workload_resource_files(
file_objs=file_map["aio"],
expected_workload_types=expected_workload_types,
prefixes=["aio-usage"],
bundle_path=bundle_path
)

# USAGE
check_custom_resource_files(
@@ -40,4 +50,9 @@
expected_workload_types = ["deployment", "pod", "replicaset", "service"]
expected_types = set(expected_workload_types).union(CLUSTER_CONFIG_API_V1.kinds)
assert set(file_map["usage"].keys()).issubset(expected_types)
check_workload_resource_files(file_map["usage"], expected_workload_types, ["billing-operator"])
check_workload_resource_files(
file_objs=file_map["usage"],
expected_workload_types=expected_workload_types,
prefixes=["billing-operator"],
bundle_path=bundle_path
)
@@ -7,7 +7,12 @@
from knack.log import get_logger
from azext_edge.edge.common import OpsServiceType
from azext_edge.edge.providers.edge_api import DATAFLOW_API_V1B1
from .helpers import check_custom_resource_files, check_workload_resource_files, get_file_map, run_bundle_command
from .helpers import (
check_custom_resource_files,
check_workload_resource_files,
get_file_map,
run_bundle_command
)

logger = get_logger(__name__)

@@ -16,7 +21,7 @@ def test_create_bundle_dataflow(init_setup, tracked_files):
"""Test for ensuring file names and content. ONLY CHECKS dataflow."""
ops_service = OpsServiceType.dataflow.value
command = f"az iot ops support create-bundle --ops-service {ops_service}"
walk_result = run_bundle_command(command=command, tracked_files=tracked_files)
walk_result, bundle_path = run_bundle_command(command=command, tracked_files=tracked_files)
file_map = get_file_map(walk_result, ops_service)["aio"]

check_custom_resource_files(
Expand All @@ -31,4 +36,5 @@ def test_create_bundle_dataflow(init_setup, tracked_files):
file_objs=file_map,
expected_workload_types=expected_workload_types,
prefixes=["aio-dataflow"],
bundle_path=bundle_path
)
@@ -8,7 +8,12 @@
from knack.log import get_logger
from azext_edge.edge.common import OpsServiceType
from azext_edge.edge.providers.edge_api import DEVICEREGISTRY_API_V1
from .helpers import check_custom_resource_files, BASE_ZIP_PATH, get_file_map, run_bundle_command
from .helpers import (
check_custom_resource_files,
BASE_ZIP_PATH,
get_file_map,
run_bundle_command
)

logger = get_logger(__name__)

@@ -17,7 +22,7 @@ def test_create_bundle_deviceregistry(init_setup, tracked_files):
"""Test for ensuring file names and content. ONLY CHECKS deviceregistry."""
ops_service = OpsServiceType.deviceregistry.value
command = f"az iot ops support create-bundle --ops-service {ops_service}"
walk_result = run_bundle_command(command=command, tracked_files=tracked_files)
walk_result, _ = run_bundle_command(command=command, tracked_files=tracked_files)
if not walk_result[BASE_ZIP_PATH]["folders"]:
pytest.skip(f"No bundles created for {ops_service}.")
file_map = get_file_map(walk_result, ops_service)["aio"]