From 386e2f628cd139e39dca3aa7c1ecf27df75cde87 Mon Sep 17 00:00:00 2001
From: Srinivasa Bharath Kanta
Date: Tue, 24 Sep 2024 21:06:06 -0400
Subject: [PATCH] Automation of the Bug#2305677

Signed-off-by: Srinivasa Bharath Kanta
---
 ceph/rados/mgr_workflows.py                   |   6 +-
 ...ier-2_rados_test-drain-customer-issue.yaml | 136 ++++++++++++
 .../tier-2_rados_test-osd-rebalance.yaml      |   5 +
 tests/rados/test_node_drain_customer_bug.py   | 206 ++++++++++++++++++
 4 files changed, 351 insertions(+), 2 deletions(-)
 create mode 100644 suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
 create mode 100644 tests/rados/test_node_drain_customer_bug.py

diff --git a/ceph/rados/mgr_workflows.py b/ceph/rados/mgr_workflows.py
index d200d4ba72..c501949460 100644
--- a/ceph/rados/mgr_workflows.py
+++ b/ceph/rados/mgr_workflows.py
@@ -443,7 +443,7 @@ def get_mgr_stats(self):
         mgr_stats = self.rados_obj.run_ceph_command(cmd)
         return mgr_stats
 
-    def set_mgr_fail(self, host):
+    def set_mgr_fail(self, host: str = None):
         """
         Method to fail the mgr host
         Args:
@@ -451,7 +451,9 @@ def set_mgr_fail(self, host):
         Return:
             Return the output of the execution of the command
         """
-        cmd = f"ceph mgr fail {host}"
+        cmd = "ceph mgr fail"
+        if host:
+            cmd += " " + host
         out_put = self.rados_obj.run_ceph_command(cmd)
         time.sleep(10)
         return out_put
diff --git a/suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml b/suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
new file mode 100644
index 0000000000..ea362f870a
--- /dev/null
+++ b/suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
@@ -0,0 +1,136 @@
+# Suite contains tier-2 rados bug verification automation
+#===============================================================================================
+#------------------------------------------------------------------------------------------
+#----- Tier-2 - Bug verification automation ------
+#------------------------------------------------------------------------------------------
+# Conf: conf/reef/rados/11-node-cluster.yaml
+# Bugs:
+#   1. https://bugzilla.redhat.com/show_bug.cgi?id=2305677
+#===============================================================================================
+tests:
+  - test:
+      name: setup install pre-requisites
+      desc: Setup phase to deploy the required pre-requisites for running the tests.
+      module: install_prereq.py
+      abort-on-fail: true
+
+  - test:
+      name: cluster deployment
+      desc: Execute the cluster deployment workflow.
+      module: test_cephadm.py
+      polarion-id:
+      config:
+        verify_cluster_health: true
+        steps:
+          - config:
+              command: bootstrap
+              service: cephadm
+              args:
+                rhcs-version: 7.1
+                release: z0
+                mon-ip: node1
+                orphan-initial-daemons: true
+                skip-monitoring-stack: true
+          - config:
+              command: add_hosts
+              service: host
+              args:
+                attach_ip_address: true
+                labels: apply-all-labels
+          - config:
+              command: apply
+              service: mgr
+              args:
+                placement:
+                  label: mgr
+          - config:
+              command: apply
+              service: mon
+              args:
+                placement:
+                  label: mon
+          - config:
+              command: apply
+              service: osd
+              args:
+                all-available-devices: true
+          - config:
+              command: shell
+              args: # arguments to ceph orch
+                - ceph
+                - fs
+                - volume
+                - create
+                - cephfs
+          - config:
+              command: apply
+              service: rgw
+              pos_args:
+                - rgw.1
+              args:
+                placement:
+                  label: rgw
+          - config:
+              command: apply
+              service: mds
+              base_cmd_args: # arguments to ceph orch
+                verbose: true
+              pos_args:
+                - cephfs # name of the filesystem
+              args:
+                placement:
+                  nodes:
+                    - node2
+                    - node6
+                  limit: 2 # no of daemons
+                  sep: " " # separator to be used for placements
+      destroy-cluster: false
+      abort-on-fail: true
+
+  - test:
+      name: Configure client admin
+      desc: Configures client admin node on cluster
+      module: test_client.py
+      polarion-id:
+      config:
+        command: add
+        id: client.1 # client Id (.)
+        node: node7 # client node
+        install_packages:
+          - ceph-common
+        copy_admin_keyring: true # Copy admin keyring to node
+        caps: # authorize client capabilities
+          mon: "allow *"
+          osd: "allow *"
+          mds: "allow *"
+          mgr: "allow *"
+
+  - test:
+      name: Enable logging to file
+      module: rados_prep.py
+      config:
+        log_to_file: true
+      desc: Change config options to enable logging to file
+  - test:
+      name: Verification of Ceph mgr crash
+      module: test_node_drain_customer_bug.py
+      polarion-id: CEPH-83595932
+      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
+  - test:
+      name: Upgrade cluster to latest 7.x ceph version
+      desc: Upgrade cluster to latest version
+      module: test_cephadm_upgrade.py
+      polarion-id: CEPH-83573791,CEPH-83573790
+      config:
+        command: start
+        service: upgrade
+        base_cmd_args:
+          verbose: true
+        verify_cluster_health: true
+      destroy-cluster: false
+      abort-on-fail: true
+  - test:
+      name: Verification of Ceph mgr crash
+      module: test_node_drain_customer_bug.py
+      polarion-id: CEPH-83595932
+      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
diff --git a/suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml b/suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml
index d2423b7623..e961166a16 100644
--- a/suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml
+++ b/suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml
@@ -173,3 +173,8 @@ tests:
 #              m: 2
 #              plugin: jerasure
 #          disable_pg_autoscale: true
+  - test:
+      name: Verification of Ceph mgr crash
+      module: test_node_drain_customer_bug.py
+      polarion-id: CEPH-83595932
+      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
diff --git a/tests/rados/test_node_drain_customer_bug.py b/tests/rados/test_node_drain_customer_bug.py
new file mode 100644
index 0000000000..1bf21e3c4f
--- /dev/null
+++ b/tests/rados/test_node_drain_customer_bug.py
@@ -0,0 +1,206 @@
+"""
+The file contains the method to check the customer issue -
+    CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
+"""
+
+import random
+import re
+import time
+from threading import Thread
+
+from ceph.ceph_admin import CephAdmin
+from ceph.rados.core_workflows import RadosOrchestrator
+from ceph.rados.mgr_workflows import MgrWorkflows
+from ceph.rados.serviceability_workflows import ServiceabilityMethods
+from tests.rados.stretch_cluster import wait_for_clean_pg_sets
+from utility.log import Log
+
+log = Log(__name__)
+
+
+def run(ceph_cluster, **kw):
+    """
+    # CEPH-83593996
+    Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
+    1. Configure a cluster that has more than four OSD nodes
+    2. Select an OSD node and drain the node
+    3. In parallel, execute the ceph mgr fail command
+    4. Check whether an exception occurs on the cluster
+    5. Perform the following workaround steps -
+       5.1 ceph config-key rm mgr/cephadm/osd_remove_queue
+       5.2 ceph mgr fail
+    6. If the exception does not occur, check the mgr logs for Traceback messages
+    """
+    log.info(run.__doc__)
+    config = kw["config"]
+    cephadm = CephAdmin(cluster=ceph_cluster, **config)
+    rados_obj = RadosOrchestrator(node=cephadm)
+    mgr_obj = MgrWorkflows(node=cephadm)
+    installer = ceph_cluster.get_nodes(role="installer")[0]
+    service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
+    ceph_nodes = kw.get("ceph_nodes")
+
+    mgr_daemon = Thread(
+        target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
+    )
+    osd_list = []
+
+    for node in ceph_nodes:
+        cmd_host_chk = f"ceph orch host ls --host_pattern {node.hostname}"
+        out = rados_obj.run_ceph_command(cmd=cmd_host_chk)
+        if not out:
+            log.info(f"The {node.hostname} is not in the cluster")
+            continue
+        if node.role == "osd":
+            node_osds = rados_obj.collect_osd_daemon_ids(node)
+            osd_list = osd_list + node_osds
+    osd_weight_chk = check_set_reweight(rados_obj, osd_list)
+    if not osd_weight_chk:
+        log.error(
+            "The osd weights are zero for a few nodes and the OSD weights are not unique. Set the weights "
+            "manually and re-run the tests"
+        )
+        return 1
+
+    log_lines = [
+        "mgr load Traceback",
+        "TypeError: __init__() got an unexpected keyword argument 'original_weight'",
+    ]
+    ceph_version = rados_obj.run_ceph_command(cmd="ceph version")
+    log.info(f"Current version on the cluster : {ceph_version}")
+    match_str = re.match(
+        r"ceph version (\d+)\.(\d+)\.(\d+)-(\d+)", ceph_version["version"]
+    )
+    major, minor, patch, build = match_str.groups()
+    bug_exists = False
+    exceptional_flag = False
+    if int(major) < 18:
+        bug_exists = True
+    elif int(major) == 18 and int(minor) < 2:
+        bug_exists = True
+    elif int(major) == 18 and int(minor) == 2 and int(patch) < 1:
+        bug_exists = True
+    elif int(major) == 18 and int(minor) == 2 and int(patch) == 1 and int(build) <= 194:
+        bug_exists = True
+
+    osd_hosts = rados_obj.get_osd_hosts()
+    log.info(f"The osd node list is - {osd_hosts}")
+    drain_host = random.choice(osd_hosts)
+    init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
+    mgr_dump = rados_obj.run_ceph_command(cmd="ceph mgr dump", client_exec=True)
+    active_mgr = mgr_dump["active_name"]
+    try:
+        mgr_daemon.start()
+        service_obj.remove_custom_host(host_node_name=drain_host)
+    except Exception as e:
+        log.error(f"Failed with exception: {e.__doc__}")
+        log.exception(e)
+        exceptional_flag = True
+    finally:
+        log.info("=======In the bug reproduce finally block===========")
+        time.sleep(300)
+        if bug_exists:
+            log.info("Performing the workaround on the cluster")
+            cmd_remove_key = "ceph config-key rm mgr/cephadm/osd_remove_queue"
+            rados_obj.run_ceph_command(cmd=cmd_remove_key, client_exec=True)
+            mgr_obj.set_mgr_fail()
+            log.info(
+                f"This is an existing issue. The current version of ceph is {ceph_version}. "
+                f"The bug exists in ceph versions up to 18.2.1-194"
+            )
+            log.info("For more details refer to Bug#2305677")
+            return 0
+        elif not bug_exists and exceptional_flag:
+            log.error(
+                f"The verification failed. The current version of ceph is {ceph_version}. "
+                f"The bug is fixed in ceph-18.2.1-235. Find more details in Bug#2305677"
+            )
+            return 1
+    end_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
+    if not verify_mgr_traceback_log(
+        rados_obj=rados_obj,
+        start_time=init_time,
+        end_time=end_time,
+        mgr_type=active_mgr,
+        lines=log_lines,
+    ):
+        log.error("Traceback messages are noticed in logs")
+        return 1
+    log.info("Verification completed and no traceback messages were noticed")
+    return 0
+
+
+def verify_mgr_traceback_log(
+    rados_obj: RadosOrchestrator, start_time, end_time, mgr_type, lines
+) -> bool:
+    """
+    Retrieve the mgr log using the journalctl command and search for the provided lines
+    Args:
+        rados_obj: Rados object
+        start_time: time to start reading the journalctl logs - format ('2022-07-20 09:40:10')
+        end_time: time to stop reading the journalctl logs - format ('2022-07-20 10:58:49')
+        mgr_type: active mgr daemon id whose logs are read
+        lines: Log lines to search in the journalctl logs
+    Returns:  True -> if the lines do not exist in the journalctl logs
+              False -> if the lines exist in the journalctl logs
+    """
+
+    log.info("Checking log lines")
+    log_lines = rados_obj.get_journalctl_log(
+        start_time=start_time, end_time=end_time, daemon_type="mgr", daemon_id=mgr_type
+    )
+    log.debug(f"Journalctl logs are : {log_lines}")
+    for line in lines:
+        if line in log_lines:
+            log.error(f" Found the {line} in the mgr logs")
+            return False
+    return True
+
+
+def background_mgr_task(mgr_object):
+    """
+    Method is used to execute the mgr fail command in parallel with other commands
+    Args:
+        mgr_object: mgr object
+    Returns: None
+    """
+    for _ in range(10):
+        mgr_object.set_mgr_fail()
+        time.sleep(2)
+
+
+def check_set_reweight(rados_obj, osd_list):
+    """
+    Method is used to check the OSD weights and assign a weight if the OSD weight value is 0
+    Args:
+        rados_obj: Rados object
+        osd_list: osd list. For example [0,1,2,3,4,5]
+
+    Returns:
+        True -> If none of the OSD weights are 0, or the zero-weight OSDs were reweighted
+        False -> The cluster has more than one distinct non-zero weight value
+
+    """
+    osd_zero_weight_list = []
+    osd_weights = []
+    for osd_id in osd_list:
+        selected_osd_details = rados_obj.get_osd_details(osd_id=osd_id)
+        if selected_osd_details["crush_weight"] == 0:
+            osd_zero_weight_list = osd_zero_weight_list + [osd_id]
+        osd_weights = osd_weights + [selected_osd_details["crush_weight"]]
+    if not osd_zero_weight_list:
+        return True
+    unique_weight_list = list(set(osd_weights) - {0})
+    if len(unique_weight_list) == 1:
+        osd_weight = unique_weight_list[0]
+    else:
+        log.info(
+            f"The osd weights have more than one non-zero value in the cluster. The weights are - {osd_weights}"
+        )
+        return False
+    if len(osd_zero_weight_list) != 0:
+        for osd_id in osd_zero_weight_list:
+            rados_obj.reweight_crush_items(name=f"osd.{osd_id}", weight=osd_weight)
+            time.sleep(30)  # blind sleep to let stats get updated after crush re-weight
+            assert wait_for_clean_pg_sets(rados_obj, timeout=900)
+    return True
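
Usage sketch (illustrative only, not part of the diff): with the reworked MgrWorkflows.set_mgr_fail signature above, calling the helper with no argument issues a bare "ceph mgr fail" against whichever mgr is active, while passing a daemon name fails that specific mgr. The snippet assumes mgr_obj and rados_obj are MgrWorkflows and RadosOrchestrator instances built exactly as in run() in the new test module.

    # Fail the currently active mgr - the command sent is plain "ceph mgr fail".
    mgr_obj.set_mgr_fail()

    # Fail a named mgr daemon - the command sent becomes "ceph mgr fail <daemon>".
    active_mgr = rados_obj.run_ceph_command(cmd="ceph mgr dump", client_exec=True)["active_name"]
    mgr_obj.set_mgr_fail(host=active_mgr)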
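
The chained if/elif block that sets bug_exists in run() encodes "Bug#2305677 is expected to reproduce on builds at or below 18.2.1-194". A minimal, self-contained sketch of the same gate using tuple comparison, assuming the version string parses with the regex used in the patch (the helper name bug_expected and the sample build strings are illustrative only):

    import re

    def bug_expected(version_str: str) -> bool:
        """Return True when the reported ceph version is <= 18.2.1-194."""
        match = re.match(r"ceph version (\d+)\.(\d+)\.(\d+)-(\d+)", version_str)
        if not match:
            raise ValueError(f"unrecognised version string: {version_str}")
        major, minor, patch, build = (int(x) for x in match.groups())
        # Tuple comparison is lexicographic, mirroring the chained if/elif in run().
        return (major, minor, patch, build) <= (18, 2, 1, 194)

    assert bug_expected("ceph version 18.2.1-194.el9cp (hash) reef (stable)") is True
    assert bug_expected("ceph version 18.2.1-235.el9cp (hash) reef (stable)") is False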