
Automation of Bug#2305677
Signed-off-by: Srinivasa Bharath Kanta <skanta@redhat.com>
SrinivasaBharath committed Sep 27, 2024
1 parent d9d451a commit 386e2f6
Showing 4 changed files with 351 additions and 2 deletions.
6 changes: 4 additions & 2 deletions ceph/rados/mgr_workflows.py
@@ -443,15 +443,17 @@ def get_mgr_stats(self):
         mgr_stats = self.rados_obj.run_ceph_command(cmd)
         return mgr_stats

-    def set_mgr_fail(self, host):
+    def set_mgr_fail(self, host: str = None):
         """
         Method to fail the mgr daemon
         Args:
             host : mgr host name (optional); when omitted, the active mgr is failed
         Return:
             Return the output of the execution of the command
         """
-        cmd = f"ceph mgr fail {host}"
+        cmd = "ceph mgr fail"
+        if host:
+            cmd += " " + host
         out_put = self.rados_obj.run_ceph_command(cmd)
         time.sleep(10)
         return out_put
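For quick reference, below is a minimal, self-contained sketch of the new command construction. `build_mgr_fail_cmd` is a hypothetical helper used only for illustration; the real method additionally runs the command through `rados_obj.run_ceph_command` and sleeps for 10 seconds before returning.

    def build_mgr_fail_cmd(host: str = None) -> str:
        # With no host, the bare "ceph mgr fail" is issued, which fails the
        # currently active mgr; with a host, the daemon name is appended.
        cmd = "ceph mgr fail"
        if host:
            cmd += " " + host
        return cmd

    assert build_mgr_fail_cmd() == "ceph mgr fail"
    assert build_mgr_fail_cmd("mgr-host-1") == "ceph mgr fail mgr-host-1"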
136 changes: 136 additions & 0 deletions suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
@@ -0,0 +1,136 @@
# Suite contains tier-2 rados bug verification automation
#===============================================================================================
#------------------------------------------------------------------------------------------
#----- Tier-2 - Bug verification automation ------
#------------------------------------------------------------------------------------------
# Conf: conf/reef/rados/11-node-cluster.yaml
# Bugs:
# 1. https://bugzilla.redhat.com/show_bug.cgi?id=2305677
#===============================================================================================
tests:
  - test:
      name: setup install pre-requisites
      desc: Setup phase to deploy the required pre-requisites for running the tests.
      module: install_prereq.py
      abort-on-fail: true

  - test:
      name: cluster deployment
      desc: Execute the cluster deployment workflow.
      module: test_cephadm.py
      polarion-id:
      config:
        verify_cluster_health: true
        steps:
          - config:
              command: bootstrap
              service: cephadm
              args:
                rhcs-version: 7.1
                release: z0
                mon-ip: node1
                orphan-initial-daemons: true
                skip-monitoring-stack: true
          - config:
              command: add_hosts
              service: host
              args:
                attach_ip_address: true
                labels: apply-all-labels
          - config:
              command: apply
              service: mgr
              args:
                placement:
                  label: mgr
          - config:
              command: apply
              service: mon
              args:
                placement:
                  label: mon
          - config:
              command: apply
              service: osd
              args:
                all-available-devices: true
          - config:
              command: shell
              args:                     # arguments to ceph orch
                - ceph
                - fs
                - volume
                - create
                - cephfs
          - config:
              command: apply
              service: rgw
              pos_args:
                - rgw.1
              args:
                placement:
                  label: rgw
          - config:
              command: apply
              service: mds
              base_cmd_args:            # arguments to ceph orch
                verbose: true
              pos_args:
                - cephfs                # name of the filesystem
              args:
                placement:
                  nodes:
                    - node2
                    - node6
                  limit: 2              # no of daemons
                  sep: " "              # separator to be used for placements
      destroy-cluster: false
      abort-on-fail: true

  - test:
      name: Configure client admin
      desc: Configures client admin node on cluster
      module: test_client.py
      polarion-id:
      config:
        command: add
        id: client.1                    # client Id (<type>.<Id>)
        node: node7                     # client node
        install_packages:
          - ceph-common
        copy_admin_keyring: true        # Copy admin keyring to node
        caps:                           # authorize client capabilities
          mon: "allow *"
          osd: "allow *"
          mds: "allow *"
          mgr: "allow *"

  - test:
      name: Enable logging to file
      module: rados_prep.py
      config:
        log_to_file: true
      desc: Change config options to enable logging to file
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
  - test:
      name: Upgrade cluster to latest 7.x ceph version
      desc: Upgrade cluster to latest version
      module: test_cephadm_upgrade.py
      polarion-id: CEPH-83573791,CEPH-83573790
      config:
        command: start
        service: upgrade
        base_cmd_args:
          verbose: true
        verify_cluster_health: true
      destroy-cluster: false
      abort-on-fail: true
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
5 changes: 5 additions & 0 deletions suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml
@@ -173,3 +173,8 @@ tests:
# m: 2
# plugin: jerasure
# disable_pg_autoscale: true
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
206 changes: 206 additions & 0 deletions tests/rados/test_node_drain_customer_bug.py
@@ -0,0 +1,206 @@
"""
The file contain the method to check the customer issue-
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
"""

import random
import re
import time
from threading import Thread

from ceph.ceph_admin import CephAdmin
from ceph.rados.core_workflows import RadosOrchestrator
from ceph.rados.mgr_workflows import MgrWorkflows
from ceph.rados.serviceability_workflows import ServiceabilityMethods
from tests.rados.stretch_cluster import wait_for_clean_pg_sets
from utility.log import Log

log = Log(__name__)


def run(ceph_cluster, **kw):
    """
    # CEPH-83593996
    Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
    1. Configure a cluster that has more than four OSD nodes
    2. Select an OSD node and drain the node
    3. Execute the ceph mgr fail command in parallel
    4. Check whether an exception occurs on the cluster
    5. Perform the following workaround steps -
       5.1 ceph config-key rm mgr/cephadm/osd_remove_queue
       5.2 ceph mgr fail
    6. If no exception occurred, check the mgr logs for Traceback messages
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mgr_obj = MgrWorkflows(node=cephadm)
    installer = ceph_cluster.get_nodes(role="installer")[0]
    service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
    ceph_nodes = kw.get("ceph_nodes")

    mgr_daemon = Thread(
        target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
    )
    osd_list = []

    for node in ceph_nodes:
        cmd_host_chk = f"ceph orch host ls --host_pattern {node.hostname}"
        out = rados_obj.run_ceph_command(cmd=cmd_host_chk)
        if not out:
            log.info(f"The {node.hostname} is not in the cluster")
            continue
        if node.role == "osd":
            node_osds = rados_obj.collect_osd_daemon_ids(node)
            osd_list = osd_list + node_osds
    osd_weight_chk = check_set_reweight(rados_obj, osd_list)
    if not osd_weight_chk:
        log.error(
            "The OSD weights are zero for a few OSDs and the remaining OSD weights are not unique. "
            "Set the weights manually and re-run the test"
        )
        return 1

    log_lines = [
        "mgr load Traceback",
        "TypeError: __init__() got an unexpected keyword argument 'original_weight'",
    ]
    ceph_version = rados_obj.run_ceph_command(cmd="ceph version")
    log.info(f"Current version on the cluster : {ceph_version}")
    match_str = re.match(
        r"ceph version (\d+)\.(\d+)\.(\d+)-(\d+)", ceph_version["version"]
    )
    major, minor, patch, build = match_str.groups()
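    # Version gate: builds up to and including ceph 18.2.1-194 are expected to
    # carry Bug#2305677, so the workaround branch below is exercised; newer
    # builds should complete the node drain without the 'original_weight' traceback.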
    bug_exists = False
    exceptional_flag = False
    if int(major) < 18:
        bug_exists = True
    elif int(major) == 18 and int(minor) < 2:
        bug_exists = True
    elif int(major) == 18 and int(minor) == 2 and int(patch) < 1:
        bug_exists = True
    elif int(major) == 18 and int(minor) == 2 and int(patch) == 1 and int(build) <= 194:
        bug_exists = True

    osd_hosts = rados_obj.get_osd_hosts()
    log.info(f"The OSD host list is - {osd_hosts}")
    drain_host = random.choice(osd_hosts)
    init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    mgr_dump = rados_obj.run_ceph_command(cmd="ceph mgr dump", client_exec=True)
    active_mgr = mgr_dump["active_name"]
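    # Start the background mgr-fail loop, then drain the chosen OSD host; on
    # affected builds the drain raises the exception handled below.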
    try:
        mgr_daemon.start()
        service_obj.remove_custom_host(host_node_name=drain_host)
    except Exception as e:
        log.error(f"Failed with exception: {e.__doc__}")
        log.exception(e)
        exceptional_flag = True
    finally:
        log.info("=======In the bug reproduce finally block===========")
        time.sleep(300)
        if bug_exists:
            log.info("Performing the workaround on the cluster")
            cmd_remove_key = "ceph config-key rm mgr/cephadm/osd_remove_queue"
            rados_obj.run_ceph_command(cmd=cmd_remove_key, client_exec=True)
            mgr_obj.set_mgr_fail()
            log.info(
                f"This is an existing issue. The current version of ceph is {ceph_version}. "
                f"The bug exists in ceph versions up to 18.2.1-194"
            )
            log.info("For more details refer to Bug#2305677")
            return 0
        elif not bug_exists and exceptional_flag:
            log.error(
                f"The verification failed. The current version of ceph is {ceph_version}. "
                f"The bug is fixed in ceph-18.2.1-235. Find more details in Bug#2305677"
            )
            return 1
    end_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    if not verify_mgr_traceback_log(
        rados_obj=rados_obj,
        start_time=init_time,
        end_time=end_time,
        mgr_type=active_mgr,
        lines=log_lines,
    ):
        log.error("Traceback messages were noticed in the mgr logs")
        return 1
    log.info("Verification completed; no traceback messages were noticed")
    return 0


def verify_mgr_traceback_log(
    rados_obj: RadosOrchestrator, start_time, end_time, mgr_type, lines
) -> bool:
    """
    Retrieve the mgr log using the journalctl command and check for traceback messages
    Args:
        rados_obj: Rados object
        mgr_type: active mgr daemon id whose journalctl logs are read
        start_time: time to start reading the journalctl logs - format ('2022-07-20 09:40:10')
        end_time: time to stop reading the journalctl logs - format ('2022-07-20 10:58:49')
        lines: Log lines to search for in the journalctl logs
    Returns: True -> if the lines do not exist in the journalctl logs
             False -> if the lines exist in the journalctl logs
    """

    log.info("Checking log lines")
    log_lines = rados_obj.get_journalctl_log(
        start_time=start_time, end_time=end_time, daemon_type="mgr", daemon_id=mgr_type
    )
    log.debug(f"Journalctl logs are : {log_lines}")
    for line in lines:
        if line in log_lines:
            log.error(f"Found the line '{line}' in the mgr logs")
            return False
    return True


def background_mgr_task(mgr_object):
    """
    Method to execute the mgr fail command repeatedly, in parallel with other commands
    Args:
        mgr_object: mgr object
    Returns: None
    """
    for _ in range(10):
        mgr_object.set_mgr_fail()
        time.sleep(2)


def check_set_reweight(rados_obj, osd_list):
    """
    Method to check the OSD weights and assign a weight if an OSD weight value is 0
    Args:
        rados_obj: Rados object
        osd_list: osd list. For example [0,1,2,3,4,5]
    Returns:
        True -> If none of the OSD weights are 0 or the zero weights were reassigned
        False -> The cluster has more than one non-zero OSD weight value
    """
    osd_zero_weight_list = []
    osd_weights = []
    for osd_id in osd_list:
        selected_osd_details = rados_obj.get_osd_details(osd_id=osd_id)
        if selected_osd_details["crush_weight"] == 0:
            osd_zero_weight_list = osd_zero_weight_list + [osd_id]
        osd_weights = osd_weights + [selected_osd_details["crush_weight"]]
    if not osd_zero_weight_list:
        return True
    unique_weight_list = list(set(osd_weights) - {0})
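    # The reassignment below is only unambiguous when all non-zero weights share
    # a single value; otherwise the method bails out and manual intervention is needed.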
    if len(unique_weight_list) == 1:
        osd_weight = unique_weight_list[0]
    else:
        log.info(
            f"More than one non-zero weight is assigned to the OSDs in the cluster. The weights are - {osd_weights}"
        )
        return False
    if len(osd_zero_weight_list) != 0:
        for osd_id in osd_zero_weight_list:
            rados_obj.reweight_crush_items(name=f"osd.{osd_id}", weight=osd_weight)
        time.sleep(30)  # blind sleep to let stats get updated after crush re-weight
        assert wait_for_clean_pg_sets(rados_obj, timeout=900)
    return True
