
Automation of Bug#2305677
Signed-off-by: Srinivasa Bharath Kanta <skanta@redhat.com>
SrinivasaBharath committed Sep 27, 2024
1 parent d9d451a commit 386e2f6
Showing 4 changed files with 351 additions and 2 deletions.
6 changes: 4 additions & 2 deletions ceph/rados/mgr_workflows.py
@@ -443,15 +443,17 @@ def get_mgr_stats(self):
         mgr_stats = self.rados_obj.run_ceph_command(cmd)
         return mgr_stats

-    def set_mgr_fail(self, host):
+    def set_mgr_fail(self, host: str = None):
         """
         Method to fail the mgr daemon
         Args:
             host : mgr host name (optional); when omitted, the active mgr is failed
         Return:
             Return the output of the execution of the command
         """
-        cmd = f"ceph mgr fail {host}"
+        cmd = "ceph mgr fail"
+        if host:
+            cmd += " " + host
         out_put = self.rados_obj.run_ceph_command(cmd)
         time.sleep(10)
         return out_put
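For quick reference, below is a minimal, self-contained sketch of the new command construction. `build_mgr_fail_cmd` is a hypothetical helper used only for illustration; the real method additionally runs the command through `rados_obj.run_ceph_command` and sleeps for 10 seconds before returning.

    def build_mgr_fail_cmd(host: str = None) -> str:
        # With no host, the bare "ceph mgr fail" is issued, which fails the
        # currently active mgr; with a host, the daemon name is appended.
        cmd = "ceph mgr fail"
        if host:
            cmd += " " + host
        return cmd

    assert build_mgr_fail_cmd() == "ceph mgr fail"
    assert build_mgr_fail_cmd("mgr-host-1") == "ceph mgr fail mgr-host-1"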
136 changes: 136 additions & 0 deletions suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
@@ -0,0 +1,136 @@
# Suite contains tier-2 rados bug verification automation
#===============================================================================================
#------------------------------------------------------------------------------------------
#----- Tier-2 - Bug verification automation ------
#------------------------------------------------------------------------------------------
# Conf: conf/reef/rados/11-node-cluster.yaml
# Bugs:
# 1. https://bugzilla.redhat.com/show_bug.cgi?id=2305677
#===============================================================================================
tests:
  - test:
      name: setup install pre-requisites
      desc: Setup phase to deploy the required pre-requisites for running the tests.
      module: install_prereq.py
      abort-on-fail: true

  - test:
      name: cluster deployment
      desc: Execute the cluster deployment workflow.
      module: test_cephadm.py
      polarion-id:
      config:
        verify_cluster_health: true
        steps:
          - config:
              command: bootstrap
              service: cephadm
              args:
                rhcs-version: 7.1
                release: z0
                mon-ip: node1
                orphan-initial-daemons: true
                skip-monitoring-stack: true
          - config:
              command: add_hosts
              service: host
              args:
                attach_ip_address: true
                labels: apply-all-labels
          - config:
              command: apply
              service: mgr
              args:
                placement:
                  label: mgr
          - config:
              command: apply
              service: mon
              args:
                placement:
                  label: mon
          - config:
              command: apply
              service: osd
              args:
                all-available-devices: true
          - config:
              command: shell
              args:                     # arguments to ceph orch
                - ceph
                - fs
                - volume
                - create
                - cephfs
          - config:
              command: apply
              service: rgw
              pos_args:
                - rgw.1
              args:
                placement:
                  label: rgw
          - config:
              command: apply
              service: mds
              base_cmd_args:            # arguments to ceph orch
                verbose: true
              pos_args:
                - cephfs                # name of the filesystem
              args:
                placement:
                  nodes:
                    - node2
                    - node6
                  limit: 2              # no of daemons
                  sep: " "              # separator to be used for placements
      destroy-cluster: false
      abort-on-fail: true

  - test:
      name: Configure client admin
      desc: Configures client admin node on cluster
      module: test_client.py
      polarion-id:
      config:
        command: add
        id: client.1                    # client Id (<type>.<Id>)
        node: node7                     # client node
        install_packages:
          - ceph-common
        copy_admin_keyring: true        # Copy admin keyring to node
        caps:                           # authorize client capabilities
          mon: "allow *"
          osd: "allow *"
          mds: "allow *"
          mgr: "allow *"

  - test:
      name: Enable logging to file
      module: rados_prep.py
      config:
        log_to_file: true
      desc: Change config options to enable logging to file
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
  - test:
      name: Upgrade cluster to latest 7.x ceph version
      desc: Upgrade cluster to latest version
      module: test_cephadm_upgrade.py
      polarion-id: CEPH-83573791,CEPH-83573790
      config:
        command: start
        service: upgrade
        base_cmd_args:
          verbose: true
        verify_cluster_health: true
      destroy-cluster: false
      abort-on-fail: true
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
5 changes: 5 additions & 0 deletions suites/squid/rados/tier-2_rados_test-osd-rebalance.yaml
@@ -173,3 +173,8 @@ tests:
# m: 2
# plugin: jerasure
# disable_pg_autoscale: true
  - test:
      name: Verification of Ceph mgr crash
      module: test_node_drain_customer_bug.py
      polarion-id: CEPH-83595932
      desc: Ceph mgr crashed after a mgr failover with the message mgr operator
206 changes: 206 additions & 0 deletions tests/rados/test_node_drain_customer_bug.py
@@ -0,0 +1,206 @@
"""
The file contain the method to check the customer issue-
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
"""

import random
import re
import time
from threading import Thread

from ceph.ceph_admin import CephAdmin
from ceph.rados.core_workflows import RadosOrchestrator
from ceph.rados.mgr_workflows import MgrWorkflows
from ceph.rados.serviceability_workflows import ServiceabilityMethods
from tests.rados.stretch_cluster import wait_for_clean_pg_sets
from utility.log import Log

log = Log(__name__)


def run(ceph_cluster, **kw):
    """
    # CEPH-83593996
    Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
    1. Configure a cluster that has more than four OSD nodes
    2. Select an OSD node and drain the node
    3. Execute the ceph mgr fail command in parallel
    4. Check whether an exception occurs on the cluster
    5. Perform the following workaround steps -
       5.1 ceph config-key rm mgr/cephadm/osd_remove_queue
       5.2 ceph mgr fail
    6. If no exception occurred, check the mgr logs for Traceback messages
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mgr_obj = MgrWorkflows(node=cephadm)
    installer = ceph_cluster.get_nodes(role="installer")[0]
    service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
    ceph_nodes = kw.get("ceph_nodes")

    mgr_daemon = Thread(
        target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
    )
    osd_list = []

    for node in ceph_nodes:
        cmd_host_chk = f"ceph orch host ls --host_pattern {node.hostname}"
        out = rados_obj.run_ceph_command(cmd=cmd_host_chk)
        if not out:
            log.info(f"The {node.hostname} is not in the cluster")
            continue
        if node.role == "osd":
            node_osds = rados_obj.collect_osd_daemon_ids(node)
            osd_list = osd_list + node_osds
    osd_weight_chk = check_set_reweight(rados_obj, osd_list)
    if not osd_weight_chk:
        log.error(
            "The OSD weights are zero for a few OSDs and the remaining OSD weights are not unique. "
            "Set the weights manually and re-run the test"
        )
        return 1

    log_lines = [
        "mgr load Traceback",
        "TypeError: __init__() got an unexpected keyword argument 'original_weight'",
    ]
    ceph_version = rados_obj.run_ceph_command(cmd="ceph version")
    log.info(f"Current version on the cluster : {ceph_version}")
    match_str = re.match(
        r"ceph version (\d+)\.(\d+)\.(\d+)-(\d+)", ceph_version["version"]
    )
    major, minor, patch, build = match_str.groups()
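    # Version gate: builds up to and including ceph 18.2.1-194 are expected to
    # carry Bug#2305677, so the workaround branch below is exercised; newer
    # builds should complete the node drain without the 'original_weight' traceback.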
    bug_exists = False
    exceptional_flag = False
    if int(major) < 18:
        bug_exists = True
    elif int(major) == 18 and int(minor) < 2:
        bug_exists = True
    elif int(major) == 18 and int(minor) == 2 and int(patch) < 1:
        bug_exists = True
    elif int(major) == 18 and int(minor) == 2 and int(patch) == 1 and int(build) <= 194:
        bug_exists = True

    osd_hosts = rados_obj.get_osd_hosts()
    log.info(f"The OSD host list is - {osd_hosts}")
    drain_host = random.choice(osd_hosts)
    init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    mgr_dump = rados_obj.run_ceph_command(cmd="ceph mgr dump", client_exec=True)
    active_mgr = mgr_dump["active_name"]
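    # Start the background mgr-fail loop, then drain the chosen OSD host; on
    # affected builds the drain raises the exception handled below.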
    try:
        mgr_daemon.start()
        service_obj.remove_custom_host(host_node_name=drain_host)
    except Exception as e:
        log.error(f"Failed with exception: {e.__doc__}")
        log.exception(e)
        exceptional_flag = True
    finally:
        log.info("=======In the bug reproduce finally block===========")
        time.sleep(300)
        if bug_exists:
            log.info("Performing the workaround on the cluster")
            cmd_remove_key = "ceph config-key rm mgr/cephadm/osd_remove_queue"
            rados_obj.run_ceph_command(cmd=cmd_remove_key, client_exec=True)
            mgr_obj.set_mgr_fail()
            log.info(
                f"This is an existing issue. The current version of ceph is {ceph_version}. "
                f"The bug exists in ceph versions up to 18.2.1-194"
            )
            log.info("For more details refer to Bug#2305677")
            return 0
        elif not bug_exists and exceptional_flag:
            log.error(
                f"The verification failed. The current version of ceph is {ceph_version}. "
                f"The bug is fixed in ceph-18.2.1-235. Find more details in Bug#2305677"
            )
            return 1
    end_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    if not verify_mgr_traceback_log(
        rados_obj=rados_obj,
        start_time=init_time,
        end_time=end_time,
        mgr_type=active_mgr,
        lines=log_lines,
    ):
        log.error("Traceback messages were noticed in the mgr logs")
        return 1
    log.info("Verification completed; no traceback messages were noticed")
    return 0


def verify_mgr_traceback_log(
    rados_obj: RadosOrchestrator, start_time, end_time, mgr_type, lines
) -> bool:
    """
    Retrieve the mgr log using the journalctl command and check for traceback messages
    Args:
        rados_obj: Rados object
        mgr_type: active mgr daemon id whose journalctl logs are read
        start_time: time to start reading the journalctl logs - format ('2022-07-20 09:40:10')
        end_time: time to stop reading the journalctl logs - format ('2022-07-20 10:58:49')
        lines: Log lines to search for in the journalctl logs
    Returns: True -> if the lines do not exist in the journalctl logs
             False -> if the lines exist in the journalctl logs
    """

    log.info("Checking log lines")
    log_lines = rados_obj.get_journalctl_log(
        start_time=start_time, end_time=end_time, daemon_type="mgr", daemon_id=mgr_type
    )
    log.debug(f"Journalctl logs are : {log_lines}")
    for line in lines:
        if line in log_lines:
            log.error(f"Found the line '{line}' in the mgr logs")
            return False
    return True


def background_mgr_task(mgr_object):
    """
    Method to execute the mgr fail command repeatedly, in parallel with other commands
    Args:
        mgr_object: mgr object
    Returns: None
    """
    for _ in range(10):
        mgr_object.set_mgr_fail()
        time.sleep(2)


def check_set_reweight(rados_obj, osd_list):
    """
    Method to check the OSD weights and assign a weight if an OSD weight value is 0
    Args:
        rados_obj: Rados object
        osd_list: osd list. For example [0,1,2,3,4,5]
    Returns:
        True -> If none of the OSD weights are 0 or the zero weights were reassigned
        False -> The cluster has more than one non-zero OSD weight value
    """
    osd_zero_weight_list = []
    osd_weights = []
    for osd_id in osd_list:
        selected_osd_details = rados_obj.get_osd_details(osd_id=osd_id)
        if selected_osd_details["crush_weight"] == 0:
            osd_zero_weight_list = osd_zero_weight_list + [osd_id]
        osd_weights = osd_weights + [selected_osd_details["crush_weight"]]
    if not osd_zero_weight_list:
        return True
    unique_weight_list = list(set(osd_weights) - {0})
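    # The reassignment below is only unambiguous when all non-zero weights share
    # a single value; otherwise the method bails out and manual intervention is needed.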
    if len(unique_weight_list) == 1:
        osd_weight = unique_weight_list[0]
    else:
        log.info(
            f"More than one non-zero weight is assigned to the OSDs in the cluster. The weights are - {osd_weights}"
        )
        return False
    if len(osd_zero_weight_list) != 0:
        for osd_id in osd_zero_weight_list:
            rados_obj.reweight_crush_items(name=f"osd.{osd_id}", weight=osd_weight)
        time.sleep(30)  # blind sleep to let stats get updated after crush re-weight
        assert wait_for_clean_pg_sets(rados_obj, timeout=900)
    return True
