-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Srinivasa Bharath Kanta <skanta@redhat.com>
- Loading branch information
1 parent
d9d451a
commit 386e2f6
Showing
4 changed files
with
351 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
136 changes: 136 additions & 0 deletions
136
suites/reef/rados/tier-2_rados_test-drain-customer-issue.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
# Suite contains tier-2 rados bug verification automation | ||
#=============================================================================================== | ||
#------------------------------------------------------------------------------------------ | ||
#----- Tier-2 - Bug verification automation ------ | ||
#------------------------------------------------------------------------------------------ | ||
# Conf: conf/reef/rados/11-node-cluster.yaml | ||
# Bugs: | ||
# 1. https://bugzilla.redhat.com/show_bug.cgi?id=2305677 | ||
#=============================================================================================== | ||
tests: | ||
- test: | ||
name: setup install pre-requisistes | ||
desc: Setup phase to deploy the required pre-requisites for running the tests. | ||
module: install_prereq.py | ||
abort-on-fail: true | ||
|
||
- test: | ||
name: cluster deployment | ||
desc: Execute the cluster deployment workflow. | ||
module: test_cephadm.py | ||
polarion-id: | ||
config: | ||
verify_cluster_health: true | ||
steps: | ||
- config: | ||
command: bootstrap | ||
service: cephadm | ||
args: | ||
rhcs-version: 7.1 | ||
release: z0 | ||
mon-ip: node1 | ||
orphan-initial-daemons: true | ||
skip-monitoring-stack: true | ||
- config: | ||
command: add_hosts | ||
service: host | ||
args: | ||
attach_ip_address: true | ||
labels: apply-all-labels | ||
- config: | ||
command: apply | ||
service: mgr | ||
args: | ||
placement: | ||
label: mgr | ||
- config: | ||
command: apply | ||
service: mon | ||
args: | ||
placement: | ||
label: mon | ||
- config: | ||
command: apply | ||
service: osd | ||
args: | ||
all-available-devices: true | ||
- config: | ||
command: shell | ||
args: # arguments to ceph orch | ||
- ceph | ||
- fs | ||
- volume | ||
- create | ||
- cephfs | ||
- config: | ||
command: apply | ||
service: rgw | ||
pos_args: | ||
- rgw.1 | ||
args: | ||
placement: | ||
label: rgw | ||
- config: | ||
command: apply | ||
service: mds | ||
base_cmd_args: # arguments to ceph orch | ||
verbose: true | ||
pos_args: | ||
- cephfs # name of the filesystem | ||
args: | ||
placement: | ||
nodes: | ||
- node2 | ||
- node6 | ||
limit: 2 # no of daemons | ||
sep: " " # separator to be used for placements | ||
destroy-cluster: false | ||
abort-on-fail: true | ||
|
||
- test: | ||
name: Configure client admin | ||
desc: Configures client admin node on cluster | ||
module: test_client.py | ||
polarion-id: | ||
config: | ||
command: add | ||
id: client.1 # client Id (<type>.<Id>) | ||
node: node7 # client node | ||
install_packages: | ||
- ceph-common | ||
copy_admin_keyring: true # Copy admin keyring to node | ||
caps: # authorize client capabilities | ||
mon: "allow *" | ||
osd: "allow *" | ||
mds: "allow *" | ||
mgr: "allow *" | ||
|
||
- test: | ||
name: Enable logging to file | ||
module: rados_prep.py | ||
config: | ||
log_to_file: true | ||
desc: Change config options to enable logging to file | ||
- test: | ||
name: Verification of Ceph mgr crash | ||
module: test_node_drain_customer_bug.py | ||
polarion-id: CEPH-83595932 | ||
desc: Ceph mgr crashed after a mgr failover with the message mgr operator | ||
- test: | ||
name: Upgrade cluster to latest 7.x ceph version | ||
desc: Upgrade cluster to latest version | ||
module: test_cephadm_upgrade.py | ||
polarion-id: CEPH-83573791,CEPH-83573790 | ||
config: | ||
command: start | ||
service: upgrade | ||
base_cmd_args: | ||
verbose: true | ||
verify_cluster_health: true | ||
destroy-cluster: false | ||
abort-on-fail: true | ||
- test: | ||
name: Verification of Ceph mgr crash | ||
module: test_node_drain_customer_bug.py | ||
polarion-id: CEPH-83595932 | ||
desc: Ceph mgr crashed after a mgr failover with the message mgr operator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
""" | ||
This file contains the methods to verify the customer issue tracked in Bug#2305677 (CEPH-83593996) - | ||
Check that draining an OSD host while the mgr is failed over in parallel does not crash the mgr or leave traceback messages in the mgr logs | ||
""" | ||
|
||
import random | ||
import re | ||
import time | ||
from threading import Thread | ||
|
||
from ceph.ceph_admin import CephAdmin | ||
from ceph.rados.core_workflows import RadosOrchestrator | ||
from ceph.rados.mgr_workflows import MgrWorkflows | ||
from ceph.rados.serviceability_workflows import ServiceabilityMethods | ||
from tests.rados.stretch_cluster import wait_for_clean_pg_sets | ||
from utility.log import Log | ||
|
||
log = Log(__name__) | ||
|
||
|
||
def _is_bug_affected_version(version_string):
    """
    Decide whether a ceph version string falls in the range affected by Bug#2305677
    Args:
        version_string: text that starts with "ceph version <major>.<minor>.<patch>-<build>"
    Returns:
        True  -> the version is less than or equal to 18.2.1-194
        False -> the version is newer than 18.2.1-194
        None  -> the version string could not be parsed
    """
    parsed = re.match(r"ceph version (\d+)\.(\d+)\.(\d+)-(\d+)", version_string)
    if parsed is None:
        return None
    # Tuples compare element by element, which matches major/minor/patch/build ordering
    return tuple(int(part) for part in parsed.groups()) <= (18, 2, 1, 194)


def run(ceph_cluster, **kw):
    """
    # CEPH-83593996
    Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
    1. Configure a cluster that has more than four OSD nodes
    2. Select an OSD node and drain the node
    3. In parallel, execute the ceph mgr fail command
    4. Check whether an exception occurs on the cluster
    5. On affected versions perform the following workaround steps-
        5.1 ceph config-key rm mgr/cephadm/osd_remove_queue
        5.2 ceph mgr fail
    6. If no exception occurred then check the mgr logs for the Traceback messages
    Args:
        ceph_cluster: cluster object under test
        kw: test arguments, "config" is mandatory and "ceph_nodes" provides the node list
    Returns:
        0 -> verification passed (or the known issue was hit on an affected version)
        1 -> verification failed
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mgr_obj = MgrWorkflows(node=cephadm)
    installer = ceph_cluster.get_nodes(role="installer")[0]
    service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
    ceph_nodes = kw.get("ceph_nodes")

    # Thread that keeps failing the active mgr while the host drain is in progress
    mgr_daemon = Thread(
        target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
    )
    osd_list = []

    for node in ceph_nodes:
        cmd_host_chk = f"ceph orch host ls --host_pattern {node.hostname}"
        out = rados_obj.run_ceph_command(cmd=cmd_host_chk)
        if not out:
            log.info(f"The {node.hostname} is not in the cluster")
            continue
        if node.role == "osd":
            osd_list.extend(rados_obj.collect_osd_daemon_ids(node))
    if not check_set_reweight(rados_obj, osd_list):
        log.error(
            "The osd weights are zero for ew nodes and weights of the OSD are not unique.Set the weights "
            "manually and re-run the tests"
        )
        return 1

    log_lines = [
        "mgr load Traceback",
        "TypeError: __init__() got an unexpected keyword argument 'original_weight'",
    ]
    ceph_version = rados_obj.run_ceph_command(cmd="ceph version")
    log.info(f"Current version on the cluster : {ceph_version}")
    bug_exists = _is_bug_affected_version(ceph_version["version"])
    if bug_exists is None:
        # Without a parsed version the expected outcome of the test cannot be decided
        log.error(f"Unable to parse the ceph version from : {ceph_version}")
        return 1
    exceptional_flag = False

    osd_hosts = rados_obj.get_osd_hosts()
    log.info(f"The osd node slist are-{osd_hosts}")
    drain_host = random.choice(osd_hosts)
    init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    mgr_dump = rados_obj.run_ceph_command(cmd="ceph mgr dump", client_exec=True)
    active_mgr = mgr_dump["active_name"]
    try:
        mgr_daemon.start()
        service_obj.remove_custom_host(host_node_name=drain_host)
    except Exception as e:
        log.error(f"Failed with exception: {e.__doc__}")
        log.exception(e)
        exceptional_flag = True

    # The outcome is evaluated after the try/except instead of returning from a
    # "finally" clause, so an interrupt raised during the drain is not swallowed.
    log.info("=======In the bug reproduce finally block===========")
    time.sleep(300)
    if bug_exists:
        log.info("Performing the workaround on the cluster")
        cmd_remove_key = "ceph config-key rm mgr/cephadm/osd_remove_queue"
        rados_obj.run_ceph_command(cmd=cmd_remove_key, client_exec=True)
        mgr_obj.set_mgr_fail()
        log.info(
            f"This is an existing issue.The current version of ceph is {ceph_version}.The bug exists at "
            f"< 18.2.1-194 ceph version"
        )
        log.info("For more details refer the Bug#2305677")
        return 0
    if exceptional_flag:
        log.error(
            f"The verification failed.The current version of ceph is {ceph_version}."
            f"The bug fixed ceph-18.2.1-235.Find more details at-Bug#2305677 "
        )
        return 1

    end_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
    if not verify_mgr_traceback_log(
        rados_obj=rados_obj,
        start_time=init_time,
        end_time=end_time,
        mgr_type=active_mgr,
        lines=log_lines,
    ):
        log.error("Traceback messages are noticed in logs")
        return 1
    log.info("Verification completed and not noticed any traceback messages")
    return 0
|
||
|
||
def verify_mgr_traceback_log(
    rados_obj: RadosOrchestrator, start_time, end_time, mgr_type, lines
) -> bool:
    """
    Search the journalctl logs of a mgr daemon for the given log lines
    Args:
        rados_obj: Rados object used to fetch the journalctl logs
        start_time: time to start reading the journalctl logs - format ('2022-07-20 09:40:10')
        end_time: time to stop reading the journalctl logs - format ('2022-07-20 10:58:49')
        mgr_type: id of the mgr daemon whose logs are read
        lines: Log lines to search in the journalctl logs
    Returns: True -> none of the lines exist in the journalctl logs
             False -> at least one of the lines exists in the journalctl logs
    """
    log.info("Checking log lines")
    journal_output = rados_obj.get_journalctl_log(
        daemon_type="mgr",
        daemon_id=mgr_type,
        start_time=start_time,
        end_time=end_time,
    )
    log.debug(f"Journalctl logs are : {journal_output}")
    for unwanted in lines:
        if unwanted not in journal_output:
            continue
        # The first hit is enough to fail the verification
        log.error(f" Found the {unwanted} in the mgr logs")
        return False
    return True
|
||
|
||
def background_mgr_task(mgr_object, iterations=10, interval=2):
    """
    Method is used to execute the mgr fail command repeatedly, so that it can run in parallel
    with other commands (for example from a background thread)
    Args:
        mgr_object: mgr object that provides the set_mgr_fail() method
        iterations: number of times the mgr fail command is executed (default 10)
        interval: seconds to wait after every mgr fail command (default 2)
    Returns: None
    """
    for _ in range(iterations):
        mgr_object.set_mgr_fail()
        time.sleep(interval)
|
||
|
||
def check_set_reweight(rados_obj, osd_list):
    """
    Method is used to check the OSD crush weights and to assign a weight to every OSD whose
    crush weight is 0
    Args:
        rados_obj: Rados object
        osd_list: osd lists. For example [0,1,2,3,4,5]
    Returns:
        True-> If none of the OSD weights are 0 or the zero weights are reassigned
        False-> Zero-weight OSDs exist but the remaining OSDs do not share exactly one
                non-zero weight value, so the weight to assign cannot be decided
    Raises:
        AssertionError: the PGs did not reach a clean state after a re-weight
    """
    osd_weights = [
        rados_obj.get_osd_details(osd_id=osd_id)["crush_weight"] for osd_id in osd_list
    ]
    osd_zero_weight_list = [
        osd_id for osd_id, weight in zip(osd_list, osd_weights) if weight == 0
    ]
    # Nothing to re-weight; the list is never None, so test for emptiness instead
    if not osd_zero_weight_list:
        return True

    unique_weight_list = list(set(osd_weights) - {0})
    if len(unique_weight_list) != 1:
        log.info(
            f"The osd weights are assigned more than 1 value in the cluster. The weights are-{osd_weights}"
        )
        return False

    osd_weight = unique_weight_list[0]
    for osd_id in osd_zero_weight_list:
        rados_obj.reweight_crush_items(name=f"osd.{osd_id}", weight=osd_weight)
        time.sleep(30)  # blind sleep to let stats get updated crush re-weight
        # Explicit raise instead of a bare assert, so the check survives "python -O"
        if not wait_for_clean_pg_sets(rados_obj, timeout=900):
            raise AssertionError(
                f"PGs did not reach clean state after re-weighting osd.{osd_id}"
            )
    return True