From 00c604e4bfaf961f82853c2710aa8726c37ee97d Mon Sep 17 00:00:00 2001
From: Pawan Dhiran
Date: Wed, 9 Oct 2024 15:12:58 +0530
Subject: [PATCH] Addition of tests for 3AZ Cluster Scenarios - Netsplit b/w DCs

Signed-off-by: Pawan Dhiran
---
 ceph/rados/core_workflows.py                  | 235 ++++++++++++++
 conf/squid/rados/3AZ-cluster.yaml             |  42 ++-
 .../rados/tier-3_rados_test-3-AZ-Cluster.yaml |  38 +--
 .../test_stretch_n-az_netsplit_scenarios.py   | 286 ++++++++++++++++++
 .../rados/test_stretch_netsplit_scenarios.py  |   2 +-
 5 files changed, 568 insertions(+), 35 deletions(-)
 create mode 100644 tests/rados/test_stretch_n-az_netsplit_scenarios.py

diff --git a/ceph/rados/core_workflows.py b/ceph/rados/core_workflows.py
index 28be98a171..0dce5f84f1 100644
--- a/ceph/rados/core_workflows.py
+++ b/ceph/rados/core_workflows.py
@@ -11,8 +11,10 @@
 import datetime
 import json
+import math
 import re
 import time
+from collections import namedtuple
 
 from ceph.ceph_admin import CephAdmin
 from ceph.parallel import parallel
@@ -2271,6 +2273,239 @@ def run_pool_sanity_check(self):
         log.info("Completed check on the cluster. Pass!")
         return True
 
+    def create_n_az_stretch_pool(
+        self,
+        pool_name: str,
+        rule_name: str,
+        rule_id: int,
+        peer_bucket_barrier: str = "datacenter",
+        num_sites: int = 3,
+        num_copies_per_site: int = 2,
+        total_buckets: int = 3,
+        req_peering_buckets: int = 2,
+    ) -> bool:
+        """Method to create a replicated pool and enable stretch mode on the pool
+
+        Note: Most of the params have a default value. When created with defaults, the pool is created
+        for a 3AZ cluster, with 2 copies per site.
+        Args:
+            pool_name: name of the pool
+            rule_id: rule ID
+            rule_name: rule name
+            peer_bucket_barrier: CRUSH level at which failures are accepted
+            num_sites: number of "peer_bucket_barrier" buckets across which the data should be stored.
+                eg: if data has to be stored across 3 DCs, num_sites is 3
+            num_copies_per_site: number of copies of data to be stored in each site
+            total_buckets: total number of "peer_bucket_barrier" buckets present on the cluster.
+                note: In most cases, total_buckets = num_sites. This changes when the customer does not
+                want each site to hold a data copy
+            req_peering_buckets: number of "peer_bucket_barrier" buckets required for a successful peering process
+        Returns:
+            bool. Pass -> True, Fail -> False
+        """
+
+        # Creating test pool to check the effect of Netsplit scenarios on the Pool IO
+        if not self.create_pool(pool_name=pool_name):
+            log.error(f"Failed to create pool : {pool_name}")
+            return False
+
+        rules = f"""id {rule_id}
+type replicated
+step take default
+step choose firstn {num_sites} type {peer_bucket_barrier}
+step chooseleaf firstn {num_copies_per_site} type host
+step emit"""
+        log.debug(f"Rule to be added :\n {rules}\n")
+
+        if not self.add_custom_crush_rules(rule_name=rule_name, rules=rules):
+            log.error("Failed to add the new crush rule")
+            return False
+
+        size = num_sites * num_copies_per_site
+        min_size = math.ceil(size / 2)
+
+        # Enabling stretch mode on the pool
+        if not self.enable_nsite_stretch_pool(
+            pool_name=pool_name,
+            peering_crush_bucket_count=req_peering_buckets,
+            peering_crush_bucket_target=total_buckets,
+            peering_crush_bucket_barrier=peer_bucket_barrier,
+            crush_rule=rule_name,
+            size=size,
+            min_size=min_size,
+        ):
+            log.error(f"Unable to enable stretch mode on the pool : {pool_name}")
+            return False
+        return True
+
+    def get_multi_az_stretch_site_hosts(
+        self, num_data_sites, stretch_bucket: str = "datacenter"
+    ) -> tuple:
+        """
+        Method to get the site hosts from the stretch cluster.
+        Uses osd tree and mon dump commands to prepare a set of all the hosts from each DC.
+        Args:
+            num_data_sites: number of data sites in the cluster
+            stretch_bucket: bucket level at which the stretch rules are set
+        Returns:
+            Hosts: A named tuple containing information about the hosts.
+            - {site_name} (list): A list of hosts in the respective data center.
+        """
+
+        # Getting the CRUSH buckets added into the cluster via osd tree
+        osd_tree_cmd = "ceph osd tree"
+        buckets = self.run_ceph_command(cmd=osd_tree_cmd)
+        dc_buckets = [d for d in buckets["nodes"] if d.get("type") == stretch_bucket]
+        dc_names = [name["name"] for name in dc_buckets]
+        log.debug(
+            f"DC names obtained from OSD tree : {dc_names}, count : {len(dc_names)}"
+        )
+
+        # Dynamically create named tuple fields based on data center names (site names)
+        fields = [dc["name"] for dc in dc_buckets[:num_data_sites]]
+
+        # Create a namedtuple class dynamically based on the site names
+        Hosts = namedtuple("Hosts", fields)
+
+        # Initialize all fields with empty lists
+        hosts = Hosts(**{field: [] for field in fields})
+
+        # Fetching the Mon daemon placement in each CRUSH location
+        def get_mon_from_dc(site_name) -> list:
+            """
+            Returns the list of mon daemon entries (dicts) that are part of the site_name passed.
+            Args:
+                site_name: Name of the site, whose mons have to be fetched.
+            Return:
+                List of mon daemon entries (dicts) present in the given site.
+ """ + mon_dump = "ceph mon dump" + mons = self.run_ceph_command(cmd=mon_dump) + site_mons = [ + d + for d in mons["mons"] + if d.get("crush_location") + == "{" + stretch_bucket + "=" + site_name + "}" + ] + return site_mons + + for i in range(num_data_sites): + dc = dc_buckets.pop() + dc_name = dc["name"] # Use the actual data center name (site name) + osd_hosts = [] + + # Fetching the OSD hosts of the DCs + for crush_id in dc["children"]: + for entry in buckets["nodes"]: + if entry.get("id") == crush_id: + osd_hosts.append(entry.get("name")) + + # Fetch MON hosts for the site + dc_mons = [ + entry.get("name") for entry in get_mon_from_dc(site_name=dc_name) + ] + + # Combine each DC's OSD & MON hosts and update the respective field in the namedtuple + combined_hosts = list(set(osd_hosts + dc_mons)) + field_name = dc_name # Use the site name as the field name + + # Using _replace to update the field + hosts = hosts._replace(**{field_name: combined_hosts}) + + log.debug(f"Hosts present in Datacenter : {dc_name} : {combined_hosts}") + + log.info(f"Hosts present in Cluster : {hosts}") + return hosts + + def enable_nsite_stretch_pool( + self, + pool_name, + peering_crush_bucket_count, + peering_crush_bucket_target, + peering_crush_bucket_barrier, + crush_rule, + size, + min_size, + ) -> bool: + """ + Module to enable stretch mode on the pools in a multi AZ setup + Args: + pool_name: name of the pool + peering_crush_bucket_count: number of buckets for peering to happen + peering_crush_bucket_target: number of peering buckets + peering_crush_bucket_barrier: CRUSH object used for various AZs + crush_rule: name of the crush rule. Make sure the crush rule already exists on the cluster + size: size for the pool + min_size: min_size for the pool + """ + cmd = ( + f"ceph osd pool stretch set {pool_name} {peering_crush_bucket_count} {peering_crush_bucket_target} " + f"{peering_crush_bucket_barrier} {crush_rule} {size} {min_size}" + ) + + try: + self.run_ceph_command(cmd=cmd) + time.sleep(5) + log.debug(f"Checking if the stretch mode op the pool : {pool_name}") + cmd = f"ceph osd pool stretch show {pool_name}" + out = self.run_ceph_command(cmd=cmd) + log.debug(out) + return True + except Exception as err: + log.error( + f"hit exception while enabling/ checking stretch pool details. 
Error : {err}" + ) + return False + + def add_custom_crush_rules(self, rule_name: str, rules: str) -> bool: + """ + Adds the given crush rules into the crush map + Args: + rule_name: Name of the crush rule to add + rules: The rules for crush + Returns: True -> pass, False -> fail + """ + try: + # Getting the crush map + cmd = "ceph osd getcrushmap > /tmp/crush.map.bin" + self.client.exec_command(cmd=cmd, sudo=True) + + # changing it to text for editing + cmd = "crushtool -d /tmp/crush.map.bin -o /tmp/crush.map.txt" + self.client.exec_command(cmd=cmd, sudo=True) + + # Adding the crush rules into the file + cmd = f"""cat <> /tmp/crush.map.txt +rule {rule_name} {"{"} +{rules} +{"}"} +EOF""" + log.debug(f"Command to add crush rules : \n {cmd} \n") + self.client.exec_command(cmd=cmd, sudo=True) + + # Changing back the text file into bin + cmd = "crushtool -c /tmp/crush.map.txt -o /tmp/crush2.map.bin" + self.client.exec_command(cmd=cmd, sudo=True) + + # Setting the new crush map + cmd = "ceph osd setcrushmap -i /tmp/crush2.map.bin" + self.client.exec_command(cmd=cmd, sudo=True) + + time.sleep(5) + + out = self.run_ceph_command(cmd="ceph osd crush rule ls", client_exec=True) + if rule_name not in out: + log.error( + f"New rule added in the cluster is not listed in the cluster." + f"rule added : {rule_name}, \n" + f"rules present on cluster : {out}" + ) + return False + + log.info(f"Crush rule: {rule_name} added successfully") + return True + except Exception as err: + log.error("Failed to set the crush rules") + log.error(err) + return False + def check_inactive_pgs_on_pool(self, pool_name) -> bool: """ Method to check if the provided pool has any PGs in inactive state diff --git a/conf/squid/rados/3AZ-cluster.yaml b/conf/squid/rados/3AZ-cluster.yaml index f66d582cee..4db39e9352 100644 --- a/conf/squid/rados/3AZ-cluster.yaml +++ b/conf/squid/rados/3AZ-cluster.yaml @@ -5,6 +5,8 @@ globals: - ceph-cluster: name: ceph node1: + networks: + - provider_net_cci_15 role: - _admin - mon @@ -15,68 +17,90 @@ globals: - prometheus - osd no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node2: + networks: + - provider_net_cci_15 role: - mon - mgr - rgw - osd no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node3: + networks: + - provider_net_cci_15 role: - osd - mon - mds no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node4: + networks: + - provider_net_cci_13 role: - _admin - mon - mgr - osd + - alertmanager + - grafana + - prometheus no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node5: + networks: + - provider_net_cci_13 role: - mon - mgr - rgw - osd no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node6: + networks: + - provider_net_cci_13 role: - osd - mon - mds no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node7: + networks: + - provider_net_cci_16 role: - _admin - mon - mgr - osd + - alertmanager + - grafana + - prometheus no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node8: + networks: + - provider_net_cci_16 role: - mon - mgr - rgw - osd no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node9: + networks: + - provider_net_cci_16 role: - osd - mon - mds no-of-volumes: 4 - disk-size: 25 + disk-size: 15 node10: role: - client diff --git a/suites/squid/rados/tier-3_rados_test-3-AZ-Cluster.yaml b/suites/squid/rados/tier-3_rados_test-3-AZ-Cluster.yaml index 9e0583ca52..8ddd4b04b8 100644 --- a/suites/squid/rados/tier-3_rados_test-3-AZ-Cluster.yaml +++ b/suites/squid/rados/tier-3_rados_test-3-AZ-Cluster.yaml @@ -105,31 +105,6 @@ tests: args: # display OSD tree - "ceph osd tree" - - test: - name: MDS 
Service deployment with spec
-      desc: Add MDS services using spec file
-      module: test_cephadm.py
-      polarion-id: CEPH-83574728
-      config:
-        steps:
-          - config:
-              command: shell
-              args: # arguments to ceph orch
-                - ceph
-                - fs
-                - volume
-                - create
-                - cephfs
-          - config:
-              command: apply_spec
-              service: orch
-              validate-spec-services: true
-              specs:
-                - service_type: mds
-                  service_id: cephfs
-                  placement:
-                    label: mds
-
   - test:
       name: RGW Service deployment
       desc: RGW Service deployment
@@ -157,6 +132,7 @@ tests:
         node: node10
         install_packages:
           - ceph-common
+          - ceph-base
         copy_admin_keyring: true  # Copy admin keyring to node
         caps: # authorize client capabilities
           mon: "allow *"
@@ -171,3 +147,15 @@ tests:
       config:
         log_to_file: true
       desc: Change config options to enable logging to file
+
+  - test:
+      name: Netsplit Scenarios data-data sites
+      module: test_stretch_n-az_netsplit_scenarios.py
+      polarion-id: CEPH-83574979
+      config:
+        pool_name: test_stretch_pool7
+        stretch_bucket: datacenter
+        netsplit_site_1: DC1
+        netsplit_site_2: DC3
+        delete_pool: true
+      desc: Test 3 AZ stretch cluster netsplit scenario between two data sites
diff --git a/tests/rados/test_stretch_n-az_netsplit_scenarios.py b/tests/rados/test_stretch_n-az_netsplit_scenarios.py
new file mode 100644
index 0000000000..bafca4bc88
--- /dev/null
+++ b/tests/rados/test_stretch_n-az_netsplit_scenarios.py
@@ -0,0 +1,286 @@
+"""
+This test module is used to test net-split scenarios with recovery in a 3 AZ stretch pool environment.
+Includes:
+1. Netsplit b/w data sites in a 3 AZ cluster with post test checks.
+
+"""
+
+import time
+
+from ceph.ceph_admin import CephAdmin
+from ceph.rados.core_workflows import RadosOrchestrator
+from ceph.rados.pool_workflows import PoolFunctions
+from utility.log import Log
+
+log = Log(__name__)
+
+
+def run(ceph_cluster, **kw):
+    """
+    Performs netsplit scenarios between data sites on a 3 AZ stretch cluster
+    Args:
+        ceph_cluster (ceph.ceph.Ceph): ceph cluster
+    """
+
+    log.info(run.__doc__)
+    config = kw.get("config")
+    cephadm = CephAdmin(cluster=ceph_cluster, **config)
+    rados_obj = RadosOrchestrator(node=cephadm)
+    pool_obj = PoolFunctions(node=cephadm)
+    client_node = ceph_cluster.get_nodes(role="client")[0]
+    pool_name = config.get("pool_name", "test_stretch_io")
+    stretch_bucket = config.get("stretch_bucket", "datacenter")
+    netsplit_site_1 = config.get("netsplit_site_1", "DC1")
+    netsplit_site_2 = config.get("netsplit_site_2", "DC2")
+    set_debug = config.get("set_debug", False)
+    rule_name = config.get("rule_name", "3az_rule_netsplit")
+    cluster_nodes = ceph_cluster.get_nodes()
+    installer = ceph_cluster.get_nodes(role="installer")[0]
+    init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
+    log.debug(f"Initial time when test was started : {init_time}")
+
+    try:
+
+        osd_tree_cmd = "ceph osd tree"
+        buckets = rados_obj.run_ceph_command(osd_tree_cmd)
+        dc_buckets = [d for d in buckets["nodes"] if d.get("type") == stretch_bucket]
+        dc_names = [name["name"] for name in dc_buckets]
+
+        if netsplit_site_1 not in dc_names or netsplit_site_2 not in dc_names:
+            log.error(
+                f"Passed DC names do not exist on the cluster. 
+ f"DC's on cluster : {dc_names}" + f"Passed names : {netsplit_site_1} & {netsplit_site_2}" + ) + raise Exception("DC names not found to test netsplit") + + # Starting to flush IP table rules on all hosts + for host in cluster_nodes: + log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}") + host.exec_command(sudo=True, cmd="iptables -F", long_running=True) + time.sleep(60) + + if not rados_obj.run_pool_sanity_check(): + log.error( + "Cluster PGs not in active + clean state before starting the tests" + ) + # raise Exception("Post execution checks failed on the Stretch cluster") + + # log cluster health + rados_obj.log_cluster_health() + + all_hosts = rados_obj.get_multi_az_stretch_site_hosts( + num_data_sites=len(dc_names), stretch_bucket=stretch_bucket + ) + for site in dc_names: + log.debug( + f"Hosts present in Datacenter : {site} : {getattr(all_hosts, site)}" + ) + + log.info( + f"Starting Netsplit scenario in the cluster B/W site {netsplit_site_1} & " + f"{netsplit_site_2}." + f" Pre-checks Passed and IP tables flushed on the cluster" + ) + + if set_debug: + log.debug("Setting up debug configs on the cluster for mon & osd") + rados_obj.run_ceph_command( + cmd="ceph config set mon debug_mon 30", client_exec=True + ) + rados_obj.run_ceph_command( + cmd="ceph config set osd debug_osd 20", client_exec=True + ) + + # Creating test pool to check the effect of Netsplit scenarios on the Pool IO + + if not rados_obj.create_n_az_stretch_pool( + pool_name=pool_name, + rule_name=rule_name, + rule_id=101, + peer_bucket_barrier=stretch_bucket, + num_sites=3, + num_copies_per_site=2, + total_buckets=3, + req_peering_buckets=2, + ): + log.error(f"Unable to Create/Enable stretch mode on the pool : {pool_name}") + raise Exception("Unable to enable stretch pool") + + # Sleeping for 10 seconds for pool to be populated in the cluster + time.sleep(10) + + # Collecting the init no of objects on the pool, before site down + pool_stat = rados_obj.get_cephdf_stats(pool_name=pool_name) + init_objects = pool_stat["stats"]["objects"] + log.debug( + f"initial number of objects on the pool : {pool_name} is {init_objects}" + ) + + # Starting test to induce netsplit b/w + log.debug( + f"Proceeding to induce netsplit scenario b/w the two data sites. Adding IPs of {netsplit_site_1} hosts" + f"into other site, i.e {netsplit_site_2} for blocking Incoming and Outgoing " + f"packets between the two sites" + ) + + for host1 in getattr(all_hosts, netsplit_site_1): + target_host_obj = rados_obj.get_host_object(hostname=host1) + if not target_host_obj: + log.error(f"target host : {host1} not found . Exiting...") + raise Exception("Test execution Failed") + log.debug( + f"Proceeding to add IPtables rules to block incoming - outgoing traffic to host {host1} " + ) + for host2 in getattr(all_hosts, netsplit_site_2): + source_host_obj = rados_obj.get_host_object(hostname=host2) + log.debug( + f"Proceeding to add IPtables rules to block incoming - outgoing traffic to host {host1} " + f"Applying rules on host : {host2}" + ) + if not source_host_obj: + log.error(f"Source host : {host2} not found . 
Exiting...") + if not rados_obj.block_in_out_packets_on_host( + source_host=source_host_obj, target_host=target_host_obj + ): + log.error( + f"Failed to add IPtable rules to block {host1} on {host2}" + ) + raise Exception("Test execution Failed") + + log.info( + f"Completed adding IPtable rules into all hosts of {netsplit_site_1} to {netsplit_site_2}" + ) + + # sleeping for 120 seconds for the DC to be identified as down and proceeding to next checks + time.sleep(120) + + # log cluster health + rados_obj.log_cluster_health() + + # Checking the health status of the cluster and the active alerts for site down + # These should be generated on the cluster + status_report = rados_obj.run_ceph_command(cmd="ceph report", client_exec=True) + ceph_health_status = list(status_report["health"]["checks"].keys()) + expected_health_warns = ( + "OSD_HOST_DOWN", + "OSD_DOWN", + "OSD_DATACENTER_DOWN", + "MON_DOWN", + ) + if not all(elem in ceph_health_status for elem in expected_health_warns): + log.error( + f"We do not have the expected health warnings generated on the cluster.\n" + f" Warns on cluster : {ceph_health_status}\n" + f"Expected Warnings : {expected_health_warns}\n" + ) + # raise execption() + + log.info( + f"The expected health warnings are generated on the cluster. Warnings : {ceph_health_status}" + ) + + log.debug( + "Checking is the cluster is marked degraded and " + "operating in degraded mode post Netsplit b/w data sites" + ) + + log.debug("sleeping for 4 minutes before starting writes.") + time.sleep(600) + + # log cluster health + rados_obj.log_cluster_health() + + # Starting checks to see availability of cluster during netsplit scenario + # perform rados put to check if write ops is possible + pool_obj.do_rados_put(client=client_node, pool=pool_name, nobj=200, timeout=100) + # rados_obj.bench_write(pool_name=pool_name, rados_write_duration=100) + + log.debug("sleeping for 4 minutes for the objects to be displayed in ceph df") + time.sleep(600) + + # Getting the number of objects post write, to check if writes were successful + pool_stat_final = rados_obj.get_cephdf_stats(pool_name=pool_name) + log.debug(pool_stat_final) + final_objects = pool_stat_final["stats"]["objects"] + log.debug( + f"Final number of objects on the pool : {pool_name} is {final_objects}" + ) + + # Objects should be more than the initial no of objects + if int(final_objects) <= int(init_objects): + log.error( + "Write ops should be possible, number of objects in the pool has not changed" + ) + raise Exception( + f"Pool {pool_name} has {pool_stat['stats']['objects']} objs" + ) + + log.info( + f"Successfully wrote {int(final_objects) - int(init_objects)} on pool {pool_name} in degraded mode\n" + f"Proceeding to remove the IPtable rules and recover the cluster from degraded mode" + ) + + time.sleep(5) + + # Starting to flush IP table rules on all hosts + for host in cluster_nodes: + log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}") + host.exec_command(sudo=True, cmd="iptables -F", long_running=True) + log.debug( + "Observed that just IP tables flush did not work to bring back the nodes to cluster." + f"rebooting the nodes post testing. 
Rebooting node : {host.hostname}" + ) + host.exec_command(sudo=True, cmd="reboot") + log.debug("Sleeping for 30 seconds...") + time.sleep(30) + + log.info("Proceeding to do checks post Stretch mode netsplit scenarios") + + if not rados_obj.run_pool_sanity_check(): + log.error("Checks failed post Site Netsplit scenarios") + raise Exception("Post execution checks failed on the Stretch cluster") + + except Exception as err: + log.error(f"Hit an exception: {err}. Test failed") + log.debug( + "Test case expected to fail until bug fix : https://bugzilla.redhat.com/show_bug.cgi?id=2265116" + ) + return 1 + finally: + log.debug("---------------- In Finally Block -------------") + # Starting to flush IP table rules on all hosts + for host in cluster_nodes: + log.debug(f"Proceeding to flush iptable rules on host : {host.hostname}") + host.exec_command(sudo=True, cmd="iptables -F", long_running=True) + log.debug( + "Observed that just IP tables flush did not work to bring back the nodes to cluster." + f"rebooting the nodes post testing. Rebooting node : {host.hostname}" + ) + host.exec_command(sudo=True, cmd="reboot") + + rados_obj.rados_pool_cleanup() + cmd = f"ceph osd crush rule rm {rule_name}" + rados_obj.client.exec_command(cmd=cmd, sudo=True) + + init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'") + log.debug(f"time when test was Ended : {init_time}") + if set_debug: + log.debug("Removing debug configs on the cluster for mon & osd") + rados_obj.run_ceph_command( + cmd="ceph config rm mon debug_mon", client_exec=True + ) + rados_obj.run_ceph_command( + cmd="ceph config rm osd debug_osd", client_exec=True + ) + + time.sleep(60) + # log cluster health + rados_obj.log_cluster_health() + # check for crashes after test execution + if rados_obj.check_crash_status(): + log.error("Test failed due to crash at the end of test") + return 1 + + log.info("All the tests completed on the cluster, Pass!!!") + return 0 diff --git a/tests/rados/test_stretch_netsplit_scenarios.py b/tests/rados/test_stretch_netsplit_scenarios.py index df110c3230..5569537dd7 100644 --- a/tests/rados/test_stretch_netsplit_scenarios.py +++ b/tests/rados/test_stretch_netsplit_scenarios.py @@ -275,7 +275,7 @@ def run(ceph_cluster, **kw): f"Pool {pool_name} has {pool_stat['stats']['objects']} objs" ) log.info( - f"Successfully wrote {pool_stat['stats']['objects']} on pool {pool_name} in degraded mode\n" + f"Successfully wrote {int(final_objects) - int(init_objects)} on pool {pool_name} in degraded mode\n" f"Proceeding to remove the IPtable rules and recover the cluster from degraded mode" )
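
Usage sketch (illustrative, not part of the patch above): the snippet below shows how the multi-AZ helpers added to ceph/rados/core_workflows.py are expected to be driven from a test module, mirroring the calls made in tests/rados/test_stretch_n-az_netsplit_scenarios.py. The pool name, rule name and rule ID are arbitrary example values.

# Illustrative usage sketch of the new multi-AZ stretch helpers (example values only).
from ceph.ceph_admin import CephAdmin
from ceph.rados.core_workflows import RadosOrchestrator
from utility.log import Log

log = Log(__name__)


def demo_multi_az_stretch(ceph_cluster, config):
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    # Create a replicated pool and enable stretch mode across 3 datacenters,
    # keeping 2 copies per site (size=6, min_size=3 are derived by the helper).
    if not rados_obj.create_n_az_stretch_pool(
        pool_name="demo_3az_pool",
        rule_name="demo_3az_rule",
        rule_id=111,
        peer_bucket_barrier="datacenter",
        num_sites=3,
        num_copies_per_site=2,
        total_buckets=3,
        req_peering_buckets=2,
    ):
        raise Exception("Unable to enable stretch mode on the demo pool")

    # Collect the combined OSD + MON hosts, grouped per datacenter bucket.
    all_hosts = rados_obj.get_multi_az_stretch_site_hosts(
        num_data_sites=3, stretch_bucket="datacenter"
    )
    for site in all_hosts._fields:
        log.info(f"Hosts in {site} : {getattr(all_hosts, site)}")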