tests: added a test in which all the nodes are decommissioned
Added a test that decommissions each cluster node and then adds it back
to the cluster, either with an overridden node id or with a new node id
assigned by the controller. The test is similar to the chaos tests that
use the `recycle_all` failure.

Signed-off-by: Michał Maślanka <michal@redpanda.com>
mmaslankaprv committed Aug 1, 2024
1 parent 10c7548 commit a1cc4de
Showing 1 changed file with 123 additions and 0 deletions.
tests/rptest/tests/nodes_decommissioning_test.py
@@ -950,6 +950,129 @@ def test_decommissioning_and_upgrade(self):

self._check_state_consistent(to_decommission_id)

@cluster(num_nodes=6)
@matrix(auto_assign_node_id=[True, False])
def test_recycle_all_nodes(self, auto_assign_node_id):
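        """
        Decommission every cluster node, one at a time, and add each node back
        to the cluster, either with an explicitly overridden node id or with a
        new node id assigned by the controller. Mirrors the chaos tests that
        use the `recycle_all` failure.
        """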

# start redpanda on initial pool of nodes
self.redpanda.start(nodes=self.redpanda.nodes,
auto_assign_node_id=auto_assign_node_id,
omit_seeds_on_idx_one=(not auto_assign_node_id))
        # configure the controller not to take snapshots; the test forces one
        # explicitly later
self.redpanda.set_cluster_config(
{"controller_snapshot_max_age_sec": 20000})

        spec = TopicSpec(name="migration-test-workload",
partition_count=32,
replication_factor=3)

self.client().create_topic(spec)
self._topic = spec.name

self.start_producer()
self.start_consumer()

# wait for some messages before executing actions
self.producer.wait_for_acks(10000, timeout_sec=60, backoff_sec=2)

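        # track the expected cluster membership and the set of nodes that are
        # still waiting to be decommissioned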
current_replicas = {
self.redpanda.node_id(n)
for n in self.redpanda.nodes
}
left_to_decommission = [
self.redpanda.node_id(n) for n in self.redpanda.nodes
]

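        # ids used when re-adding nodes with an explicit override; starting
        # above the original ids avoids collisions with ids already in use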
next_id = 10
admin = Admin(self.redpanda)

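        # a cluster view is consistent when every started node reports the
        # expected broker set, the expected controller replica set and the
        # same cluster view version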
def cluster_view_is_consistent():
versions = set()
for n in self.redpanda.started_nodes():
node_id = self.redpanda.node_id(n)
cv = admin.get_cluster_view(node=n)
cv_nodes = {b['node_id'] for b in cv['brokers']}
versions.add(cv['version'])
self.logger.info(
f"cluster view from node {node_id} - {cv_nodes}, version: {cv['version']}"
)
if cv_nodes != current_replicas:
self.logger.warn(
f"inconsistent cluster view from {node_id}, expected: {current_replicas}, current: {cv_nodes}"
)
return False

controller = admin.get_partition(ns="redpanda",
topic="controller",
id=0,
node=n)
controller_replicas = {
r['node_id']
for r in controller['replicas']
}
self.logger.info(
f"controller partition replicas from node {node_id} - {controller_replicas}"
)
if controller_replicas != current_replicas:
                    self.logger.warn(
                        f"inconsistent controller replicas from {node_id}, expected: {current_replicas}, current: {controller_replicas}"
                    )
return False

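            # every node reported the expected membership; the views are
            # consistent only if they all converged to the same version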
return len(versions) == 1

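        # decommission each node in turn, wipe its data and add it back to the
        # cluster with a new identity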
while len(left_to_decommission) > 0:
decommissioned_id = left_to_decommission.pop()
n = self.redpanda.get_node_by_id(decommissioned_id)
assert n is not None
self.logger.info(
f"decommissioning node with id: {decommissioned_id} - {n.account.hostname}"
)
            # force the controller to take a snapshot before the first node is
            # decommissioned
if len(left_to_decommission) == (len(self.redpanda.nodes) - 1):
self.redpanda.set_cluster_config(
{"controller_snapshot_max_age_sec": 10})
self.redpanda.wait_for_controller_snapshot(
self.redpanda.controller(),
prev_start_offset=0,
timeout_sec=60)

admin.decommission_broker(decommissioned_id)
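            # restore the long snapshot age so that no further snapshots are
            # taken for the remainder of the test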
if len(left_to_decommission) == 5:
self.redpanda.set_cluster_config(
{"controller_snapshot_max_age_sec": 20000})

current_replicas.remove(decommissioned_id)
# wait until node is decommissioned
waiter = NodeDecommissionWaiter(
self.redpanda,
decommissioned_id,
self.logger,
60,
decommissioned_node_ids=[decommissioned_id])
waiter.wait_for_removal()

# clean the node data
self.redpanda.clean_node(n,
preserve_logs=True,
preserve_current_install=True)

# start with new id
self.logger.info(f"adding {n.account.hostname} with id {next_id}")

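            # with auto assignment the controller picks a fresh id for the
            # node, otherwise the node starts with the explicit override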
self.redpanda.start_node(
n,
auto_assign_node_id=auto_assign_node_id,
omit_seeds_on_idx_one=(not auto_assign_node_id),
node_id_override=None if auto_assign_node_id else next_id)
current_replicas.add(self.redpanda.node_id(n, force_refresh=True))
next_id += 1

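            # after each swap the whole cluster has to converge on the new
            # membership before the next node is recycled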
wait_until(cluster_view_is_consistent, 60, 1,
"error waiting for consistent view of the cluster")

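        # validate that the producer and consumer workloads completed cleanly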
self.verify()


class NodeDecommissionFailureReportingTest(RedpandaTest):
def __init__(self, *args, **kwargs):
