Skip to content

Commit 859a469

Browse files
committed
fix(get_rack_names_per_datacenter_and_rack_idx): support topology change
When there is a topology change while this function is being called, it can fail like the following: ``` File ".../sdcm/stress_thread.py", line 298, in _run_cs_stress stress_cmd = self.create_stress_cmd(cmd_runner, keyspace_idx, loader) File ".../sdcm/stress_thread.py", line 187, in create_stress_cmd stress_cmd = self.adjust_cmd_connection_options(stress_cmd, loader, cmd_runner) File ".../sdcm/stress_thread.py", line 159, in adjust_cmd_connection_options stress_cmd = self.adjust_cmd_node_option(stress_cmd, loader, cmd_runner) File ".../sdcm/stress_thread.py", line 142, in adjust_cmd_node_option rack_names = self.loader_set.get_rack_names_per_datacenter_and_rack_idx(db_nodes=self.node_list) File ".../sdcm/cluster.py", line 3224, in get_rack_names_per_datacenter_and_rack_idx rack_names_mapping[(region, rack)] = status[nodes[0]]['rack'] ``` since one of the nodes it might use as a key isn't in the `nodetool status` output anymore. So we now use only the intersection of the nodes passed in the argument and the nodes returned in the response of `nodetool status`.
1 parent 678c129 commit 859a469

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

sdcm/cluster.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# Copyright (c) 2016 ScyllaDB
1313

1414
# pylint: disable=too-many-lines
15+
from __future__ import annotations
16+
1517
import contextlib
1618
import queue
1719
import logging
@@ -2602,7 +2604,7 @@ def check_node_health(self, retries: int = CHECK_NODE_HEALTH_RETRIES) -> None:
26022604
CHECK_NODE_HEALTH_RETRY_DELAY, self.name)
26032605
time.sleep(CHECK_NODE_HEALTH_RETRY_DELAY)
26042606

2605-
def get_nodes_status(self):
2607+
def get_nodes_status(self) -> dict[BaseNode, dict]:
26062608
nodes_status = {}
26072609
try:
26082610
statuses = self.parent_cluster.get_nodetool_status(verification_node=self)
@@ -2660,7 +2662,7 @@ def get_peers_info(self):
26602662
return peers_details
26612663

26622664
@retrying(n=5, sleep_time=10, raise_on_exceeded=False)
2663-
def get_gossip_info(self):
2665+
def get_gossip_info(self) -> dict[BaseNode, dict]:
26642666
gossip_info = self.run_nodetool('gossipinfo', verbose=False, warning_event_on_exception=(Exception,),
26652667
publish_event=False)
26662668
LOGGER.debug("get_gossip_info: %s", gossip_info)
@@ -3180,7 +3182,7 @@ def tags(self) -> Dict[str, str]:
31803182
def dead_nodes_ip_address_list(self):
31813183
return [node.ip_address for node in self.dead_nodes_list]
31823184

3183-
def get_ip_to_node_map(self):
3185+
def get_ip_to_node_map(self) -> dict[str, BaseNode]:
31843186
"""returns {ip: node} map for all nodes in cluster to get node by ip"""
31853187
return {ip: node for node in self.nodes for ip in node.get_all_ip_addresses()}
31863188

@@ -3219,8 +3221,12 @@ def get_rack_names_per_datacenter_and_rack_idx(self, db_nodes: list[BaseNode] |
32193221
db_nodes = db_nodes if db_nodes else self.nodes
32203222
status = db_nodes[0].get_nodes_status()
32213223

3224+
# intersection of the requested nodes and the nodes returned by nodetool status
3225+
# since topology might change during this command execution
3226+
actual_db_nodes = set(status.keys()).intersection(db_nodes)
3227+
32223228
rack_names_mapping = {}
3223-
for (region, rack), nodes in self.nodes_by_racks_idx_and_regions(nodes=db_nodes).items():
3229+
for (region, rack), nodes in self.nodes_by_racks_idx_and_regions(nodes=actual_db_nodes).items():
32243230
rack_names_mapping[(region, rack)] = status[nodes[0]]['rack']
32253231

32263232
return rack_names_mapping

0 commit comments

Comments
 (0)