fix(shrink cluster): address comments scylladb#1
juliayakovlev committed Mar 29, 2021
1 parent 6340fb3 commit 7928cd8
Showing 3 changed files with 27 additions and 29 deletions.
8 changes: 4 additions & 4 deletions sdcm/cluster.py
@@ -3145,7 +3145,7 @@ def destroy(self):
for node in self.nodes:
node.destroy()

- def terminate_node(self, node, by_nemesis=""):
+ def terminate_node(self, node):
if node.ip_address not in self.dead_nodes_ip_address_list:
self.dead_nodes_list.append(DeadNode(name=node.name,
public_ip=node.public_ip_address,
@@ -3154,7 +3154,7 @@ def terminate_node(self, node, by_nemesis=""):
ip_address=node.ip_address,
shards=node.scylla_shards,
termination_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
- terminated_by_nemesis=by_nemesis))
+ terminated_by_nemesis=node.running_nemesis))
if node in self.nodes:
self.nodes.remove(node)
node.destroy()
@@ -4259,7 +4259,7 @@ def restore_keyspace(self, backup_data):
for node in self.nodes:
node.run_nodetool('repair')

- def decommission(self, node, nemesis=''):
+ def decommission(self, node):
def get_node_ip_list(verification_node):
try:
ip_node_list = []
@@ -4290,7 +4290,7 @@ def get_node_ip_list(verification_node):
raise NodeStayInClusterAfterDecommission(error_msg)

LOGGER.info('Decommission %s PASS', node)
- self.terminate_node(node, by_nemesis=nemesis) # pylint: disable=no-member
+ self.terminate_node(node) # pylint: disable=no-member
Setup.tester_obj().monitors.reconfigure_scylla_monitoring()

@property
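For context, a minimal sketch of the pattern this change introduces in `sdcm/cluster.py`: the nemesis marks its target node, and `terminate_node` reads `node.running_nemesis` instead of receiving a `by_nemesis` argument. This is not the SCT code itself; only the `DeadNode` fields relevant to the hunk above are kept, and `Node`/`Cluster` are simplified stand-ins.

```python
# Simplified sketch, assuming node.running_nemesis is set by the nemesis
# before the node is terminated (as in the diff above).
from dataclasses import dataclass
from datetime import datetime


@dataclass
class DeadNode:
    # Only the fields relevant to this change; the real record also keeps
    # public/private IPs, shard count, etc.
    name: str
    ip_address: str
    terminated_by_nemesis: str
    termination_time: str


class Node:  # stand-in for the SCT node object
    def __init__(self, name, ip_address):
        self.name = name
        self.ip_address = ip_address
        self.running_nemesis = None  # set by the nemesis that targets this node

    def destroy(self):
        pass


class Cluster:  # stand-in for the SCT cluster object
    def __init__(self, nodes):
        self.nodes = list(nodes)
        self.dead_nodes_list = []

    def terminate_node(self, node):
        # No by_nemesis argument: the attribution now comes from the node itself.
        self.dead_nodes_list.append(DeadNode(
            name=node.name,
            ip_address=node.ip_address,
            terminated_by_nemesis=node.running_nemesis,
            termination_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
        ))
        if node in self.nodes:
            self.nodes.remove(node)
        node.destroy()


# Usage: the nemesis tags its target once, terminate_node picks it up.
node = Node("node-1", "10.0.0.1")
cluster = Cluster([node])
node.running_nemesis = "TerminateAndReplaceNode"
cluster.terminate_node(node)
assert cluster.dead_nodes_list[0].terminated_by_nemesis == "TerminateAndReplaceNode"
```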
36 changes: 17 additions & 19 deletions sdcm/nemesis.py
@@ -702,16 +702,16 @@ def _get_kubernetes_node_break_methods(self):
]
raise UnsupportedNemesis("Only GkeScyllaPodCluster is supported")

- def _terminate_cluster_node(self, node, by_nemesis=""):
- self.cluster.terminate_node(node, by_nemesis=by_nemesis)
+ def _terminate_cluster_node(self, node):
+ self.cluster.terminate_node(node)
self.monitoring_set.reconfigure_scylla_monitoring()

def disrupt_nodetool_decommission(self, add_node=True, disruption_name=None):
if self._is_it_on_kubernetes() and disruption_name is None:
self.set_last_node_as_target()
self._set_current_disruption(f"{disruption_name or 'Decommission'} {self.target_node}")
target_is_seed = self.target_node.is_seed
- self.cluster.decommission(self.target_node, nemesis=disruption_name or 'Decommission')
+ self.cluster.decommission(self.target_node)
new_node = None
if add_node:
# When adding node after decommission the node is declared as up only after it completed bootstrapping,
@@ -747,8 +747,8 @@ def disrupt_nodetool_seed_decommission(self, add_node=True):
self.cluster.update_seed_provider()

@latency_calculator_decorator
- def _terminate_and_wait(self, target_node, sleep_time=300, by_nemesis=""):
- self._terminate_cluster_node(target_node, by_nemesis=by_nemesis)
+ def _terminate_and_wait(self, target_node, sleep_time=300):
+ self._terminate_cluster_node(target_node)
time.sleep(sleep_time) # Sleeping for 5 mins to let the cluster live with a missing node for a while

@latency_calculator_decorator
@@ -808,7 +808,7 @@ def _disrupt_terminate_decommission_add_node_kubernetes(self, node, node_termina
node_terminate_method = getattr(node, node_terminate_method_name)
node_terminate_method()
self.log.info(f'Decommission %s', node)
- self.cluster.decommission(node, nemesis='OperatorNodeTerminateDecommissionAdd')
+ self.cluster.decommission(node)
self.add_new_node(rack=node.rack)

def _disrupt_terminate_and_replace_node_kubernetes(self, node, node_terminate_method_name): # pylint: disable=invalid-name
@@ -836,7 +836,7 @@ def disrupt_terminate_and_replace_node(self): # pylint: disable=invalid-name
self._set_current_disruption('TerminateAndReplaceNode %s' % self.target_node)
old_node_ip = self.target_node.ip_address
InfoEvent(message='StartEvent - Terminate node and wait 5 minutes').publish()
- self._terminate_and_wait(target_node=self.target_node, by_nemesis='TerminateAndReplaceNode')
+ self._terminate_and_wait(target_node=self.target_node)
InfoEvent(message='FinishEvent - target_node was terminated').publish()
new_node = self._add_and_init_new_cluster_node(old_node_ip, rack=self.target_node.rack)
try:
@@ -2255,7 +2255,7 @@ def disrupt_remove_node_then_add_node(self):
node_to_remove.stop_scylla_server(verify_up=True, verify_down=True)

# terminate node
- self._terminate_cluster_node(node_to_remove, by_nemesis='TerminateAndRemoveNodeMonkey')
+ self._terminate_cluster_node(node_to_remove)

# full cluster repair
up_normal_nodes.remove(node_to_remove)
@@ -2496,7 +2496,7 @@ def decommission_post_action():
if self.target_node.ip_address not in ips or decommission_done:
self.log.error(
'The target node is decommission unexpectedly, decommission might complete before stopping it. Re-add a new node')
- self._terminate_cluster_node(self.target_node, by_nemesis='DecommissionStreamingErr')
+ self._terminate_cluster_node(self.target_node)
new_node = self._add_and_init_new_cluster_node(rack=self.target_node.rack)
self.unset_current_running_nemesis(new_node)
return new_node
@@ -2623,11 +2623,10 @@ def add_new_node(self, rack=0):
return self._add_and_init_new_cluster_node(rack=rack)

@latency_calculator_decorator
- def decommission_node(self, node, nemesis=''):
- self.cluster.decommission(node, nemesis=nemesis)
+ def decommission_node(self, node):
+ self.cluster.decommission(node)

- def decommission_nodes(self, add_nodes_number, rack, is_seed: Optional[Union[bool, DefaultValue]] = DefaultValue,
- nemesis=""):
+ def decommission_nodes(self, add_nodes_number, rack, is_seed: Optional[Union[bool, DefaultValue]] = DefaultValue):
for _ in range(add_nodes_number):
if self._is_it_on_kubernetes():
self.set_last_node_as_target(rack=rack, is_seed=is_seed)
@@ -2636,17 +2635,17 @@ def decommission_nodes(self, add_nodes_number, rack, is_seed: Optional[Union[boo
self.log.info("Next node will be removed %s", self.target_node)
try:
InfoEvent(message='StartEvent - ShrinkCluster started decommissioning a node').publish()
- self.decommission_node(self.target_node, nemesis=nemesis)
+ self.decommission_node(self.target_node)
finally:
InfoEvent(message='FinishEvent - ShrinkCluster has done decommissioning a node').publish()

def disrupt_grow_shrink_cluster(self):
- self._disrupt_grow_shrink_cluster(rack=0, nemesis="GrowShrinkClusterNemesis")
+ self._disrupt_grow_shrink_cluster(rack=0)

def disrupt_grow_shrink_new_rack(self):
- self._disrupt_grow_shrink_cluster(rack=max(self.cluster.racks) + 1, nemesis="AddRemoveRackNemesis")
+ self._disrupt_grow_shrink_cluster(rack=max(self.cluster.racks) + 1)

- def _disrupt_grow_shrink_cluster(self, rack=0, nemesis=""):
+ def _disrupt_grow_shrink_cluster(self, rack=0):
if rack > 0:
if not self._is_it_on_kubernetes():
raise UnsupportedNemesis("SCT rack functionality is implemented only on kubernetes")
@@ -2670,8 +2669,7 @@ def _disrupt_grow_shrink_cluster(self, rack=0, nemesis=""):
# Currently on kubernetes first two nodes of each rack are getting seed status
# Because of such behavior only way to get them decommission is to enable decommissioning
# TBD: After https://github.com/scylladb/scylla-operator/issues/292 is fixed remove is_seed parameter
- self.decommission_nodes(add_nodes_number, rack, is_seed=None if self._is_it_on_kubernetes() else DefaultValue,
- nemesis=nemesis)
+ self.decommission_nodes(add_nodes_number, rack, is_seed=None if self._is_it_on_kubernetes() else DefaultValue)
self.log.info("Finish cluster shrink. Current number of nodes %s", len(self.cluster.nodes))

def disrupt_hot_reloading_internode_certificate(self):
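To show what the simplification buys on the nemesis side, here is a hedged sketch of the shrink path after this commit. Class and method names are simplified stand-ins, not the actual SCT classes: the point is that the nemesis name no longer has to be threaded through decommission_nodes → decommission_node → cluster.decommission, because the nemesis tags its target node up front.

```python
# Illustrative only: a condensed shrink flow with the nemesis= keyword gone
# from every call in the chain.
class ClusterStub:  # stand-in; the real decommission runs nodetool and verifies membership
    def decommission(self, node):
        print(f"decommissioning {node.name}, attributed to {node.running_nemesis}")


class NodeStub:
    def __init__(self, name):
        self.name = name
        self.running_nemesis = None


class ShrinkClusterSketch:
    def __init__(self, cluster):
        self.cluster = cluster

    def set_current_running_nemesis(self, node):
        # The disruption marks its target once; everything downstream
        # (cluster.decommission -> terminate_node) reads node.running_nemesis.
        node.running_nemesis = type(self).__name__

    def decommission_nodes(self, nodes_to_remove):
        for node in nodes_to_remove:
            self.set_current_running_nemesis(node)
            self.cluster.decommission(node)  # no nemesis= keyword any more


ShrinkClusterSketch(ClusterStub()).decommission_nodes([NodeStub("node-4"), NodeStub("node-5")])
```

As in the diff above, the real nemesis still wraps each decommission in StartEvent/FinishEvent InfoEvents; only the parameter plumbing changes.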
12 changes: 6 additions & 6 deletions test-cases/longevity/longevity-10gb-3h.yaml
@@ -1,17 +1,17 @@
- test_duration: 240
- stress_cmd: ["cassandra-stress write cl=QUORUM duration=180m -schema 'replication(factor=3) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=1000 -pop seq=1..10000000 -log interval=5"
+ test_duration: 40
+ stress_cmd: ["cassandra-stress write cl=QUORUM duration=35m -schema 'replication(factor=3) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=1000 -pop seq=1..10000000 -log interval=5"
]

- n_db_nodes: 6
- n_loaders: 2
+ n_db_nodes: 3
+ n_loaders: 1
n_monitor_nodes: 1

- instance_type_db: 'i3.4xlarge'
+ instance_type_db: 'i3.2xlarge'
gce_instance_type_db: 'n1-highmem-16'
gce_instance_type_loader: 'e2-standard-4'
scylla_repo_loader: 'https://s3.amazonaws.com/downloads.scylladb.com/rpm/centos/scylla-4.3.repo'

- nemesis_class_name: 'SisyphusMonkey'
+ nemesis_class_name: 'NodeTerminateAndReplace'
nemesis_seed: '111'
nemesis_interval: 2
ssh_transport: 'libssh2'