Skip to content

Commit

Permalink
STAR-385 Retry cluster stop after exception stopping 'gently' (#36)
Browse files Browse the repository at this point in the history
Cluster stop requests in cleanup_cluster are made with "gently=True" when Jacoco code coverage is enabled to allow the jacoco agent to record results; however, some tests leave nodes in a state where this type of shutdown does not succeed, resulting in the test being marked failed regardless of its true completion status.

This change will retry these stop requests with "gently=False" so that the test completion status will not be altered due to shutdown not completing.

(cherry picked from commit f6fc3d3)
(cherry picked from commit aff9d6e)
(cherry picked from commit fe27230)
(cherry picked from commit f931d43)
  • Loading branch information
djatnieks authored and jacek-lewandowski committed Oct 18, 2022
1 parent e5f5b9f commit 88c022c
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions dtest_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from cassandra.cluster import EXEC_PROFILE_DEFAULT
from cassandra.policies import WhiteListRoundRobinPolicy
from ccmlib.common import is_win
from ccmlib.cluster import Cluster
from ccmlib.cluster import Cluster, NodeError

from dtest import (get_ip_from_node, make_execution_profile, get_auth_provider, get_port_from_node,
get_eager_protocol_version)
Expand Down Expand Up @@ -390,17 +390,35 @@ def stop_active_log_watch(self):
"""
self.log_watch_thread.join(timeout=60)

def stop_cluster(self, gently=False):
    """
    Stop the cluster, falling back to a non-gentle stop if a gentle one fails.

    Some tests, by design, leave the cluster in a state which prevents it
    from being stopped using 'gently'. Retrying without 'gently' avoids
    marking such a test as a failure, but may prevent jacoco results from
    being recorded.

    :param gently: when True, request a gentle stop first; if that raises
        NodeError the stop is retried once with gently=False.
    :raises NodeError: if a non-gentle stop was requested and it failed, or
        if the non-gentle retry itself fails.
    """
    try:
        # Pass 'gently' by keyword, like the retry below does. A positional
        # argument would bind to the first parameter of Cluster.stop, which
        # in ccmlib appears to be 'wait' rather than 'gently'
        # (NOTE(review): confirm against the ccm version in use).
        self.cluster.stop(gently=gently)
    except NodeError as e:
        if not gently:
            # Nothing softer to fall back to; propagate with the original traceback.
            raise
        logger.debug("Exception stopping cluster with gently=True, retrying with gently=False: {0}".format(e))
        self.cluster.stop(gently=False)

def cleanup_cluster(self, request=None, failure=False):
with log_filter('cassandra'): # quiet noise from driver when nodes start going down
test_failed = (request and hasattr(request.node, 'rep_call') and request.node.rep_call.failed) or failure
if self.dtest_config.keep_test_dir or (self.dtest_config.keep_failed_test_dir and test_failed):
self.cluster.stop(gently=self.dtest_config.enable_jacoco_code_coverage)
self.stop_cluster(gently=self.dtest_config.enable_jacoco_code_coverage)
else:
# when recording coverage the jvm has to exit normally
# or the coverage information is not written by the jacoco agent
# otherwise we can just kill the process
if self.dtest_config.enable_jacoco_code_coverage:
self.cluster.stop(gently=True)
self.stop_cluster(gently=True)

# Cleanup everything:
try:
Expand Down

0 comments on commit 88c022c

Please sign in to comment.