Skip to content

Commit

Permalink
DAOS-1452 test: ensure ULTs stacks are dumped when needed (#7768) (#9732
Browse files Browse the repository at this point in the history
)

Address issues with dumping ULTs stacks upon failure/timeout
during CI tests. Add test case to verify this functionality.
Please note that those tests are not run for now since they
report legitimate errors.

Change-Id: Id48031bc7a262cb27aaaac8e4726d45265ad9c88
Signed-off-by: Bruno Faccini <bruno.faccini@intel.com>
  • Loading branch information
bfaccini authored Jul 21, 2022
1 parent 1f66c35 commit 7ea81a4
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 52 deletions.
105 changes: 81 additions & 24 deletions src/tests/ftest/server/daos_server_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from __future__ import print_function

from apricot import TestWithServers
from general_utils import pcmd, stop_processes
from general_utils import pcmd, dump_engines_stacks

import time

class DaosServerDumpTest(TestWithServers):
"""Daos server dump tests.
Expand All @@ -25,43 +26,99 @@ def __init__(self, *args, **kwargs):
self.start_servers_once = False
self.setup_start_agents = False

def tearDown(self):
"""Tear down after each test case."""
super().tearDown()

# force test status !!
# use mangling trick described at
# https://stackoverflow.com/questions/3385317/private-variables-and-methods-in-python
# to do so
self._Test__status = 'PASS'

# DAOS-1452 may need to check for one file per engine...
ret_codes = pcmd(self.hostlist_servers, r"ls /tmp/daos_dump*.txt")
# Report any failures
if len(ret_codes) > 1 or 0 not in ret_codes:
failed = [
"{}: rc={}".format(val, key)
for key, val in ret_codes.items() if key != 0
]
print(
"no ULT stacks dump found on following hosts: {}".format(
", ".join(failed)))
self._Test__status = 'FAIL'

def test_daos_server_dump_basic(self):
"""JIRA ID: DAOS-1452.
Test Description: Test engine ULT stacks dump.
Test Description: Test engine ULT stacks dump (basic).
:avocado: tags=all,daily_regression
:avocado: tags=vm
:avocado: tags=control,server_start,basic
:avocado: tags=daos_server_dump_test,test_daos_server_dump_basic
:avocado: tags=all
:avocado: tags=daos_server_dump_tests,test_daos_server_dump_basic
"""

ret_codes = stop_processes(self.hostlist_servers, r"daos_engine",
added_filter=r"'\<(grep|defunct)\>'",
dump_ult_stacks=True)
ret_codes = dump_engines_stacks(self.hostlist_servers,
added_filter=r"'\<(grep|defunct)\>'")
# at this time there is no way to know when Argobots ULTs stacks
# has completed, see DAOS-1452/DAOS-9942.
if 1 in ret_codes:
print(
"Stopped daos_engine processes on {}".format(
"Dumped daos_engine stacks on {}".format(
str(ret_codes[1])))
if 0 in ret_codes:
print(
self.fail(
"No daos_engine processes found on {}".format(
str(ret_codes[0])))

# XXX may need to check for one file per engine...
ret_codes = pcmd(self.hostlist_servers, r"ls /tmp/daos_dump*.txt")
# Report any failures
if len(ret_codes) > 1 or 0 not in ret_codes:
failed = [
"{}: rc={}".format(val, key)
for key, val in ret_codes.items() if key != 0
]
self.fail(
"no ULT stacks dump found on following hosts: {}".format(
", ".join(failed)))

self.log.info("Test passed!")

# set stopped servers state to make teardown happy
def test_daos_server_dump_on_error(self):
"""JIRA ID: DAOS-1452.
Test Description: Test engine ULT stacks dump (error case).
:avocado: tags=all
:avocado: tags=daos_server_dump_tests,test_daos_server_dump_on_error
"""

self.log.info("Forcing test error!")
self.error()

def test_daos_server_dump_on_fail(self):
"""JIRA ID: DAOS-1452.
Test Description: Test engine ULT stacks dump (failure case).
:avocado: tags=all
:avocado: tags=daos_server_dump_tests,test_daos_server_dump_on_fail
"""

self.log.info("Forcing test failure!")
self.fail()

def test_daos_server_dump_on_timeout(self):
"""JIRA ID: DAOS-1452.
Test Description: Test engine ULT stacks dump (timeout case).
:avocado: tags=all
:avocado: tags=daos_server_dump_tests,test_daos_server_dump_on_timeout
"""

self.log.info("Sleeping to trigger test timeout!")
time.sleep(30)

def test_daos_server_dump_on_unexpected_engine_status(self):
"""JIRA ID: DAOS-1452.
Test Description: Test engine ULT stacks dump (unexpected engine status case).
:avocado: tags=all
:avocado: tags=daos_server_dump_tests,test_daos_server_dump_on_unexpected_engine_status
"""

self.log.info("Forcing servers expected state to make teardown unhappy!")
# set stopped servers expected state to make teardown unhappy
self.server_managers[0].update_expected_states(
None, ["stopped", "excluded", "errored"])
2 changes: 2 additions & 0 deletions src/tests/ftest/server/daos_server_dump.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ hosts:
- server-E
- server-F
timeout: 140
timeouts:
test_daos_server_dump_on_timeout: 30
server_config:
name: daos_server
servers:
Expand Down
45 changes: 44 additions & 1 deletion src/tests/ftest/util/apricot/apricot/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from fault_config_utils import FaultInjection
from general_utils import \
get_partition_hosts, stop_processes, get_default_config_file, pcmd, get_file_listing, \
DaosTestError, run_command, get_avocado_config_value, set_avocado_config_value
DaosTestError, run_command, get_avocado_config_value, set_avocado_config_value, \
dump_engines_stacks
from logger_utils import TestLogger
from pydaos.raw import DaosContext, DaosLog, DaosApiError
from server_utils import DaosServerManager
Expand Down Expand Up @@ -697,6 +698,8 @@ def __init__(self, *args, **kwargs):
# self.debug = False
# self.config = None
self.job_manager = None
# whether engines ULT stacks have been already dumped
self.dumped_engines_stacks = False
self.label_generator = LabelGenerator()

def setUp(self):
Expand Down Expand Up @@ -1359,8 +1362,46 @@ def remove_temp_test_dir(self):
errors.append("Error removing temporary test files")
return errors

def dump_engines_stacks(self, message):
"""Dump the engines ULT stacks.
Args:
message (str): reason for dumping the ULT stacks. Defaults to None.
"""
if self.dumped_engines_stacks is False:
self.dumped_engines_stacks = True
self.log.info("%s, dumping ULT stacks", message)
dump_engines_stacks(self.hostlist_servers)

def report_timeout(self):
"""Dump ULTs stacks if this test case was timed out."""
super().report_timeout()
if self.timeout is not None and self.time_elapsed > self.timeout:
# dump engines ULT stacks upon test timeout
self.dump_engines_stacks("Test has timed-out")

def fail(self, message=None):
"""Dump engines ULT stacks upon test failure."""
self.dump_engines_stacks("Test has failed")
super().fail(message)

def error(self, message=None):
"""Dump engines ULT stacks upon test error."""
self.dump_engines_stacks("Test has errored")
super().error(message)

def tearDown(self):
"""Tear down after each test case."""

# dump engines ULT stacks upon test failure
# check of Avocado test status during teardown is presently useless
# and about same behavior has been implemented by adding both fail()
# error() method above, to overload the methods of Avocado base Test
# class (see DAOS-1452/DAOS-9941 and Avocado issue #5217 with
# associated PR-5224)
if self.status is not None and self.status != 'PASS' and self.status != 'SKIP':
self.dump_engines_stacks("Test status is {}".format(self.status))

# Report whether or not the timeout has expired
self.report_timeout()

Expand Down Expand Up @@ -1583,6 +1624,8 @@ def stop_servers(self):
errors.append(
"ERROR: At least one multi-variant server was not found in "
"its expected state; stopping all servers")
# dump engines stacks if not already done
self.dump_engines_stacks("Some engine not in expected state")
self.test_log.info(
"Stopping %s group(s) of servers", len(self.server_managers))
errors.extend(self._stop_managers(self.server_managers, "servers"))
Expand Down
80 changes: 54 additions & 26 deletions src/tests/ftest/util/general_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,8 +753,57 @@ def convert_list(value, separator=","):
return separator.join([str(item) for item in value])


def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None,
dump_ult_stacks=False):
def dump_engines_stacks(hosts, verbose=True, timeout=60, added_filter=None):
"""Signal the engines on each hosts to generate their ULT stacks dump.
Args:
hosts (list): hosts on which to signal the engines
verbose (bool, optional): display command output. Defaults to True.
timeout (int, optional): command timeout in seconds. Defaults to 60
seconds.
added_filter (str, optional): negative filter to better identify
engines.
Returns:
dict: a dictionary of return codes keys and accompanying NodeSet
values indicating which hosts yielded the return code.
Return code keys:
0 No engine matched the criteria / No engine signaled.
1 One or more engine matched the criteria and a signal was
sent.
"""
result = {}
log = getLogger()
log.info("Dumping ULT stacks of engines on %s", hosts)

if added_filter:
ps_cmd = "/usr/bin/ps xa | grep daos_engine | grep -vE {}".format(
added_filter)
else:
ps_cmd = "/usr/bin/pgrep --list-full daos_engine"

if hosts is not None:
commands = [
"rc=0",
"if " + ps_cmd,
"then rc=1",
"sudo pkill --signal USR2 daos_engine",
# leave some time for ABT info/stacks dump to complete.
# at this time there is no way to know when Argobots ULTs stacks
# has completed, see DAOS-1452/DAOS-9942.
"sleep 30",
"fi",
"exit $rc",
]
result = pcmd(hosts, "; ".join(commands), verbose, timeout,
None)

return result


def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None):
"""Stop the processes on each hosts that match the pattern.
Args:
Expand All @@ -765,8 +814,6 @@ def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None,
seconds.
added_filter (str, optional): negative filter to better identify
processes.
dump_ult_stacks (bool, optional): whether SIGUSR2 should be sent before
any other sigs, to dump all ULTs stacks of servers.
Returns:
Expand All @@ -780,11 +827,7 @@ def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None,
"""
result = {}
log = getLogger()
if dump_ult_stacks is True:
log.info("First dumping ULT stacks, then Killing any processes on %s "
"that match: %s", hosts, pattern)
else:
log.info("Killing any processes on %s that match: %s", hosts, pattern)
log.info("Killing any processes on %s that match: %s", hosts, pattern)

if added_filter:
ps_cmd = "/usr/bin/ps xa | grep -E {} | grep -vE {}".format(
Expand All @@ -793,22 +836,7 @@ def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None,
ps_cmd = "/usr/bin/pgrep --list-full {}".format(pattern)

if hosts is not None:
if dump_ult_stacks is True and "daos_engine" in pattern:
commands_part1 = [
"rc=0",
"if " + ps_cmd,
"then rc=1",
"sudo pkill --signal USR2 {}".format(pattern),
# leave time for ABT info/stacks dump vs xstream/pool/ULT number
"sleep 20",
"fi",
"exit $rc",
]
result = pcmd(hosts, "; ".join(commands_part1), verbose, timeout,
None)

# in case dump of ULT stacks is still running it may be interrupted
commands_part2 = [
commands = [
"rc=0",
"if " + ps_cmd,
"then rc=1",
Expand All @@ -824,7 +852,7 @@ def stop_processes(hosts, pattern, verbose=True, timeout=60, added_filter=None,
"fi",
"exit $rc",
]
result = pcmd(hosts, "; ".join(commands_part2), verbose, timeout, None)
result = pcmd(hosts, "; ".join(commands), verbose, timeout, None)
return result


Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/util/server_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ def kill(self):
"""Forcibly terminate any server process running on hosts."""
regex = self.manager.job.command_regex
# Try to dump all server's ULTs stacks before kill.
result = stop_processes(self._hosts, regex, dump_ult_stacks=True)
result = stop_processes(self._hosts, regex)
if 0 in result and len(result) == 1:
print("No remote {} server processes killed (none found), done.".format(regex))
else:
Expand Down

0 comments on commit 7ea81a4

Please sign in to comment.