From 9c942d0a234e99c33dbb54955f3bb5024aa80c90 Mon Sep 17 00:00:00 2001 From: Giles Knap Date: Fri, 7 Feb 2025 12:10:52 +0000 Subject: [PATCH 1/4] make sure protocols folder exists --- src/rtems_proxy/copy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/rtems_proxy/copy.py b/src/rtems_proxy/copy.py index 1667d02..8cc55fb 100644 --- a/src/rtems_proxy/copy.py +++ b/src/rtems_proxy/copy.py @@ -32,10 +32,12 @@ def copy_rtems(): # st.cmd and ioc.db dest_runtime = local_root / "runtime" - # TODO - perhaps do protocol files in this fashion for linux IOCs too, - # in which case this needs to go somewhere generic protocol_folder = GLOBALS.RUNTIME / "protocol" protocol_folder.mkdir(parents=True, exist_ok=True) + + # TODO - perhaps do protocol files in this fashion for linux IOCs too, + # in which case this needs to go somewhere generic + dest_ioc.mkdir(parents=True, exist_ok=True) protocol_files = GLOBALS.SUPPORT.glob("**/*.proto*") for proto_file in protocol_files: dest = protocol_folder / proto_file.name From 184732411592f01589683da0b55f3bfa7e8f554b Mon Sep 17 00:00:00 2001 From: Giles Knap Date: Thu, 13 Feb 2025 14:33:21 +0000 Subject: [PATCH 2/4] Improvements Change root mount point in RTEMS to /epics Add a configure function for MOTBoot Add a stress function for testing reboots --- src/rtems_proxy/__main__.py | 46 +++++++++++++++++++++--- src/rtems_proxy/configure.py | 48 +++++++++++++++++++++++++ src/rtems_proxy/copy.py | 12 ++----- src/rtems_proxy/rsync.sh.jinja | 2 +- src/rtems_proxy/telnet.py | 65 ++++++++++++++++++++++++++++++---- 5 files changed, 152 insertions(+), 21 deletions(-) create mode 100644 src/rtems_proxy/configure.py diff --git a/src/rtems_proxy/__main__.py b/src/rtems_proxy/__main__.py index dc88447..6b71079 100644 --- a/src/rtems_proxy/__main__.py +++ b/src/rtems_proxy/__main__.py @@ -1,13 +1,17 @@ +from datetime import datetime from pathlib import Path import typer from jinja2 import Template from ruamel.yaml import YAML +from rtems_proxy.utils import run_command + from . import __version__ +from .configure import Configure from .copy import copy_rtems from .globals import GLOBALS -from .telnet import ioc_connect, report +from .telnet import ioc_connect, motboot_connect, report __all__ = ["main"] @@ -70,7 +74,7 @@ def start( if copy: copy_rtems() if connect: - ioc_connect(GLOBALS.RTEMS_CONSOLE, reboot=reboot) + ioc_connect(GLOBALS.RTEMS_CONSOLE, reboot=reboot, attach=True) else: report("IOC console connection disabled. ") @@ -154,8 +158,42 @@ def dev( typer.echo(f"\n\nPlease first source {script_file} to set up the dev environment.") -# test with: -# pipenv run python -m ibek +@cli.command() +def configure( + debug: bool = typer.Option(False, help="use debug ioc binary"), + attach: bool = typer.Option( + False, help="attach to the IOC console after configuration" + ), +): + """ + Configure the RTEMS IOC boot parameters + """ + telnet = motboot_connect(GLOBALS.RTEMS_CONSOLE) + config = Configure(telnet, debug) + config.apply_settings() + telnet.close() + if attach: + run_command(telnet.command) + + +@cli.command() +def stress(): + """ + Stress test the IOC by constantly rebooting and checking for failed boot + + Aborts and prints the time when a failed boot is detected + """ + try: + tries = 0 + while True: + tries += 1 + print(f">>>>>> REBOOT ATTEMPT {tries} <<<<<<<") + ioc_connect( + GLOBALS.RTEMS_CONSOLE, reboot=True, attach=False, raise_errors=True + ) + except Exception as e: + msg = f"\n\nIOC boot number {tries} failed at {datetime.now()}.\n\n" + raise RuntimeError(msg) from e if __name__ == "__main__": diff --git a/src/rtems_proxy/configure.py b/src/rtems_proxy/configure.py new file mode 100644 index 0000000..008899d --- /dev/null +++ b/src/rtems_proxy/configure.py @@ -0,0 +1,48 @@ +""" +Class to apply MOTBoot configuration to a VME crate. +""" + +from .globals import GLOBALS +from .telnet import TelnetRTEMS + + +class Configure: + def __init__(self, telnet: TelnetRTEMS, debug: bool = False): + self.telnet = telnet + self.debug = debug + + def apply_nvm(self, variable: str, value: str): + self.telnet.sendline(f"gevE {variable}") + self.telnet.expect(r"\(Blank line terminates input.\)") + self.telnet.sendline(value + "\r") + self.telnet.sendline("\r") + self.telnet.expect(r"\?") + self.telnet.sendline("Y\r") + + def apply_settings(self): + nfs_mount = f"{GLOBALS.RTEMS_NFS_IP}:/iocs/{GLOBALS.IOC_NAME}:/epics" + ioc_bin = "ioc" if self.debug else "ioc.boot" + mot_boot = ( + f"dla=malloc 0x4000000\r" + f"tftpGet -d/dev/enet1" + f" -f{GLOBALS.IOC_NAME.lower()}/ioc/bin/RTEMS-beatnik/{ioc_bin}" + f" -m{GLOBALS.RTEMS_IOC_NETMASK}" + f" -g{GLOBALS.RTEMS_IOC_GATEWAY}" + f" -s{GLOBALS.RTEMS_TFTP_IP}" + f" -c{GLOBALS.RTEMS_IOC_IP}" + f" -adla -r4\r" + f"go -a04000000\r" + f"reset" + ) + + self.apply_nvm("mot-/dev/enet0-snma", GLOBALS.RTEMS_IOC_NETMASK) + self.apply_nvm("mot-/dev/enet0-gipa", GLOBALS.RTEMS_IOC_GATEWAY) + self.apply_nvm("mot-/dev/enet0-sipa", GLOBALS.RTEMS_NFS_IP) + self.apply_nvm("mot-/dev/enet0-cipa", GLOBALS.RTEMS_IOC_IP) + self.apply_nvm("mot-boot-device", "/dev/em1") + self.apply_nvm("mot-script-boot", mot_boot) + self.apply_nvm("rtems-client-name", GLOBALS.IOC_NAME) + self.apply_nvm("epics-script", "/epics/runtime/st.cmd") + self.apply_nvm("epics-nfsmount", nfs_mount) + # self.apply_nvm_variable("epics-ntpserver", "EPICS_TS_NTP_INET") + self.apply_nvm("mot-/dev/enet0-snma", GLOBALS.RTEMS_IOC_NETMASK) diff --git a/src/rtems_proxy/copy.py b/src/rtems_proxy/copy.py index 8cc55fb..fb2eeab 100644 --- a/src/rtems_proxy/copy.py +++ b/src/rtems_proxy/copy.py @@ -2,16 +2,15 @@ functions for moving IOC assets into position for a remote IOC to access """ -import re import shutil -from pathlib import Path from .globals import GLOBALS def copy_rtems(): """ - Copy RTEMS binaries to a location where the RTEMS IOC can access them + Copy RTEMS IOC binary and startup assets to a location where the RTEMS IOC + can access them IMPORTANT: local_root and nfs_root are different perspectives on the same folder. @@ -23,7 +22,6 @@ def copy_rtems(): will look for them using NFS. """ local_root = GLOBALS.RTEMS_TFTP_PATH - nfs_root = Path("/iocs") / GLOBALS.IOC_NAME # where to copy the Generic IOC folder to. This will contain the IOC binary # and the files @@ -55,9 +53,3 @@ def copy_rtems(): GLOBALS.IOC.readlink() / folder, dest_ioc / folder, dirs_exist_ok=True ) shutil.copytree(GLOBALS.RUNTIME, dest_runtime, dirs_exist_ok=True) - - # because we moved the ioc files we need to fix up startup script paths - startup = dest_runtime / "st.cmd" - cmd_txt = startup.read_text() - cmd_txt = re.sub("/epics/", f"{str(nfs_root)}/", cmd_txt) - startup.write_text(cmd_txt) diff --git a/src/rtems_proxy/rsync.sh.jinja b/src/rtems_proxy/rsync.sh.jinja index 1b8cbb4..9fa738a 100644 --- a/src/rtems_proxy/rsync.sh.jinja +++ b/src/rtems_proxy/rsync.sh.jinja @@ -48,7 +48,7 @@ while true; do for i in 1 2 3 ; do # repeat because inotify fires on the first change of several # don't copy the huge ioc binary file with symbols - rsync -rim --exclude bin/RTEMS-beatnik/ioc --delete /$RTEMS_TFTP_PATH/ \ + rsync -rim --delete /$RTEMS_TFTP_PATH/ \ "rsync://$RTEMS_TFTP_IP:12002/files/$IOC_NAME/" sleep 1 done diff --git a/src/rtems_proxy/telnet.py b/src/rtems_proxy/telnet.py index 380f455..8d84a27 100644 --- a/src/rtems_proxy/telnet.py +++ b/src/rtems_proxy/telnet.py @@ -36,8 +36,9 @@ class TelnetRTEMS: IOC_CHECK = "\ntaskwdShow" IOC_RESPONSE = "free nodes" NO_CONNECTION = "Connection closed by foreign host" + FAIL_STRINGS = ["Exception", "RTEMS_FATAL_SOURCE_EXCEPTION"] - def __init__(self, host_and_port: str, ioc_reboot: bool): + def __init__(self, host_and_port: str, ioc_reboot: bool = False): self._hostname, self._port = host_and_port.split(":") self._ioc_reboot = ioc_reboot self._child = None @@ -91,6 +92,7 @@ def check_prompt(self, retries=5) -> RtemsState: while retries > 0: try: # see if we are in the IOC shell + sleep(0.5) self._child.sendline(self.IOC_CHECK) self._child.expect(self.IOC_RESPONSE, timeout=1) except pexpect.exceptions.TIMEOUT: @@ -136,6 +138,12 @@ def reboot(self, into: RtemsState): # send space to boot the IOC self._child.send(" ") + def wait_epics_prompt(self, timeout=50): + expects = self.FAIL_STRINGS + [self.IOC_STARTED] + index = self._child.expect(expects, timeout=timeout) + if index < len(self.FAIL_STRINGS) - 1: + raise RuntimeError(f"IOC boot failed - output included {expects[index]}") + def get_epics_prompt(self): """ Get to the IOC shell prompt, if the IOC is not already running, reboot @@ -149,12 +157,12 @@ def get_epics_prompt(self): sleep(0.2) self.reboot(RtemsState.IOC) self.ioc_rebooted = True - self._child.expect(self.IOC_STARTED, timeout=50) + self.wait_epics_prompt() else: if self._ioc_reboot and not self.ioc_rebooted: self.ioc_rebooted = True self.reboot(RtemsState.IOC) - self._child.expect(self.IOC_STARTED, timeout=50) + self.wait_epics_prompt() def get_boot_prompt(self): """ @@ -171,6 +179,20 @@ def get_boot_prompt(self): report("press enter for bootloader prompt") + def sendline(self, command: str) -> None: + """ + Send a command to the telnet session + """ + assert self._child, "must call connect before send" + self._child.sendline(command) + + def expect(self, pattern, timeout=10) -> None: + """ + Expect a pattern in the telnet session + """ + assert self._child, "must call connect before expect" + self._child.expect(pattern, timeout=timeout) + def close(self): if self._child: self._child.close() @@ -187,7 +209,12 @@ def report(message): print(f"\n>>>> {message} <<<<\n") -def ioc_connect(host_and_port: str, reboot: bool = False): +def ioc_connect( + host_and_port: str, + reboot: bool = False, + attach: bool = True, + raise_errors: bool = False, +): """ Entrypoint to make a connection to an RTEMS IOC over telnet. Once connected, enters an interactive user session with the IOC. @@ -200,6 +227,11 @@ def ioc_connect(host_and_port: str, reboot: bool = False): try: telnet.connect() + + # this will untangle a partially executed gevEdit command + for _ in range(3): + telnet.sendline("\r") + if reboot: telnet.get_epics_prompt() else: @@ -207,7 +239,28 @@ def ioc_connect(host_and_port: str, reboot: bool = False): except (CannotConnect, pexpect.exceptions.TIMEOUT): report("Connection failed. Exiting") telnet.close() + if raise_errors: + raise else: telnet.close() - report("Connecting to IOC console, hit enter for a prompt") - run_command(telnet.command) + if attach: + report("Connecting to IOC console, hit enter for a prompt") + run_command(telnet.command) + + +def motboot_connect(host_and_port: str) -> TelnetRTEMS: + """ + Connect to the MOTBoot bootloader prompt, rebooting if needed. + + Returns a TelnetRTEMS object that is connected to the MOTBoot bootloader + """ + telnet = TelnetRTEMS(host_and_port) + telnet.connect() + + # this will untangle a partially executed gevEdit command + for _ in range(3): + telnet.sendline("\r") + + telnet.get_boot_prompt() + + return telnet From f948069bbea503b6538577653f521790966be34c Mon Sep 17 00:00:00 2001 From: Giles Knap Date: Fri, 14 Feb 2025 08:48:41 +0000 Subject: [PATCH 3/4] add stack trace function --- src/rtems_proxy/__main__.py | 17 +++++++++++++++++ src/rtems_proxy/telnet.py | 6 +++--- src/rtems_proxy/trace.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 src/rtems_proxy/trace.py diff --git a/src/rtems_proxy/__main__.py b/src/rtems_proxy/__main__.py index 6b71079..4851272 100644 --- a/src/rtems_proxy/__main__.py +++ b/src/rtems_proxy/__main__.py @@ -5,6 +5,7 @@ from jinja2 import Template from ruamel.yaml import YAML +from rtems_proxy.trace import parse_stack_trace from rtems_proxy.utils import run_command from . import __version__ @@ -196,5 +197,21 @@ def stress(): raise RuntimeError(msg) from e +@cli.command() +def trace( + trace_file: Path = typer.Argument( + ..., + help="The path to the file containing the stack trace", + file_okay=True, + exists=True, + ), +): + """ + Parse a stack trace from a RTEMS failure + """ + trace = trace_file.read_text() + parse_stack_trace(trace) + + if __name__ == "__main__": cli() diff --git a/src/rtems_proxy/telnet.py b/src/rtems_proxy/telnet.py index 8d84a27..cddac46 100644 --- a/src/rtems_proxy/telnet.py +++ b/src/rtems_proxy/telnet.py @@ -36,7 +36,7 @@ class TelnetRTEMS: IOC_CHECK = "\ntaskwdShow" IOC_RESPONSE = "free nodes" NO_CONNECTION = "Connection closed by foreign host" - FAIL_STRINGS = ["Exception", "RTEMS_FATAL_SOURCE_EXCEPTION"] + FAIL_STRINGS = ["Exception", "exception", "RTEMS_FATAL_SOURCE_EXCEPTION"] def __init__(self, host_and_port: str, ioc_reboot: bool = False): self._hostname, self._port = host_and_port.split(":") @@ -141,8 +141,8 @@ def reboot(self, into: RtemsState): def wait_epics_prompt(self, timeout=50): expects = self.FAIL_STRINGS + [self.IOC_STARTED] index = self._child.expect(expects, timeout=timeout) - if index < len(self.FAIL_STRINGS) - 1: - raise RuntimeError(f"IOC boot failed - output included {expects[index]}") + if index != len(self.FAIL_STRINGS): + raise RuntimeError(f"IOC boot failed - output included '{expects[index]}'") def get_epics_prompt(self): """ diff --git a/src/rtems_proxy/trace.py b/src/rtems_proxy/trace.py new file mode 100644 index 0000000..b937c01 --- /dev/null +++ b/src/rtems_proxy/trace.py @@ -0,0 +1,35 @@ +""" +Some functions to interpret a stack trace from a RTEMS failure +""" + +import re + +from .globals import GLOBALS +from .utils import run_command + +IP = re.compile(r"Stack Trace:\n *IP: *(0x[0-9a-f]*)") +STACK = re.compile(r"--\^ (0x[0-9a-f]*)") +symbols = GLOBALS.IOC / "bin" / "RTEMS-beatnik" / "ioc" + + +def parse_stack_trace(trace: str): + """ + Parse a stack trace from a RTEMS failure + + Args: + trace (str): log containing a stack trace + """ + ip = IP.findall(trace) + addrs = STACK.findall(trace) + + print(f"IP: {ip[0]}\nStack {addrs}") + + if len(ip) == 0 or len(addrs) == 0: + raise ValueError("Could not find a stack trace in the log") + elif len(ip) > 1: + raise ValueError("Multiple stack traces in the log") + + addrs.reverse() + for addr in addrs: + run_command(f"rtems-addr2line {addr} -e {symbols}") + run_command(f"rtems-addr2line {ip[0]} -e {symbols}") From 20399d09b71a54f8c0626b096efdce4a7f3ac3be Mon Sep 17 00:00:00 2001 From: Giles Knap Date: Sat, 15 Feb 2025 12:35:49 +0000 Subject: [PATCH 4/4] improved error handling --- proxy-start.sh | 25 ------------------------- src/rtems_proxy/__main__.py | 9 ++++++++- src/rtems_proxy/telnet.py | 20 ++++++++++++++------ 3 files changed, 22 insertions(+), 32 deletions(-) delete mode 100755 proxy-start.sh diff --git a/proxy-start.sh b/proxy-start.sh deleted file mode 100755 index 9dbce57..0000000 --- a/proxy-start.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -x - -# This is the folder the PVC for the nfsv2tftp shared volume is mounted into. -export RTEMS_TFTP_PATH=${RTEMS_TFTP_PATH:-/nfsv2-tftp} - -if [ ! -d ${RTEMS_TFTP_PATH} ]; then - echo "ERROR: No PVC folder found." - # make a folder for testing outside of the cluster - mkdir -p ${RTEMS_TFTP_PATH} -fi - -# copy the IOC instance's runtime assets into the shared volume -cp -rL /epics/ioc ${RTEMS_TFTP_PATH} -cp -r /epics/runtime ${RTEMS_TFTP_PATH} -# move binary to the root for shorter paths -mv ${RTEMS_TFTP_PATH}/ioc/bin/*/ioc.boot ${RTEMS_TFTP_PATH} -# fix up the paths in st.cmd -sed -i "s|/epics/|/iocs/${IOC_LOCATION}/${IOC_NAME}/|" ${RTEMS_TFTP_PATH}/runtime/st.cmd - -# keep the container running ... -while true; do - sleep 2 -done diff --git a/src/rtems_proxy/__main__.py b/src/rtems_proxy/__main__.py index 4851272..ae87bd2 100644 --- a/src/rtems_proxy/__main__.py +++ b/src/rtems_proxy/__main__.py @@ -1,5 +1,6 @@ from datetime import datetime from pathlib import Path +from time import sleep import typer from jinja2 import Template @@ -51,6 +52,9 @@ def start( reboot: bool = typer.Option( True, "--reboot/--no-reboot", help="reboot the IOC first" ), + raise_errors: bool = typer.Option( + True, "--raise-errors/--no-raise-errors", help="raise errors instead of exiting" + ), ): """ Starts an RTEMS IOC. Places the IOC binaries in the expected location, @@ -75,7 +79,9 @@ def start( if copy: copy_rtems() if connect: - ioc_connect(GLOBALS.RTEMS_CONSOLE, reboot=reboot, attach=True) + ioc_connect( + GLOBALS.RTEMS_CONSOLE, reboot=reboot, attach=True, raise_errors=raise_errors + ) else: report("IOC console connection disabled. ") @@ -192,6 +198,7 @@ def stress(): ioc_connect( GLOBALS.RTEMS_CONSOLE, reboot=True, attach=False, raise_errors=True ) + sleep(5) except Exception as e: msg = f"\n\nIOC boot number {tries} failed at {datetime.now()}.\n\n" raise RuntimeError(msg) from e diff --git a/src/rtems_proxy/telnet.py b/src/rtems_proxy/telnet.py index cddac46..2166d46 100644 --- a/src/rtems_proxy/telnet.py +++ b/src/rtems_proxy/telnet.py @@ -236,16 +236,24 @@ def ioc_connect( telnet.get_epics_prompt() else: report("Auto reboot disabled. Skipping reboot") + except (CannotConnect, pexpect.exceptions.TIMEOUT): - report("Connection failed. Exiting") + report("Connection failed, Exiting.") + telnet.close() + raise + + except Exception as e: + # still show the remaining output + telnet.expect("_main_") + report(f"An error occurred: {e}") telnet.close() if raise_errors: raise - else: - telnet.close() - if attach: - report("Connecting to IOC console, hit enter for a prompt") - run_command(telnet.command) + + telnet.close() + if attach: + report("Connecting to IOC console, hit enter for a prompt") + run_command(telnet.command) def motboot_connect(host_and_port: str) -> TelnetRTEMS: