From 455f5f2f69436800faacd557053ebca03cb6d9e8 Mon Sep 17 00:00:00 2001 From: Frank Hunleth Date: Thu, 3 Nov 2022 10:35:47 -0700 Subject: [PATCH] Add support for an initialization handshake When enabled, Nerves Heart will expect an initialization handshake message in addition to the regular heart beat messages. This makes it possible to call `:heart.set_callback/1` without worrying about an error that causes the code that calls `:heart.set_callback/1` somehow not getting run. --- README.md | 50 ++++++++++++++++ src/heart.c | 46 ++++++++++++++- tests/heart_test/lib/heart.ex | 25 ++++++-- tests/heart_test/test/heart_test.exs | 86 +++++++++++++++++++++++----- 4 files changed, 188 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 830a0b2..aaf9188 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,9 @@ implementation and provides the following changes: Erlang is unresponsive. The reboot call is not configurable nor is it necessary to invoke another program. 5. Trigger watchdog-protected reboots and shutdowns +6. Support for an application-level initialization handshake. This lets your + application guard against conditions that cause Erlang and Nerves Heart to + think that everything is ok when it's not. See discussion below. ## Timeouts and semantics @@ -115,6 +118,47 @@ matches your device. You may need to add a section to your device tree to configure the driver. For example, if using an external watchdog and petting it via a GPIO, you will need to specify which GPIO in the device tree. +## Application-level initialization handshake + +Erlang's `heart` module has a very useful feature of letting you specify a +callback to extend the heart beat protection beyond the VM. See +[:heart.set_callback/2](https://www.erlang.org/doc/man/heart.html#set_callback-2). +To use it, you specify a callback function that returns `:ok` if your +application seems is ok. If it's not ok, then Erlang won't send a heart beat +message to the heart process and the device will eventually reboot. + +The problem with this mechanism is that if something causes the code that +registers the callback to not be run, the device could continue to run without +knowing that its in a bad state. As an aside, a non-Nerves way of handling this +would be to exit the VM with an error message. On devices, running in a degraded +state can be useful to keep important services going and allowing time for +remote troubleshooting. + +Here's the flow for using the application-level initialization handshake: + +1. Set the `HEART_INIT_TIMEOUT` environment variable to the number of seconds to + wait for the initializing handshake. +2. Call `:heart.set_callback/2` in your program +3. Run `:heart.setcmd('init_done')` + +To set `HEART_INIT_TIMEOUT`, edit the `rel/vm.args.eex` and update the heart +section to look like this: + +```sh +## Enable heartbeat monitoring of the Erlang runtime system +-heart -env HEART_BEAT_TIMEOUT 30 + +## Require an initialization handshake within 15 minutes +-heart -env HEART_INIT_TIMEOUT 900 +``` + +With this configuration, Nerves Heart will operate as normal with petting the +hardware watchdog and expecting Erlang heart beat messages up. If `init_done` is +passed, Nerves Heart will continue working as normal. However, if `init_done` is +not sent at the 15 minute mark, Nerves Heart will pet the hardware WDT one last +time and exit. This will cause the Erlang VM to exit and if the VM decides to +not be happy about this, the hardware WDT will reset. + ## Runtime diagnostic information Both `nerves_heart` and the Linux watchdog subsystem can return useful @@ -143,6 +187,9 @@ iex> Nerves.Runtime.Heart.status! program_version: %Version{major: 2, minor: 0, patch: 0}, heartbeat_timeout: 30, heartbeat_time_left: 53, + init_handshake_happened: true, + init_handshake_timeout: 900, + init_handshake_time_left: 0, wdt_identity: "OMAP Watchdog", wdt_firmware_version: 0, wdt_last_boot: :power_on, @@ -162,6 +209,9 @@ The following table describes keys and their values: | `:program_version` | Nerves heart's version number | | `:heartbeat_timeout` | Erlang's heartbeat timeout setting. Note that the hardware watchdog timeout supersedes this since it reboots. | | `:heartbeat_time_left` | The amount of time left for Erlang to send a heartbeat message before heart times out. | +| `:init_handshake_happened` | `true` if the initialization handshake happened or isn't enabled | +| `:init_handshake_timeout` | The time to wait for the handshake message before timing out | +| `:init_handshake_time_left` | If waiting for an initialization handshake, this is the number of seconds left. | | `:wdt_identity` | The hardware watchdog that's being used | | `:wdt_firmware_version` | An integer that represents the hardware watchdog's firmware revision | | `:wdt_last_boot` | What caused the most recent boot. Whether this is reliable depends on the watchdog. | diff --git a/src/heart.c b/src/heart.c index 3d6cd07..e229991 100644 --- a/src/heart.c +++ b/src/heart.c @@ -110,6 +110,7 @@ # define FD_ZERO(FD_SET_PTR) memset(FD_SET_PTR, 0, sizeof(fd_set)) #endif +#define HEART_INIT_TIMEOUT_ENV "HEART_INIT_TIMEOUT" #define ERL_CRASH_DUMP_SECONDS_ENV "ERL_CRASH_DUMP_SECONDS" #define HEART_KILL_SIGNAL "HEART_KILL_SIGNAL" #define HEART_WATCHDOG_PATH "HEART_WATCHDOG_PATH" @@ -160,6 +161,15 @@ static int wdt_timeout = 60; /* last_wdt_pet_time is the absolute time that hardware watchdog was pet */ static time_t last_wdt_pet_time = 0; +/* Timeout on receiving a handshake message from the application that the heart callback was set. 0=unused */ +static time_t init_handshake_timeout = 0; + +/* Set to 1 if the initialization handshake message came in */ +static int init_handshake_happened = 0; + +/* If !init_handshake_happened, then this is the end time */ +static time_t init_handshake_end_time = 0; + /* All current platforms have a process identifier that fits in an unsigned long and where 0 is an impossible or invalid value */ static pid_t heart_beat_kill_pid = 0; @@ -348,6 +358,15 @@ int main(int argc, char **argv) print_log("heart: " PROGRAM_NAME " v" PROGRAM_VERSION_STR " started."); + // Assume that the handshake happened and this fixes it if a timeout was specified + init_handshake_happened = 1; + if (is_env_set(HEART_INIT_TIMEOUT_ENV)) { + const char *init_tmo_env = get_env(HEART_INIT_TIMEOUT_ENV); + init_handshake_timeout = atoi(init_tmo_env); + if (init_handshake_timeout > 0) + init_handshake_happened = 0; + } + get_arguments(argc, argv); notify_ack(); @@ -375,6 +394,7 @@ static int message_loop() /* Pet the hw watchdog on start */ pet_watchdog(); now = last_heart_beat_time = last_wdt_pet_time = timestamp_seconds(); + init_handshake_end_time = now + init_handshake_timeout; max_fd = STDIN_FILENO; @@ -384,6 +404,9 @@ static int message_loop() timeout.tv_sec = max(1, min(last_heart_beat_time + heart_beat_timeout - now, last_wdt_pet_time + wdt_pet_timeout - now)); timeout.tv_usec = 0; + if (!init_handshake_happened) + timeout.tv_sec = min(timeout.tv_sec, init_handshake_end_time - now); + if ((i = select(max_fd + 1, &read_fds, NULLFDS, NULLFDS, &timeout)) < 0) { print_log("heart: select failed: %s", strerror(errno)); return R_ERROR; @@ -395,6 +418,13 @@ static int message_loop() (unsigned long) (now - last_heart_beat_time)); return R_TIMEOUT; } + + if (!init_handshake_happened && now >= init_handshake_end_time) { + print_log("heart: init handshake never happened -> not received in %lu seconds", + (unsigned long) init_handshake_timeout); + return R_TIMEOUT; + } + /* * Do not check fd-bits if select timeout */ @@ -450,6 +480,9 @@ static int message_loop() print_log("heart: poweroff signaled. No longer petting the WDT"); sync(); + } else if (mp_len == 15 && memcmp(m.fill, "init_handshake", 14) == 0) { + /* Application has said that it's completed initialization */ + init_handshake_happened = 1; } notify_ack(); break; @@ -729,12 +762,23 @@ static int heart_cmd_info_reply(time_t now) int heartbeat_time_left = last_heart_beat_time + heart_beat_timeout - now; int wdt_pet_time_left = last_wdt_pet_time + wdt_pet_timeout - now; + int init_handshake_time_left = init_handshake_end_time - now; + if (init_handshake_happened || init_handshake_time_left < 0) + init_handshake_time_left = 0; /* The reply format is: * = NEWLINE * ... */ - p += sprintf(p, "program_name=" PROGRAM_NAME "\nprogram_version=" PROGRAM_VERSION_STR "\nheartbeat_timeout=%d\nheartbeat_time_left=%d\nwdt_pet_time_left=%d\n", heart_beat_timeout, heartbeat_time_left, wdt_pet_time_left); + p += sprintf(p, "program_name=" PROGRAM_NAME "\nprogram_version=" PROGRAM_VERSION_STR "\n" + "heartbeat_timeout=%d\n" + "heartbeat_time_left=%d\n" + "wdt_pet_time_left=%d\n" + "init_handshake_happened=%d\n" + "init_handshake_timeout=%d\n" + "init_handshake_time_left=%d\n", + heart_beat_timeout, heartbeat_time_left, wdt_pet_time_left, + init_handshake_happened, (int) init_handshake_timeout, init_handshake_time_left); ret = ioctl(watchdog_fd, WDIOC_GETSUPPORT, &info); if (ret == 0) { diff --git a/tests/heart_test/lib/heart.ex b/tests/heart_test/lib/heart.ex index 1930c03..c8d9714 100644 --- a/tests/heart_test/lib/heart.ex +++ b/tests/heart_test/lib/heart.ex @@ -83,6 +83,8 @@ defmodule Heart do open_tries = init_args[:open_tries] || 0 watchdog_path = init_args[:watchdog_path] wdt_timeout = init_args[:wdt_timeout] || 120 + crash_dump_seconds = init_args[:crash_dump_seconds] + init_timeout = init_args[:init_timeout] watchdog_path_env = if watchdog_path do @@ -98,8 +100,6 @@ defmodule Heart do [] end - crash_dump_seconds = init_args[:crash_dump_seconds] - crash_dump_seconds_env = if crash_dump_seconds do [{~c"ERL_CRASH_DUMP_SECONDS", ~c"#{crash_dump_seconds}"}] @@ -107,6 +107,13 @@ defmodule Heart do [] end + init_timeout_env = + if init_timeout do + [{~c"HEART_INIT_TIMEOUT", ~c"#{init_timeout}"}] + else + [] + end + File.exists?(shim) || raise "Can't find heart_fixture.so" File.exists?(heart) || raise "Can't find heart" File.exists?(tmp_dir) || raise "Can't find #{inspect(tmp_dir)}" @@ -127,7 +134,7 @@ defmodule Heart do {~c"DYLD_INSERT_LIBRARIES", c_shim}, {~c"HEART_REPORT_PATH", to_charlist(reports)}, {~c"HEART_WATCHDOG_OPEN_TRIES", to_charlist(open_tries)} - ] ++ watchdog_path_env ++ crash_dump_seconds_env ++ wdt_timeout_env}, + ] ++ watchdog_path_env ++ crash_dump_seconds_env ++ wdt_timeout_env ++ init_timeout_env}, :exit_status ] ) @@ -217,7 +224,17 @@ defmodule Heart do end defp decode_response(<<@heart_ack>>), do: :heart_ack - defp decode_response(<<@heart_cmd, data::binary>>), do: {:heart_cmd, data} + + defp decode_response(<<@heart_cmd, data::binary>>) do + stats = + data + |> String.split("\n", trim: true) + |> Enum.map(&String.split(&1, "=", parts: 2)) + |> Enum.map(fn [x, y] -> {x, y} end) + |> Map.new() + + {:heart_cmd, stats} + end defp open_backend_socket(socket_path) do # Blindly try to remove an old file just in case it exists from a previous run diff --git a/tests/heart_test/test/heart_test.exs b/tests/heart_test/test/heart_test.exs index d9ff1c5..76976c4 100644 --- a/tests/heart_test/test/heart_test.exs +++ b/tests/heart_test/test/heart_test.exs @@ -114,20 +114,23 @@ defmodule HeartTestTest do {:ok, {:heart_cmd, cmd}} = Heart.get_cmd(heart) - assert cmd == """ - program_name=nerves_heart - program_version=2.0.0 - heartbeat_timeout=60 - heartbeat_time_left=60 - wdt_pet_time_left=110 - wdt_identity=OMAP Watchdog - wdt_firmware_version=0 - wdt_options=settimeout,magicclose,keepaliveping, - wdt_time_left=116 - wdt_pre_timeout=0 - wdt_timeout=120 - wdt_last_boot=power_on - """ + assert cmd == %{ + "heartbeat_time_left" => "60", + "heartbeat_timeout" => "60", + "init_handshake_happened" => "1", + "init_handshake_time_left" => "0", + "init_handshake_timeout" => "0", + "program_name" => "nerves_heart", + "program_version" => "2.0.0", + "wdt_firmware_version" => "0", + "wdt_identity" => "OMAP Watchdog", + "wdt_last_boot" => "power_on", + "wdt_options" => "settimeout,magicclose,keepaliveping,", + "wdt_pet_time_left" => "110", + "wdt_pre_timeout" => "0", + "wdt_time_left" => "116", + "wdt_timeout" => "120" + } graceful_shutdown(heart) end @@ -276,4 +279,59 @@ defmodule HeartTestTest do Heart.shutdown(heart) assert Heart.next_event(heart) == {:exit, 0} end + + test "application init handshake times out if not handshaked", context do + heart = start_supervised!({Heart, tmp_dir: context.tmp_dir, init_timeout: 5}) + assert Heart.next_event(heart) == {:heart, :heart_ack} + assert Heart.next_event(heart) == {:event, "open(/dev/watchdog0) succeeded"} + assert Heart.next_event(heart) == {:event, "pet(1)"} + + {:ok, {:heart_cmd, cmd}} = Heart.get_cmd(heart) + + assert cmd["init_handshake_happened"] == "0" + assert cmd["init_handshake_timeout"] == "5" + assert cmd["init_handshake_time_left"] == "5" + + Process.sleep(1000) + + # Check that the time left changed. Don't be too picky here since there are + # some random CI/local delays which sometimes add more delay. + {:ok, {:heart_cmd, cmd}} = Heart.get_cmd(heart) + assert cmd["init_handshake_time_left"] == "4" or cmd["init_handshake_time_left"] == "3" + + # No messages for 3 seconds + assert Heart.next_event(heart, 3000) == :timeout + + # Capture reboot in about 1 second + assert Heart.next_event(heart, 1500) == {:event, "sync()"} + assert Heart.next_event(heart) == {:event, "reboot(0x01234567)"} + assert Heart.next_event(heart) == {:exit, 0} + end + + test "application init handshake works", context do + heart = start_supervised!({Heart, tmp_dir: context.tmp_dir, init_timeout: 5}) + assert Heart.next_event(heart) == {:heart, :heart_ack} + assert Heart.next_event(heart) == {:event, "open(/dev/watchdog0) succeeded"} + assert Heart.next_event(heart) == {:event, "pet(1)"} + + {:ok, {:heart_cmd, cmd}} = Heart.get_cmd(heart) + assert cmd["init_handshake_happened"] == "0" + assert cmd["init_handshake_timeout"] == "5" + assert cmd["init_handshake_time_left"] == "5" + + assert {:ok, :heart_ack} == Heart.set_cmd(heart, "init_handshake") + + {:ok, {:heart_cmd, cmd}} = Heart.get_cmd(heart) + assert cmd["init_handshake_happened"] == "1" + assert cmd["init_handshake_timeout"] == "5" + assert cmd["init_handshake_time_left"] == "0" + + # Init timer shouldn't expire + Process.sleep(5500) + + # No messages means everything is ok. + assert Heart.next_event(heart, 500) == :timeout + + graceful_shutdown(heart) + end end