Skip to content

Commit

Permalink
Merge pull request #232 from eugeneia/watchdog
Browse files Browse the repository at this point in the history
Added watchdog module.
  • Loading branch information
lukego committed Aug 8, 2014
2 parents 16e8cba + 6fb8c92 commit 6b0feea
Show file tree
Hide file tree
Showing 6 changed files with 214 additions and 2 deletions.
26 changes: 24 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ TESTMODS = $(shell find . -regex '[^\#]*\.lua' -printf '%P ' | \
xargs grep -l '^function selftest *[[:punct:]]' | \
sed -e 's_\.lua__' -e 's_/_._g')

# TESTSCRIPTS expands to:
# lib/watchdog/selftest.sh ...
# for each executable selftest.sh script in src.
TESTSCRIPTS = $(shell find . -name "selftest.sh" -executable | xargs)

PATH := ../deps/luajit/usr/local/bin:$(PATH)

all: snabb
Expand All @@ -46,11 +51,11 @@ snabb: $(LUAOBJ) $(HOBJ) $(COBJ) $(ASMOBJ)
@ln -fs snabb snabbswitch
@ls -sh snabb

test: $(TESTMODS)
test: $(TESTMODS) $(TESTSCRIPTS)

test_ci: FAIL_ON_FIRST="true"

test_ci: $(TESTMODS)
test_ci: $(TESTMODS) $(TESTSCRIPTS)

$(TESTMODS): testlog snabb
$(E) "TEST $@"
Expand All @@ -68,6 +73,23 @@ $(TESTMODS): testlog snabb
) \
)

# Log file for the target currently being built: the target path with every
# "/" replaced by ".", placed in the testlog directory. Defined with "=" so
# that $(@) is expanded when the variable is used inside a recipe.
testlog = testlog/$(shell echo "$(@)" | sed -e 's_/_._g')
# Run a selftest script with its standard output redirected to $(testlog).
# On a non-zero exit status the status is appended to the log; a status
# equal to $(TEST_SKIPPED) is reported as SKIPPED, any other status as
# ERROR. In the ERROR case the recipe only fails (with the script's exit
# status) when FAIL_ON_FIRST is non-empty.
# NOTE(review): $(E), $(Q) and $(TEST_SKIPPED) are defined outside this
# excerpt -- presumably echo/quiet prefixes and the "skipped" exit status.
$(TESTSCRIPTS): testlog snabb
$(E) "TEST $@"
$(Q) ./$@ > $(testlog) || ( \
EXITCODE="$$?"; \
[ "$$EXITCODE" -eq $(TEST_SKIPPED) ] \
&& ( \
echo "SKIPPED $(testlog)"; \
echo "EXITCODE: $$EXITCODE" >> $(testlog); \
) \
|| ( \
echo "ERROR $(testlog)"; \
echo "EXITCODE: $$EXITCODE" >> $(testlog); \
if [ -n "$(FAIL_ON_FIRST)" ]; then exit $$EXITCODE; fi;\
) \
)

$(OBJDIR) testlog:
$(E) "DIR $@"
$(Q) mkdir -p $@
Expand Down
9 changes: 9 additions & 0 deletions src/core/clib.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
// sleep(3) - suspend execution for second intervals
unsigned int sleep(unsigned int seconds);

// usleep(3) - suspend execution for microsecond intervals
// The POSIX parameter type is useconds_t, which is `unsigned int' on
// glibc and musl; it is spelled out because this file declares no typedefs.
int usleep(unsigned int usec);

// alarm(3) - schedule signal after given number of seconds
unsigned int alarm(unsigned int seconds);

// ualarm(3) - schedule signal after given number of microseconds
// Parameters and return value are useconds_t (`unsigned int', see usleep).
unsigned int ualarm(unsigned int usecs, unsigned int interval);

// memcpy(3) - copy memory area
void memcpy(void *dest, const void *src, size_t n);

Expand Down
34 changes: 34 additions & 0 deletions src/lib/watchdog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
The `lib.watchdog.watchdog` module implements per-thread watchdog
functionality. Its purpose is to watch and kill processes which fail to
call the watchdog periodically (e.g. because they hang).

It does so by using `alarm(3)` and `ualarm(3)` to have the OS send a
`SIGALRM` to the process after a specified timeout. Because the process
does not handle the signal it will be killed and exit with status `142`.

Usage is as follows:

-- Use the watchdog module.
watchdog = require("lib.watchdog.watchdog")

`set(n)` sets the watchdog timeout to `n` milliseconds. Because
`alarm(3)` is used for timeouts longer than one second, values of `n`
greater than 1000 (i.e. more than one second) are rounded up to the next
full second (e.g. `set(1100)` <=> `set(2000)`).

-- Set the timeout to 500ms.
watchdog.set(500)

`reset()` will reset the alarm (or start it if it has not been started
before). Thus now you have 500 milliseconds to reset before the process
will be killed.

-- Start or reset the timeout.
watchdog.reset()

Alternatively you can use `stop()` to disable the timeout and prevent the
process from being killed.

-- Disable the timeout.
watchdog.stop()

19 changes: 19 additions & 0 deletions src/lib/watchdog/selftest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/sh

# Run one test case of selftest_design and abort the whole script with
# status 1 unless snabb exits with the expected status.
#   $1 - test case name
#   $2 - expected exit status (142 is what the timeout cases expect when
#        the watchdog kills the process)
expect_status () {
    ./snabb ./lib/watchdog/selftest_design "$1"
    if [ $? != "$2" ]; then exit 1; fi
}

# Second-precision test cases.
expect_status alert 0
expect_status alert_stop 0
expect_status alert_timeout 142

# Microsecond-precision test cases.
expect_status ualert 0
expect_status ualert_stop 0
expect_status ualert_timeout 142
84 changes: 84 additions & 0 deletions src/lib/watchdog/selftest_design
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env snabb

local ffi = require("ffi")
local C = ffi.C
local watchdog = require("lib.watchdog.watchdog")

-- Table of test cases, keyed by test case name (looked up by `run' below).
test = {}

-- Test case: sub-second watchdog that is reset in time. The timeout is
-- 400ms; the watchdog is reset after 300ms and the function returns 300ms
-- after that, so neither interval exceeds the timeout.
function test.ualert ()
   local nap = 300000 -- microseconds, shorter than the 400ms timeout

   watchdog.set(400)
   watchdog.reset()
   print("Set timeout, now sleeping...")
   C.usleep(nap)

   print("Resetting watchdog.")
   watchdog.reset()
   C.usleep(nap)

   print("Exit normally.")
end

-- Test case: sub-second watchdog that is stopped before it expires. The
-- timeout is 400ms; the watchdog is stopped after 300ms and the function
-- then sleeps another 300ms, i.e. past the point where the original
-- timeout would have expired.
function test.ualert_stop ()
   local nap = 300000 -- microseconds, shorter than the 400ms timeout

   watchdog.set(400)
   watchdog.reset()
   print("Set timeout, now sleeping...")
   C.usleep(nap)

   print("Stopping watchdog.")
   watchdog.stop()
   C.usleep(nap)

   print("Exit normally.")
end

-- Test case: sub-second watchdog that is never reset. The timeout is
-- 400ms and the function sleeps for 800ms, so the process is expected to
-- be killed by SIGALRM during the sleep. The final message is only
-- printed if that did not happen.
test["ualert_timeout"] = function ()
   watchdog.set(400)
   watchdog.reset()
   print("Set timeout, now sleeping until watchdog times out.")

   C.usleep(800000)
   -- alarm(3)/ualarm(3) deliver SIGALRM, not SIGABRT.
   print("Error: SIGALRM not received.")
end

-- Test case: second-precision watchdog that is reset in time. The timeout
-- is 2000ms; the watchdog is reset after one second and the function
-- returns one second after that, so neither interval exceeds the timeout.
function test.alert ()
   local nap = 1 -- seconds, shorter than the 2000ms timeout

   watchdog.set(2000)
   watchdog.reset()
   print("Set timeout, now sleeping...")
   C.sleep(nap)

   print("Resetting watchdog.")
   watchdog.reset()
   C.sleep(nap)

   print("Exit normally.")
end

-- Test case: second-precision watchdog that is stopped before it expires.
-- The timeout is 2000ms; the watchdog is stopped after one second and the
-- function then sleeps for another second.
function test.alert_stop ()
   local nap = 1 -- seconds, shorter than the 2000ms timeout

   watchdog.set(2000)
   watchdog.reset()
   print("Set timeout, now sleeping...")
   C.sleep(nap)

   print("Stopping watchdog.")
   watchdog.stop()
   C.sleep(nap)

   print("Exit normally.")
end

-- Test case: second-precision watchdog that is never reset. The timeout
-- is 2000ms and the function sleeps for twice as long, so the process is
-- expected to be killed by SIGALRM during the sleep. The final message is
-- only printed if that did not happen.
test["alert_timeout"] = function ()
   watchdog.set(2000)
   watchdog.reset()
   print("Set timeout, now sleeping until watchdog times out.")

   -- sleep(3) takes seconds: sleep for 4s (twice the timeout) instead of
   -- the microsecond-style value 800000, which would block for days if
   -- the watchdog ever failed to fire.
   C.sleep(4)
   -- alarm(3)/ualarm(3) deliver SIGALRM, not SIGABRT.
   print("Error: SIGALRM not received.")
end

-- Run the test case named `testcase' from the `test' table.
-- Raises an error naming the offending value if `testcase' is missing or
-- does not name a test case, instead of failing with an unrelated
-- "attempt to call a nil value" / concatenation error.
function run (testcase)
   local case = test[testcase]
   if type(case) ~= "function" then
      error("Unknown test case: "..tostring(testcase))
   end
   print("[testing "..testcase.."]")
   case()
end

-- Run the test case selected by the caller; `run' uses the first value of
-- `main.parameters' as the test case name.
-- NOTE(review): presumably `main.parameters' holds the command line
-- arguments following the script path (e.g. "alert" when invoked from
-- selftest.sh) -- confirm.
run(unpack(main.parameters))
44 changes: 44 additions & 0 deletions src/lib/watchdog/watchdog.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- Watchdog module: arms alarm(3)/ualarm(3) timers through the FFI. See
-- `set', `reset' and `stop' below.
module(...,package.seeall)

ffi = require("ffi")
C = ffi.C

-- Watchdog timeout in unit defined by `precision' (just below). Remains
-- nil until `set' has been called.
timeout = nil

-- Watchdog precision: "second" or "microsecond". Remains nil until `set'
-- has been called.
precision = nil

-- Set watchdog timeout to mseconds (milliseconds). Does NOT start the
-- watchdog; call reset() to arm it.
--
-- Timeouts below one second are stored in microseconds and armed with
-- ualarm(3). Timeouts of one second or more are stored in seconds, armed
-- with alarm(3), and therefore rounded up to the next full second, e.g.
-- set(1100) <=> set(2000). Exactly 1000ms also uses alarm(3) because
-- ualarm(3) is not required to accept 1000000 microseconds or more.
function set (mseconds)
   if mseconds >= 1000 then
      timeout = math.ceil(mseconds / 1000)
      precision = "second"
   else
      timeout = mseconds * 1000
      precision = "microsecond"
   end
end

-- Arm the watchdog with the timeout configured by set(). Calling it again
-- replaces the pending timeout with a fresh one. Raises an error if no
-- timeout has been configured yet.
function reset ()
   if precision == "microsecond" then
      -- One-shot timer: interval 0 means no repetition.
      C.ualarm(timeout, 0)
      return
   end
   if precision == "second" then
      C.alarm(timeout)
      return
   end
   error("Watchdog was not set.")
end

-- Disarm the watchdog by requesting a zero timeout from the same timer
-- call that reset() uses for the current precision. Does nothing if no
-- timeout has been configured with set().
function stop ()
   local unit = precision
   if unit ~= "second" and unit ~= "microsecond" then
      return
   end
   if unit == "second" then
      C.alarm(0)
   else
      C.ualarm(0,0)
   end
end

0 comments on commit 6b0feea

Please sign in to comment.