Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(healthcheck) delayed_clear function #88

Merged
merged 1 commit into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 78 additions & 16 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ local ngx_log = ngx.log
local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local table_insert = table.insert
local table_remove = table.remove
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
Expand Down Expand Up @@ -281,20 +282,29 @@ end
function checker:add_target(ip, port, hostname, is_healthy, hostheader)
ip = tostring(assert(ip, "no ip address provided"))
port = assert(tonumber(port), "no port number provided")
hostname = hostname or ip
if is_healthy == nil then
is_healthy = true
end

local internal_health = is_healthy and "healthy" or "unhealthy"

local ok, err = locking_target_list(self, function(target_list)
local found = false

-- check whether we already have this target
for _, target in ipairs(target_list) do
if target.ip == ip and target.port == port and target.hostname == hostname then
self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip,
":", port, " (ignoring)")
return false
if target.ip == ip and target.port == port and target.hostname == (hostname) then
if target.purge_time == nil then
self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip,
":", port, " (ignoring)")
return
end
target.purge_time = nil
found = true
internal_health = self:get_target_status(ip, port, hostname) and
"healthy" or "unhealthy"
break
end
end

Expand All @@ -308,12 +318,14 @@ function checker:add_target(ip, port, hostname, is_healthy, hostheader)
end

-- target does not exist, go add it
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
if not found then
target_list[#target_list + 1] = {
ip = ip,
port = port,
hostname = hostname,
hostheader = hostheader,
}
end
target_list = serialize(target_list)

ok, err = self.shm:set(self.TARGET_LIST, target_list)
Expand Down Expand Up @@ -433,6 +445,28 @@ function checker:clear()
end


function checker:delayed_clear(delay)
assert(tonumber(delay), "no delay provided")

return locking_target_list(self, function(target_list)
local purge_time = ngx_now() + delay

-- add purge time to all targets
for _, target in ipairs(target_list) do
target.purge_time = purge_time
end

target_list = serialize(target_list)
local ok, err = self.shm:set(self.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

return true
end)
end


--- Get the current status of the target.
-- @param ip IP address of the target being checked.
-- @param port the port being checked against.
Expand Down Expand Up @@ -1071,7 +1105,7 @@ function checker:event_handler(event_name, ip, port, hostname)
end
end
self:log(DEBUG, "event: target '", hostname or "", " (", ip, ":", port,
"' removed")
")' removed")

else
self:log(WARN, "event: trying to remove an unknown target '",
Expand All @@ -1085,10 +1119,10 @@ function checker:event_handler(event_name, ip, port, hostname)
then
if not target_found then
-- it is a new target, must add it first
target_found = { ip = ip, port = port, hostname = hostname }
target_found = { ip = ip, port = port, hostname = hostname or ip }
self.targets[target_found.ip] = self.targets[target_found.ip] or {}
self.targets[target_found.ip][target_found.port] = self.targets[target_found.ip][target_found.port] or {}
self.targets[target_found.ip][target_found.port][target_found.hostname or ip] = target_found
self.targets[target_found.ip][target_found.port][target_found.hostname] = target_found
self.targets[#self.targets + 1] = target_found
self:log(DEBUG, "event: target added '", hostname or "", "(", ip, ":", port, ")'")
end
Expand Down Expand Up @@ -1451,9 +1485,8 @@ function _M.new(opts)
return nil, err
end

-- if active checker is needed and not running, start it
if (self.checks.active.healthy.active or self.checks.active.unhealthy.active)
and active_check_timer == nil then
-- if active checker is not running, start it
if active_check_timer == nil then

self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
"starting active check timer")
Expand All @@ -1475,6 +1508,35 @@ function _M.new(opts)

local cur_time = ngx_now()
for _, checker_obj in ipairs(hcs) do
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
local index = 1
while index <= #target_list do
local target = target_list[index]
if target.purge_time and target.purge_time <= cur_time then
table_insert(removed_targets, target)
table_remove(target_list, index)
else
index = index + 1
end
end

if #removed_targets > 0 then
target_list = serialize(target_list)

local ok, err = shm:set(checker_obj.TARGET_LIST, target_list)
if not ok then
return nil, "failed to store target_list in shm: " .. err
end

for _, target in ipairs(removed_targets) do
clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname)
checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname)
end
end
end)

if checker_obj.checks.active.healthy.active and
(checker_obj.checks.active.healthy.last_run +
checker_obj.checks.active.healthy.interval <= cur_time)
Expand Down
48 changes: 24 additions & 24 deletions t/09-active_probes.t
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -123,10 +123,10 @@ GET /t
true
--- error_log
checking healthy targets: nothing to do
healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true'
checking unhealthy targets: nothing to do

=== TEST 3: active probes, custom http status (regression test for pre-filled defaults)
Expand Down Expand Up @@ -179,10 +179,10 @@ true
checking unhealthy targets: nothing to do
--- no_error_log
checking healthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'


=== TEST 4: active probes, custom http status, node failing
Expand Down Expand Up @@ -234,10 +234,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -340,10 +340,10 @@ GET /t
false
--- error_log
checking unhealthy targets: nothing to do
unhealthy TCP increment (1/3) for '(127.0.0.1:2114)'
unhealthy TCP increment (2/3) for '(127.0.0.1:2114)'
unhealthy TCP increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'true' to 'false'
unhealthy TCP increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy TCP increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
unhealthy TCP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false'
checking healthy targets: nothing to do


Expand Down Expand Up @@ -396,10 +396,10 @@ GET /t
true
--- error_log
checking healthy targets: nothing to do
healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)'
event: target status '(127.0.0.1:2114)' from 'false' to 'true'
healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)'
healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)'
event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true'
checking unhealthy targets: nothing to do


Expand Down
118 changes: 115 additions & 3 deletions t/11-clear.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use Cwd qw(cwd);

workers(1);

plan tests => repeat_each() * 23;
plan tests => repeat_each() * 27;

my $pwd = cwd();

Expand Down Expand Up @@ -164,7 +164,119 @@ GET /t
true

--- error_log
unhealthy HTTP increment (1/3) for '(127.0.0.1:21120)'
unhealthy HTTP increment (2/3) for '(127.0.0.1:21120)'
unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:21120)'
unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:21120)'
--- no_error_log
unhealthy HTTP increment (3/3) for '(127.0.0.1:21120)'


=== TEST 4: delayed_clear() clears the list, after interval new checkers don't see it
--- http_config eval: $::HttpConfig
--- config
location = /t {
content_by_lua_block {
local we = require "resty.worker.events"
assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
local healthcheck = require("resty.healthcheck")
local config = {
name = "testing",
shm_name = "test_shm",
checks = {
active = {
healthy = {
interval = 0.1
},
unhealthy = {
interval = 0.1
}
}
}
}
local checker1 = healthcheck.new(config)
for i = 1, 10 do
checker1:add_target("127.0.0.1", 10000 + i, nil, false)
end
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker1:get_target_status("127.0.0.1", 10001))
checker1:delayed_clear(0.2)

local checker2 = healthcheck.new(config)
ngx.say(checker2:get_target_status("127.0.0.1", 10001))
ngx.sleep(0.4) -- wait while the targets are cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
}
}
--- request
GET /t
--- response_body
false
false
target not found

=== TEST 5: delayed_clear() would clear tgt list, but adding again keeps the previous status
--- http_config eval: $::HttpConfig
--- config
location = /t {
content_by_lua_block {
local we = require "resty.worker.events"
assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
local healthcheck = require("resty.healthcheck")
local config = {
name = "testing",
shm_name = "test_shm",
checks = {
active = {
healthy = {
interval = 0.1
},
unhealthy = {
interval = 0.1
}
}
}
}
local checker1 = healthcheck.new(config)
checker1:add_target("127.0.0.1", 10001, nil, false)
checker1:add_target("127.0.0.1", 10002, nil, false)
checker1:add_target("127.0.0.1", 10003, nil, false)
ngx.sleep(0.2) -- wait twice the interval
ngx.say(checker1:get_target_status("127.0.0.1", 10002))
checker1:delayed_clear(0.2)

local checker2 = healthcheck.new(config)
checker2:add_target("127.0.0.1", 10002, nil, true)
ngx.say(checker2:get_target_status("127.0.0.1", 10002))
ngx.sleep(0.4) -- wait while the targets would be cleared
local status, err = checker2:get_target_status("127.0.0.1", 10001)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
status, err = checker2:get_target_status("127.0.0.1", 10002)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
status, err = checker2:get_target_status("127.0.0.1", 10003)
if status ~= nil then
ngx.say(status)
else
ngx.say(err)
end
}
}
--- request
GET /t
--- response_body
false
false
target not found
false
target not found