diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 89d1284a..c1298a24 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -31,6 +31,7 @@ local ngx_log = ngx.log local tostring = tostring local ipairs = ipairs local cjson = require("cjson.safe").new() +local table_insert = table.insert local table_remove = table.remove local worker_events = require("resty.worker.events") local resty_lock = require ("resty.lock") @@ -281,6 +282,7 @@ end function checker:add_target(ip, port, hostname, is_healthy, hostheader) ip = tostring(assert(ip, "no ip address provided")) port = assert(tonumber(port), "no port number provided") + hostname = hostname or ip if is_healthy == nil then is_healthy = true end @@ -288,13 +290,21 @@ function checker:add_target(ip, port, hostname, is_healthy, hostheader) local internal_health = is_healthy and "healthy" or "unhealthy" local ok, err = locking_target_list(self, function(target_list) + local found = false -- check whether we already have this target for _, target in ipairs(target_list) do - if target.ip == ip and target.port == port and target.hostname == hostname then - self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip, - ":", port, " (ignoring)") - return false + if target.ip == ip and target.port == port and target.hostname == (hostname) then + if target.purge_time == nil then + self:log(DEBUG, "adding an existing target: ", hostname or "", " ", ip, + ":", port, " (ignoring)") + return + end + target.purge_time = nil + found = true + internal_health = self:get_target_status(ip, port, hostname) and + "healthy" or "unhealthy" + break end end @@ -308,12 +318,14 @@ function checker:add_target(ip, port, hostname, is_healthy, hostheader) end -- target does not exist, go add it - target_list[#target_list + 1] = { - ip = ip, - port = port, - hostname = hostname, - hostheader = hostheader, - } + if not found then + target_list[#target_list + 1] = { + ip = ip, + port = port, + hostname = hostname, + hostheader = hostheader, + } + end target_list = serialize(target_list) ok, err = self.shm:set(self.TARGET_LIST, target_list) @@ -433,6 +445,28 @@ function checker:clear() end +function checker:delayed_clear(delay) + assert(tonumber(delay), "no delay provided") + + return locking_target_list(self, function(target_list) + local purge_time = ngx_now() + delay + + -- add purge time to all targets + for _, target in ipairs(target_list) do + target.purge_time = purge_time + end + + target_list = serialize(target_list) + local ok, err = self.shm:set(self.TARGET_LIST, target_list) + if not ok then + return nil, "failed to store target_list in shm: " .. err + end + + return true + end) +end + + --- Get the current status of the target. -- @param ip IP address of the target being checked. -- @param port the port being checked against. @@ -1071,7 +1105,7 @@ function checker:event_handler(event_name, ip, port, hostname) end end self:log(DEBUG, "event: target '", hostname or "", " (", ip, ":", port, - "' removed") + ")' removed") else self:log(WARN, "event: trying to remove an unknown target '", @@ -1085,10 +1119,10 @@ function checker:event_handler(event_name, ip, port, hostname) then if not target_found then -- it is a new target, must add it first - target_found = { ip = ip, port = port, hostname = hostname } + target_found = { ip = ip, port = port, hostname = hostname or ip } self.targets[target_found.ip] = self.targets[target_found.ip] or {} self.targets[target_found.ip][target_found.port] = self.targets[target_found.ip][target_found.port] or {} - self.targets[target_found.ip][target_found.port][target_found.hostname or ip] = target_found + self.targets[target_found.ip][target_found.port][target_found.hostname] = target_found self.targets[#self.targets + 1] = target_found self:log(DEBUG, "event: target added '", hostname or "", "(", ip, ":", port, ")'") end @@ -1451,9 +1485,8 @@ function _M.new(opts) return nil, err end - -- if active checker is needed and not running, start it - if (self.checks.active.healthy.active or self.checks.active.unhealthy.active) - and active_check_timer == nil then + -- if active checker is not running, start it + if active_check_timer == nil then self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ", "starting active check timer") @@ -1475,6 +1508,35 @@ function _M.new(opts) local cur_time = ngx_now() for _, checker_obj in ipairs(hcs) do + -- clear targets marked for delayed removal + locking_target_list(checker_obj, function(target_list) + local removed_targets = {} + local index = 1 + while index <= #target_list do + local target = target_list[index] + if target.purge_time and target.purge_time <= cur_time then + table_insert(removed_targets, target) + table_remove(target_list, index) + else + index = index + 1 + end + end + + if #removed_targets > 0 then + target_list = serialize(target_list) + + local ok, err = shm:set(checker_obj.TARGET_LIST, target_list) + if not ok then + return nil, "failed to store target_list in shm: " .. err + end + + for _, target in ipairs(removed_targets) do + clear_target_data_from_shm(checker_obj, target.ip, target.port, target.hostname) + checker_obj:raise_event(checker_obj.events.remove, target.ip, target.port, target.hostname) + end + end + end) + if checker_obj.checks.active.healthy.active and (checker_obj.checks.active.healthy.last_run + checker_obj.checks.active.healthy.interval <= cur_time) diff --git a/t/09-active_probes.t b/t/09-active_probes.t index 86c523cd..dd68faf4 100644 --- a/t/09-active_probes.t +++ b/t/09-active_probes.t @@ -67,10 +67,10 @@ GET /t false --- error_log checking unhealthy targets: nothing to do -unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'true' to 'false' +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' checking healthy targets: nothing to do @@ -123,10 +123,10 @@ GET /t true --- error_log checking healthy targets: nothing to do -healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)' -healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)' -healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'false' to 'true' +healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true' checking unhealthy targets: nothing to do === TEST 3: active probes, custom http status (regression test for pre-filled defaults) @@ -179,10 +179,10 @@ true checking unhealthy targets: nothing to do --- no_error_log checking healthy targets: nothing to do -unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'true' to 'false' +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' === TEST 4: active probes, custom http status, node failing @@ -234,10 +234,10 @@ GET /t false --- error_log checking unhealthy targets: nothing to do -unhealthy HTTP increment (1/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (2/3) for '(127.0.0.1:2114)' -unhealthy HTTP increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'true' to 'false' +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' checking healthy targets: nothing to do @@ -340,10 +340,10 @@ GET /t false --- error_log checking unhealthy targets: nothing to do -unhealthy TCP increment (1/3) for '(127.0.0.1:2114)' -unhealthy TCP increment (2/3) for '(127.0.0.1:2114)' -unhealthy TCP increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'true' to 'false' +unhealthy TCP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy TCP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy TCP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' checking healthy targets: nothing to do @@ -396,10 +396,10 @@ GET /t true --- error_log checking healthy targets: nothing to do -healthy SUCCESS increment (1/3) for '(127.0.0.1:2114)' -healthy SUCCESS increment (2/3) for '(127.0.0.1:2114)' -healthy SUCCESS increment (3/3) for '(127.0.0.1:2114)' -event: target status '(127.0.0.1:2114)' from 'false' to 'true' +healthy SUCCESS increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +healthy SUCCESS increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +healthy SUCCESS increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'false' to 'true' checking unhealthy targets: nothing to do diff --git a/t/11-clear.t b/t/11-clear.t index d3a9ebd7..b25822c3 100644 --- a/t/11-clear.t +++ b/t/11-clear.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 23; +plan tests => repeat_each() * 27; my $pwd = cwd(); @@ -164,7 +164,119 @@ GET /t true --- error_log -unhealthy HTTP increment (1/3) for '(127.0.0.1:21120)' -unhealthy HTTP increment (2/3) for '(127.0.0.1:21120)' +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:21120)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:21120)' --- no_error_log unhealthy HTTP increment (3/3) for '(127.0.0.1:21120)' + + +=== TEST 4: delayed_clear() clears the list, after interval new checkers don't see it +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local config = { + name = "testing", + shm_name = "test_shm", + checks = { + active = { + healthy = { + interval = 0.1 + }, + unhealthy = { + interval = 0.1 + } + } + } + } + local checker1 = healthcheck.new(config) + for i = 1, 10 do + checker1:add_target("127.0.0.1", 10000 + i, nil, false) + end + ngx.sleep(0.2) -- wait twice the interval + ngx.say(checker1:get_target_status("127.0.0.1", 10001)) + checker1:delayed_clear(0.2) + + local checker2 = healthcheck.new(config) + ngx.say(checker2:get_target_status("127.0.0.1", 10001)) + ngx.sleep(0.4) -- wait while the targets are cleared + local status, err = checker2:get_target_status("127.0.0.1", 10001) + if status ~= nil then + ngx.say(status) + else + ngx.say(err) + end + } + } +--- request +GET /t +--- response_body +false +false +target not found + +=== TEST 5: delayed_clear() would clear tgt list, but adding again keeps the previous status +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local config = { + name = "testing", + shm_name = "test_shm", + checks = { + active = { + healthy = { + interval = 0.1 + }, + unhealthy = { + interval = 0.1 + } + } + } + } + local checker1 = healthcheck.new(config) + checker1:add_target("127.0.0.1", 10001, nil, false) + checker1:add_target("127.0.0.1", 10002, nil, false) + checker1:add_target("127.0.0.1", 10003, nil, false) + ngx.sleep(0.2) -- wait twice the interval + ngx.say(checker1:get_target_status("127.0.0.1", 10002)) + checker1:delayed_clear(0.2) + + local checker2 = healthcheck.new(config) + checker2:add_target("127.0.0.1", 10002, nil, true) + ngx.say(checker2:get_target_status("127.0.0.1", 10002)) + ngx.sleep(0.4) -- wait while the targets would be cleared + local status, err = checker2:get_target_status("127.0.0.1", 10001) + if status ~= nil then + ngx.say(status) + else + ngx.say(err) + end + status, err = checker2:get_target_status("127.0.0.1", 10002) + if status ~= nil then + ngx.say(status) + else + ngx.say(err) + end + status, err = checker2:get_target_status("127.0.0.1", 10003) + if status ~= nil then + ngx.say(status) + else + ngx.say(err) + end + } + } +--- request +GET /t +--- response_body +false +false +target not found +false +target not found