From 2dc6ddbff0467601a5d8b89d4790dcedd3265c45 Mon Sep 17 00:00:00 2001 From: Keery Nie Date: Thu, 16 May 2024 11:51:48 +0800 Subject: [PATCH] fix(*): avoid creating multiple timers to run the same active check (#157) * fix(*): avoid creating multiple timers to run the same active check (#156) * docs(changelog): add changelog --- README.md | 4 + lib/resty/healthcheck.lua | 48 +++++++++++ t/with_resty-events/14-tls_active_probes.t | 15 ++-- t/with_resty-events/19-timer.t | 93 +++++++++++++++++++++ t/with_worker-events/14-tls_active_probes.t | 15 ++-- t/with_worker-events/19-timer.t | 72 ++++++++++++++++ 6 files changed, 235 insertions(+), 12 deletions(-) create mode 100644 t/with_resty-events/19-timer.t create mode 100644 t/with_worker-events/19-timer.t diff --git a/README.md b/README.md index e200374f..eba5e835 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,10 @@ Versioning is strictly based on [Semantic Versioning](https://semver.org/) * push commit and tag * upload rock to luarocks: `luarocks upload rockspecs/[name] --api-key=abc` +### Unreleased + +* Fix: avoid creating multiple timers to run the same active check [#157](https://github.com/Kong/lua-resty-healthcheck/pull/157) + ### 3.0.1 (22-Dec-2023) * Fix: fix delay clean logic when multiple healthchecker was started [#146](https://github.com/Kong/lua-resty-healthcheck/pull/146) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 675ca24c..67bad17b 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -34,6 +34,7 @@ local table_insert = table.insert local table_remove = table.remove local table_concat = table.concat local string_format = string.format +local math_max = math.max local ssl = require("ngx.ssl") local resty_timer = require "resty.timer" local bit = require("bit") @@ -1201,6 +1202,32 @@ local function renew_periodic_lock(shm, key) end +local function get_callback_lock(shm, key, ttl) + local value = shm:get(key) + if value == nil then + -- no worker is checking, try to acquire the lock + local ok, err = shm:add(key, true, ttl or LOCK_PERIOD) + if not ok then + if err == "exists" then + -- another worker got the lock before + return false + end + + return nil, err + end + + return true + end + + return false +end + + +local function remove_callback_lock(shm, key) + return shm:delete(key) +end + + --- Active health check callback function. -- @param self the checker object this timer runs on -- @param health_mode either "healthy" or "unhealthy" to indicate what check @@ -1209,10 +1236,27 @@ local function checker_callback(self, health_mode) self.checker_callback_count = self.checker_callback_count + 1 end + -- Set a callback pending lock will exist for 2x the time of the active check. + -- An active check should be finished within this time and next timer will be + -- executed to exit a pending status. + local callback_pending_ttl = (math_max(self.checks.active.healthy.active and + self.checks.active.healthy.interval or 0, + self.checks.active.unhealthy.active and + self.checks.active.unhealthy.interval or 0) + + self.checks.active.timeout) * 2 + + local callback_lock = self.CALLBACK_LOCK .. health_mode + -- a pending timer already exists, so skip this time + local ok, _ = get_callback_lock(self.shm, callback_lock, callback_pending_ttl) + if not ok then + return + end + local list_to_check = {} local targets, err = fetch_target_list(self) if not targets then self:log(ERR, "checker_callback: ", err) + remove_callback_lock(self.shm, callback_lock) return end @@ -1236,6 +1280,7 @@ local function checker_callback(self, health_mode) if not list_to_check[1] then self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") + remove_callback_lock(self.shm, callback_lock) else local timer = resty_timer({ interval = 0, @@ -1245,10 +1290,12 @@ local function checker_callback(self, health_mode) expire = function() self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) self:active_check_targets(list_to_check) + remove_callback_lock(self.shm, callback_lock) end, }) if timer == nil then self:log(ERR, "failed to create timer to check ", health_mode) + remove_callback_lock(self.shm, callback_lock) end end end @@ -1618,6 +1665,7 @@ function _M.new(opts) self.TARGET_LIST_LOCK = SHM_PREFIX .. self.name .. ":target_list_lock" self.TARGET_LOCK = SHM_PREFIX .. self.name .. ":target_lock" self.PERIODIC_LOCK = SHM_PREFIX .. ":period_lock:" + self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock:" -- prepare constants self.EVENT_SOURCE = EVENT_SOURCE_PREFIX .. " [" .. self.name .. "]" self.LOG_PREFIX = LOG_PREFIX .. "(" .. self.name .. ") " diff --git a/t/with_resty-events/14-tls_active_probes.t b/t/with_resty-events/14-tls_active_probes.t index d9e44902..f4c175e0 100644 --- a/t/with_resty-events/14-tls_active_probes.t +++ b/t/with_resty-events/14-tls_active_probes.t @@ -46,6 +46,7 @@ __DATA__ events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -60,7 +61,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true } } @@ -69,7 +70,7 @@ GET /t --- response_body true --- timeout -15 +20 === TEST 2: active probes, invalid cert --- http_config eval: $::HttpConfig @@ -87,6 +88,7 @@ true events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -101,7 +103,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false } } @@ -110,7 +112,7 @@ GET /t --- response_body false --- timeout -15 +20 === TEST 3: active probes, accept invalid cert when disabling check --- http_config eval: $::HttpConfig @@ -128,6 +130,7 @@ false events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", https_verify_certificate = false, http_path = "/", @@ -143,7 +146,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true } } @@ -152,4 +155,4 @@ GET /t --- response_body true --- timeout -15 +20 diff --git a/t/with_resty-events/19-timer.t b/t/with_resty-events/19-timer.t new file mode 100644 index 00000000..fa2d510a --- /dev/null +++ b/t/with_resty-events/19-timer.t @@ -0,0 +1,93 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * blocks() * 2; + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + + + + +=== TEST 1: active probes, http node failing +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2130; + location = /status { + content_by_lua_block { + ngx.sleep(2) + ngx.exit(500); + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + timeout = 1, + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) + ngx.sleep(3) -- wait for some time to let the checks run + -- There should be no more than 3 timers running atm, but + -- add a few spaces for worker events + ngx.say(tonumber(ngx.timer.running_count()) <= 5) + } + } +--- request +GET /t +--- response_body +true diff --git a/t/with_worker-events/14-tls_active_probes.t b/t/with_worker-events/14-tls_active_probes.t index 51854928..a9d854b7 100644 --- a/t/with_worker-events/14-tls_active_probes.t +++ b/t/with_worker-events/14-tls_active_probes.t @@ -34,6 +34,7 @@ __DATA__ shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -48,7 +49,7 @@ __DATA__ } }) local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true } } @@ -57,7 +58,7 @@ GET /t --- response_body true --- timeout -15 +20 === TEST 2: active probes, invalid cert --- http_config eval: $::HttpConfig @@ -74,6 +75,7 @@ true shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -88,7 +90,7 @@ true } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false } } @@ -97,7 +99,7 @@ GET /t --- response_body false --- timeout -15 +20 === TEST 3: active probes, accept invalid cert when disabling check --- http_config eval: $::HttpConfig @@ -114,6 +116,7 @@ false shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", https_verify_certificate = false, http_path = "/", @@ -129,7 +132,7 @@ false } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true } } @@ -138,4 +141,4 @@ GET /t --- response_body true --- timeout -15 +20 diff --git a/t/with_worker-events/19-timer.t b/t/with_worker-events/19-timer.t new file mode 100644 index 00000000..b871b47f --- /dev/null +++ b/t/with_worker-events/19-timer.t @@ -0,0 +1,72 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * blocks() * 2; + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; +}; + +run_tests(); + +__DATA__ + + + +=== TEST 1: active probes, http node failing +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2130; + location = /status { + content_by_lua_block { + ngx.sleep(2) + ngx.exit(500); + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + type = "http", + checks = { + active = { + timeout = 1, + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) + ngx.sleep(3) -- wait for some time to let the checks run + -- There should be no more than 3 timers running atm, but + -- add a few spaces for worker events + ngx.say(tonumber(ngx.timer.running_count()) <= 5) + } + } +--- request +GET /t +--- response_body +true