From 50a4b4dd3208d8d9ad50e0c9fbb481efbad456b2 Mon Sep 17 00:00:00 2001 From: Michael Martin <3277009+flrgh@users.noreply.github.com> Date: Fri, 12 Aug 2022 17:18:56 -0700 Subject: [PATCH 1/4] feat(cmd) clean up dangling unix sockets at startup --- kong/cmd/start.lua | 38 +++++ .../02-cmd/02-start_stop_spec.lua | 152 +++++++++++++++++- 2 files changed, 186 insertions(+), 4 deletions(-) diff --git a/kong/cmd/start.lua b/kong/cmd/start.lua index c6ffedb318ba..3bfa90e70a30 100644 --- a/kong/cmd/start.lua +++ b/kong/cmd/start.lua @@ -6,8 +6,43 @@ local kong_global = require "kong.global" local kill = require "kong.cmd.utils.kill" local log = require "kong.cmd.utils.log" local DB = require "kong.db" +local lfs = require "lfs" +local function is_socket(path) + return lfs.attributes(path, "mode") == "socket" +end + +local function cleanup_dangling_unix_sockets(prefix) + local found = {} + + for child in lfs.dir(prefix) do + local path = prefix .. "/" .. child + if is_socket(path) then + table.insert(found, path) + end + end + + if #found < 1 then + return + end + + log.warn("Found dangling unix sockets in the prefix directory (%q) while " .. + "preparing to start Kong. This may be a sign that Kong was " .. + "previously shut down uncleanly or is in an unknown state and " .. + "could require further investigation.", + prefix) + + log.warn("Attempting to remove dangling sockets before starting Kong...") + + for _, sock in ipairs(found) do + if is_socket(sock) then + log.warn("removing unix socket: %s", sock) + assert(os.remove(sock)) + end + end +end + local function execute(args) args.db_timeout = args.db_timeout * 1000 args.lock_timeout = args.lock_timeout @@ -26,6 +61,9 @@ local function execute(args) assert(not kill.is_running(conf.nginx_pid), "Kong is already running in " .. conf.prefix) + + cleanup_dangling_unix_sockets(conf.prefix) + _G.kong = kong_global.new() kong_global.init_pdk(_G.kong, conf) diff --git a/spec/02-integration/02-cmd/02-start_stop_spec.lua b/spec/02-integration/02-cmd/02-start_stop_spec.lua index bd111b0cfae1..ceb4fd1a63af 100644 --- a/spec/02-integration/02-cmd/02-start_stop_spec.lua +++ b/spec/02-integration/02-cmd/02-start_stop_spec.lua @@ -5,10 +5,7 @@ for _, strategy in helpers.each_strategy() do describe("kong start/stop #" .. strategy, function() lazy_setup(function() - helpers.get_db_utils(nil, { - "routes", - "services", - }) -- runs migrations + helpers.get_db_utils(strategy) -- runs migrations helpers.prepare_prefix() end) after_each(function() @@ -617,5 +614,152 @@ describe("kong start/stop #" .. strategy, function() assert.matches("the 'worker_consistency' configuration property is deprecated", stderr, nil, true) end) end) + + describe("dangling socket cleanup", function() + local prefix = helpers.test_conf.prefix + local pidfile = helpers.test_conf.nginx_pid + + -- the worker events socket is just one of many unix sockets we use + local event_sock = prefix .. "/worker_events.sock" + + local env = { + prefix = prefix, + database = strategy, + admin_listen = "127.0.0.1:9001", + proxy_listen = "127.0.0.1:8000", + stream_listen = "127.0.0.1:9022", + nginx_main_worker_processes = 2, -- keeping this low for the sake of speed + } + + local function start() + local cmd = string.format("start -p %q", prefix) + return helpers.kong_exec(cmd, env, true) + end + + + local function sigkill(pid) + if type(pid) == "table" then + pid = table.concat(pid, " ") + end + + helpers.execute("kill -9 " .. pid) + + helpers.wait_until(function() + -- kill returns: + -- + -- * 0 on success + -- * 1 on failure + -- * 64 on partial failure/success + -- + -- we might be passing in multiple pids, so we need to explicitly + -- check the exit code is 1, otherwise one or more processes might + -- still be alive + local _, code = helpers.execute("kill -0 " .. pid, true) + return code == 1 + end) + end + + local function get_worker_pids() + local admin = assert(helpers.admin_client()) + local res = admin:get("/") + + assert.res_status(200, res) + + local json = assert.response(res).has.jsonbody() + admin:close() + + return json.pids.workers + end + + local function kill_all() + local workers = get_worker_pids() + + local master = assert(helpers.file.read(pidfile)) + master = master:gsub("%s+", "") + sigkill(master) + sigkill(workers) + end + + + before_each(function() + helpers.clean_prefix(prefix) + + assert(start()) + + -- sanity + helpers.wait_until(function() + return helpers.kong_exec("health", env) + end, 5) + + -- sanity + helpers.wait_until(function() + return helpers.path.exists(event_sock) + end, 5) + + kill_all() + + assert(helpers.path.exists(event_sock), + "events socket (" .. event_sock .. ") unexpectedly removed") + end) + + it("removes unix socket files in the prefix directory", function() + local ok, code, stdout, stderr = start() + assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr)) + assert.equals(0, code) + + finally(function() + helpers.stop_kong(prefix) + end) + + assert.matches("Kong started", stdout) + + assert.matches("[warn] Found dangling unix sockets in the prefix directory", stderr, nil, true) + assert.matches(prefix, stderr, nil, true) + + assert.matches("removing unix socket", stderr) + assert.matches(event_sock, stderr, nil, true) + end) + + it("does not log anything if Kong was stopped cleanly and no sockets are found", function() + local ok, code, stdout, stderr = start() + assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr)) + assert.equals(0, code) + assert.matches("Kong started", stdout) + + assert(helpers.stop_kong(prefix, true)) + + ok, code, stdout, stderr = start() + finally(function() + helpers.stop_kong(prefix) + end) + + assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr)) + assert.equals(0, code) + assert.matches("Kong started", stdout) + assert.not_matches("prefix directory .*not found", stdout) + + assert.not_matches("[warn]", stderr, nil, true) + assert.not_matches("unix socket", stderr) + end) + + it("does not do anything if kong is already running", function() + local ok, code, stdout, stderr = start() + assert.truthy(ok, "initial startup of kong failed: " .. stderr) + assert.equals(0, code) + + finally(function() + helpers.stop_kong(prefix) + end) + + assert.matches("Kong started", stdout) + + ok, code, _, stderr = start() + assert.falsy(ok, "expected `kong start` to fail with kong already running") + assert.equals(1, code) + assert.not_matches("unix socket", stderr) + assert(helpers.path.exists(event_sock)) + end) + end) + end) end From 5da2a0965c98add5b036391d62a27355c7613751 Mon Sep 17 00:00:00 2001 From: Michael Martin <3277009+flrgh@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:37:09 -0700 Subject: [PATCH 2/4] fix(cmd) check if Kong is running before preparing the prefix If Kong is already running, `kong start` should not "prepare" or otherwise alter the prefix directory. --- kong/cmd/start.lua | 3 +- .../02-cmd/02-start_stop_spec.lua | 36 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/kong/cmd/start.lua b/kong/cmd/start.lua index 3bfa90e70a30..46ae2e64a45e 100644 --- a/kong/cmd/start.lua +++ b/kong/cmd/start.lua @@ -56,11 +56,10 @@ local function execute(args) conf.cassandra_timeout = args.db_timeout -- connect + send + read conf.cassandra_schema_consensus_timeout = args.db_timeout - assert(prefix_handler.prepare_prefix(conf, args.nginx_conf)) - assert(not kill.is_running(conf.nginx_pid), "Kong is already running in " .. conf.prefix) + assert(prefix_handler.prepare_prefix(conf, args.nginx_conf)) cleanup_dangling_unix_sockets(conf.prefix) diff --git a/spec/02-integration/02-cmd/02-start_stop_spec.lua b/spec/02-integration/02-cmd/02-start_stop_spec.lua index ceb4fd1a63af..429083aa935d 100644 --- a/spec/02-integration/02-cmd/02-start_stop_spec.lua +++ b/spec/02-integration/02-cmd/02-start_stop_spec.lua @@ -501,7 +501,7 @@ describe("kong start/stop #" .. strategy, function() assert.False(ok) assert.matches("Kong is already running in " .. helpers.test_conf.prefix, stderr, nil, true) end) - it("should not stop Kong if already running in prefix", function() + it("should not start Kong if already running in prefix", function() local kill = require "kong.cmd.utils.kill" assert(helpers.kong_exec("start --prefix " .. helpers.test_conf.prefix, { @@ -517,6 +517,40 @@ describe("kong start/stop #" .. strategy, function() assert(kill.is_running(helpers.test_conf.nginx_pid)) end) + + it("does not prepare the prefix directory if Kong is already running", function() + local prefix = helpers.test_conf.prefix + + assert(helpers.kong_exec("start --prefix " .. prefix, { + database = "off", + nginx_main_worker_processes = "1", + })) + + finally(function() + helpers.stop_kong() + end) + + local kong_env = prefix .. "/.kong_env" + + local before, err = helpers.file.read(kong_env) + assert.truthy(before, "failed reading .kong_env: " .. tostring(err)) + assert.matches("nginx_main_worker_processes = 1", before) -- sanity + + local ok, stderr = helpers.kong_exec("start --prefix " .. prefix, { + database = "off", + nginx_main_worker_processes = "2", + }) + + assert.falsy(ok) + assert.matches("Kong is already running", stderr) + + local after + after, err = helpers.file.read(kong_env) + assert.truthy(after, "failed reading .kong_env: " .. tostring(err)) + + assert.equal(before, after, ".kong_env file was rewritten") + end) + it("ensures the required shared dictionaries are defined", function() local constants = require "kong.constants" local pl_file = require "pl.file" From 0956345f068877d715fccce5db4f69ca7340439f Mon Sep 17 00:00:00 2001 From: Michael Martin <3277009+flrgh@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:49:49 -0700 Subject: [PATCH 3/4] docs(changelog) add entries for #9254 --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8ab3f5827fc..ce226e8ba7fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -367,6 +367,9 @@ [#8497](https://github.com/Kong/kong/pull/8497) [9265](https://github.com/Kong/kong/pull/9265) - Improved error handling and debugging info in the DNS code [#8902](https://github.com/Kong/kong/pull/8902) +- Kong will now attempt to recover from an unclean shutdown by detecting and + removing dangling unix sockets in the prefix directory + [#9254](https://github.com/Kong/kong/pull/9254) #### Admin API @@ -500,6 +503,8 @@ [#9255](https://github.com/Kong/kong/pull/9255) - Fixed an issue where cache entries of some entities were not being properly invalidated after a cascade delete [#9261](https://github.com/Kong/kong/pull/9261) +- Running `kong start` when Kong is already running will no longer clobber + the existing `.kong_env` file [#9254](https://github.com/Kong/kong/pull/9254) #### Admin API From f2f996876492ec6c3c9c5bdd28ff2612e278ec0d Mon Sep 17 00:00:00 2001 From: Michael Martin <3277009+flrgh@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:59:30 -0700 Subject: [PATCH 4/4] tests(cmd) fix test case --- spec/02-integration/02-cmd/02-start_stop_spec.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/02-integration/02-cmd/02-start_stop_spec.lua b/spec/02-integration/02-cmd/02-start_stop_spec.lua index 429083aa935d..9ed8278ed9a8 100644 --- a/spec/02-integration/02-cmd/02-start_stop_spec.lua +++ b/spec/02-integration/02-cmd/02-start_stop_spec.lua @@ -772,7 +772,7 @@ describe("kong start/stop #" .. strategy, function() assert.matches("Kong started", stdout) assert.not_matches("prefix directory .*not found", stdout) - assert.not_matches("[warn]", stderr, nil, true) + assert.not_matches("[warn] Found dangling unix sockets in the prefix directory", stderr, nil, true) assert.not_matches("unix socket", stderr) end)