Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cmd) clean dangling unix sockets on startup #9254

Merged
merged 4 commits into from
Aug 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,9 @@
[#8497](https://github.com/Kong/kong/pull/8497) [9265](https://github.com/Kong/kong/pull/9265)
- Improved error handling and debugging info in the DNS code
[#8902](https://github.com/Kong/kong/pull/8902)
- Kong will now attempt to recover from an unclean shutdown by detecting and
removing dangling unix sockets in the prefix directory
[#9254](https://github.com/Kong/kong/pull/9254)

#### Admin API

Expand Down Expand Up @@ -500,6 +503,8 @@
[#9255](https://github.com/Kong/kong/pull/9255)
- Fixed an issue where cache entries of some entities were not being properly invalidated after a cascade delete
[#9261](https://github.com/Kong/kong/pull/9261)
- Running `kong start` when Kong is already running will no longer clobber
the existing `.kong_env` file [#9254](https://github.com/Kong/kong/pull/9254)


#### Admin API
Expand Down
41 changes: 39 additions & 2 deletions kong/cmd/start.lua
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,43 @@ local kong_global = require "kong.global"
local kill = require "kong.cmd.utils.kill"
local log = require "kong.cmd.utils.log"
local DB = require "kong.db"
local lfs = require "lfs"


local function is_socket(path)
return lfs.attributes(path, "mode") == "socket"
end

local function cleanup_dangling_unix_sockets(prefix)
local found = {}

for child in lfs.dir(prefix) do
local path = prefix .. "/" .. child
if is_socket(path) then
table.insert(found, path)
end
end

if #found < 1 then
return
end

log.warn("Found dangling unix sockets in the prefix directory (%q) while " ..
"preparing to start Kong. This may be a sign that Kong was " ..
"previously shut down uncleanly or is in an unknown state and " ..
"could require further investigation.",
prefix)

log.warn("Attempting to remove dangling sockets before starting Kong...")

for _, sock in ipairs(found) do
if is_socket(sock) then
log.warn("removing unix socket: %s", sock)
assert(os.remove(sock))
end
end
end

local function execute(args)
args.db_timeout = args.db_timeout * 1000
args.lock_timeout = args.lock_timeout
Expand All @@ -21,11 +56,13 @@ local function execute(args)
conf.cassandra_timeout = args.db_timeout -- connect + send + read
conf.cassandra_schema_consensus_timeout = args.db_timeout

assert(prefix_handler.prepare_prefix(conf, args.nginx_conf))

assert(not kill.is_running(conf.nginx_pid),
"Kong is already running in " .. conf.prefix)

assert(prefix_handler.prepare_prefix(conf, args.nginx_conf))

cleanup_dangling_unix_sockets(conf.prefix)

_G.kong = kong_global.new()
kong_global.init_pdk(_G.kong, conf)

Expand Down
188 changes: 183 additions & 5 deletions spec/02-integration/02-cmd/02-start_stop_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@ for _, strategy in helpers.each_strategy() do

describe("kong start/stop #" .. strategy, function()
lazy_setup(function()
helpers.get_db_utils(nil, {
"routes",
"services",
}) -- runs migrations
helpers.get_db_utils(strategy) -- runs migrations
helpers.prepare_prefix()
end)
after_each(function()
Expand Down Expand Up @@ -504,7 +501,7 @@ describe("kong start/stop #" .. strategy, function()
assert.False(ok)
assert.matches("Kong is already running in " .. helpers.test_conf.prefix, stderr, nil, true)
end)
it("should not stop Kong if already running in prefix", function()
it("should not start Kong if already running in prefix", function()
local kill = require "kong.cmd.utils.kill"

assert(helpers.kong_exec("start --prefix " .. helpers.test_conf.prefix, {
Expand All @@ -520,6 +517,40 @@ describe("kong start/stop #" .. strategy, function()

assert(kill.is_running(helpers.test_conf.nginx_pid))
end)

it("does not prepare the prefix directory if Kong is already running", function()
local prefix = helpers.test_conf.prefix

assert(helpers.kong_exec("start --prefix " .. prefix, {
database = "off",
nginx_main_worker_processes = "1",
}))

finally(function()
helpers.stop_kong()
end)

local kong_env = prefix .. "/.kong_env"

local before, err = helpers.file.read(kong_env)
assert.truthy(before, "failed reading .kong_env: " .. tostring(err))
assert.matches("nginx_main_worker_processes = 1", before) -- sanity

local ok, stderr = helpers.kong_exec("start --prefix " .. prefix, {
database = "off",
nginx_main_worker_processes = "2",
})

assert.falsy(ok)
assert.matches("Kong is already running", stderr)

local after
after, err = helpers.file.read(kong_env)
assert.truthy(after, "failed reading .kong_env: " .. tostring(err))

assert.equal(before, after, ".kong_env file was rewritten")
end)

it("ensures the required shared dictionaries are defined", function()
local constants = require "kong.constants"
local pl_file = require "pl.file"
Expand Down Expand Up @@ -617,5 +648,152 @@ describe("kong start/stop #" .. strategy, function()
assert.matches("the 'worker_consistency' configuration property is deprecated", stderr, nil, true)
end)
end)

describe("dangling socket cleanup", function()
local prefix = helpers.test_conf.prefix
local pidfile = helpers.test_conf.nginx_pid

-- the worker events socket is just one of many unix sockets we use
local event_sock = prefix .. "/worker_events.sock"

local env = {
prefix = prefix,
database = strategy,
admin_listen = "127.0.0.1:9001",
proxy_listen = "127.0.0.1:8000",
stream_listen = "127.0.0.1:9022",
nginx_main_worker_processes = 2, -- keeping this low for the sake of speed
}

local function start()
local cmd = string.format("start -p %q", prefix)
return helpers.kong_exec(cmd, env, true)
end


local function sigkill(pid)
if type(pid) == "table" then
pid = table.concat(pid, " ")
end

helpers.execute("kill -9 " .. pid)

helpers.wait_until(function()
-- kill returns:
--
-- * 0 on success
-- * 1 on failure
-- * 64 on partial failure/success
--
-- we might be passing in multiple pids, so we need to explicitly
-- check the exit code is 1, otherwise one or more processes might
-- still be alive
local _, code = helpers.execute("kill -0 " .. pid, true)
return code == 1
end)
end

local function get_worker_pids()
local admin = assert(helpers.admin_client())
local res = admin:get("/")

assert.res_status(200, res)

local json = assert.response(res).has.jsonbody()
admin:close()

return json.pids.workers
end

local function kill_all()
local workers = get_worker_pids()

local master = assert(helpers.file.read(pidfile))
master = master:gsub("%s+", "")
sigkill(master)
sigkill(workers)
end


before_each(function()
helpers.clean_prefix(prefix)

assert(start())

-- sanity
helpers.wait_until(function()
return helpers.kong_exec("health", env)
end, 5)

-- sanity
helpers.wait_until(function()
return helpers.path.exists(event_sock)
end, 5)

kill_all()

assert(helpers.path.exists(event_sock),
"events socket (" .. event_sock .. ") unexpectedly removed")
end)

it("removes unix socket files in the prefix directory", function()
local ok, code, stdout, stderr = start()
assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr))
assert.equals(0, code)

finally(function()
helpers.stop_kong(prefix)
end)

assert.matches("Kong started", stdout)

assert.matches("[warn] Found dangling unix sockets in the prefix directory", stderr, nil, true)
assert.matches(prefix, stderr, nil, true)

assert.matches("removing unix socket", stderr)
assert.matches(event_sock, stderr, nil, true)
end)

it("does not log anything if Kong was stopped cleanly and no sockets are found", function()
local ok, code, stdout, stderr = start()
assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr))
assert.equals(0, code)
assert.matches("Kong started", stdout)

assert(helpers.stop_kong(prefix, true))

ok, code, stdout, stderr = start()
finally(function()
helpers.stop_kong(prefix)
end)

assert.truthy(ok, "expected `kong start` to succeed: " .. tostring(code or stderr))
assert.equals(0, code)
assert.matches("Kong started", stdout)
assert.not_matches("prefix directory .*not found", stdout)

assert.not_matches("[warn] Found dangling unix sockets in the prefix directory", stderr, nil, true)
assert.not_matches("unix socket", stderr)
end)

it("does not do anything if kong is already running", function()
local ok, code, stdout, stderr = start()
assert.truthy(ok, "initial startup of kong failed: " .. stderr)
assert.equals(0, code)

finally(function()
helpers.stop_kong(prefix)
end)

assert.matches("Kong started", stdout)

ok, code, _, stderr = start()
assert.falsy(ok, "expected `kong start` to fail with kong already running")
assert.equals(1, code)
assert.not_matches("unix socket", stderr)
assert(helpers.path.exists(event_sock))
end)
end)

end)
end