Skip to content

Commit

Permalink
Revert "fix(dns): fix retry and timeout handling (#11386)"
Browse files Browse the repository at this point in the history
This reverts commit 7315817.
  • Loading branch information
AndyZhang0707 committed Jul 26, 2024
1 parent a6b24fe commit 76f4a85
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 140 deletions.

This file was deleted.

95 changes: 56 additions & 39 deletions kong/resty/dns/client.lua
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,9 @@ end
-- @param self the try_list to add to
-- @param status string with current status, added to the list for the current try
-- @return the try_list
local function add_status_to_try_list(self, status)
local try_list = self[#self].msg
try_list[#try_list + 1] = status
local function try_status(self, status)
local status_list = self[#self].msg
status_list[#status_list + 1] = status
return self
end

Expand All @@ -383,7 +383,8 @@ end
-- @section resolving


local resolve_max_wait
local poolMaxWait
local poolMaxRetry

--- Initialize the client. Can be called multiple times. When called again it
-- will clear the cache.
Expand Down Expand Up @@ -637,7 +638,8 @@ _M.init = function(options)

config = options -- store it in our module level global

resolve_max_wait = options.timeout / 1000 * options.retrans -- maximum time to wait for the dns resolver to hit its timeouts
poolMaxRetry = 1 -- do one retry, dns resolver is already doing 'retrans' number of retries on top
poolMaxWait = options.timeout / 1000 * options.retrans -- default is to wait for the dns resolver to hit its timeouts

return true
end
Expand Down Expand Up @@ -670,7 +672,7 @@ local function parseAnswer(qname, qtype, answers, try_list)

if (answer.type ~= qtype) or (answer.name ~= check_qname) then
local key = answer.type..":"..answer.name
add_status_to_try_list(try_list, key .. " removed")
try_status(try_list, key .. " removed")
local lst = others[key]
if not lst then
lst = {}
Expand Down Expand Up @@ -708,7 +710,7 @@ local function individualQuery(qname, r_opts, try_list)
return r, "failed to create a resolver: " .. err, try_list
end

add_status_to_try_list(try_list, "querying")
try_status(try_list, "querying")

local result
result, err = r:query(qname, r_opts)
Expand All @@ -735,7 +737,7 @@ local function executeQuery(premature, item)
--[[
log(DEBUG, PREFIX, "Query executing: ", item.qname, ":", item.r_opts.qtype, " ", fquery(item))
--]]
add_status_to_try_list(item.try_list, "querying")
try_status(item.try_list, "querying")
item.result, item.err = r:query(item.qname, item.r_opts)
if item.result then
--[[
Expand Down Expand Up @@ -777,7 +779,7 @@ local function asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query async (exists): ", key, " ", fquery(item))
--]]
add_status_to_try_list(try_list, "in progress (async)")
try_status(try_list, "in progress (async)")
return item -- already in progress, return existing query
end

Expand All @@ -799,32 +801,41 @@ local function asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query async (scheduled): ", key, " ", fquery(item))
--]]
add_status_to_try_list(try_list, "scheduled")
try_status(try_list, "scheduled")

return item
end


-- schedules a sync query.
-- This will be synchronized, so multiple calls (sync or async) might result in 1 query.
-- The maximum delay would be `options.timeout * options.retrans`.
-- The `poolMaxWait` is how long a thread waits for another to complete the query.
-- The `poolMaxRetry` is how often we wait for another query to complete.
-- The maximum delay would be `poolMaxWait * poolMaxRetry`.
-- @param qname the name to query for
-- @param r_opts a table with the query options
-- @param try_list the try_list object to add to
-- @return `result + nil + try_list`, or `nil + err + try_list` in case of errors
local function syncQuery(qname, r_opts, try_list)
local function syncQuery(qname, r_opts, try_list, count)
local key = qname..":"..r_opts.qtype
local item = queue[key]
count = count or 1

-- if nothing is in progress, we start a new async query
if not item then
local err
item, err = asyncQuery(qname, r_opts, try_list)
--[[
log(DEBUG, PREFIX, "Query sync (new): ", key, " ", fquery(item)," count=", count)
--]]
if not item then
return item, err, try_list
end
else
add_status_to_try_list(try_list, "in progress (sync)")
--[[
log(DEBUG, PREFIX, "Query sync (exists): ", key, " ", fquery(item)," count=", count)
--]]
try_status(try_list, "in progress (sync)")
end

local supported_semaphore_wait_phases = {
Expand All @@ -849,7 +860,7 @@ local function syncQuery(qname, r_opts, try_list)
end

-- block and wait for the async query to complete
local ok, err = item.semaphore:wait(resolve_max_wait)
local ok, err = item.semaphore:wait(poolMaxWait)
if ok and item.result then
-- we were released, and have a query result from the
-- other thread, so all is well, return it
Expand All @@ -860,16 +871,25 @@ local function syncQuery(qname, r_opts, try_list)
return item.result, item.err, try_list
end

err = err or item.err or "unknown"
add_status_to_try_list(try_list, "error: "..err)
-- there was an error, either a semaphore timeout, or a lookup error
-- go retry
try_status(try_list, "try "..count.." error: "..(item.err or err or "unknown"))
if count > poolMaxRetry then
--[[
log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retries exceeded. count=", count)
--]]
return nil, "dns lookup pool exceeded retries (" ..
tostring(poolMaxRetry) .. "): "..tostring(item.err or err),
try_list
end

-- don't block on the same thread again, so remove it from the queue
if queue[key] == item then
queue[key] = nil
end
if queue[key] == item then queue[key] = nil end

-- there was an error, either a semaphore timeout, or a lookup error
return nil, err
--[[
log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retrying. count=", count)
--]]
return syncQuery(qname, r_opts, try_list, count + 1)
end

-- will lookup a name in the cache, or alternatively query the nameservers.
Expand Down Expand Up @@ -908,7 +928,7 @@ local function lookup(qname, r_opts, dnsCacheOnly, try_list)
try_list = try_add(try_list, qname, r_opts.qtype, "cache-hit")
if entry.expired then
-- the cached record is stale but usable, so we do a refresh query in the background
add_status_to_try_list(try_list, "stale")
try_status(try_list, "stale")
asyncQuery(qname, r_opts, try_list)
end

Expand All @@ -926,7 +946,7 @@ local function check_ipv6(qname, qtype, try_list)

local record = cachelookup(qname, qtype)
if record then
add_status_to_try_list(try_list, "cached")
try_status(try_list, "cached")
return record, nil, try_list
end

Expand All @@ -946,7 +966,7 @@ local function check_ipv6(qname, qtype, try_list)
end
if qtype == _M.TYPE_AAAA and
check:match("^%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?$") then
add_status_to_try_list(try_list, "validated")
try_status(try_list, "validated")
record = {{
address = qname,
type = _M.TYPE_AAAA,
Expand All @@ -958,7 +978,7 @@ local function check_ipv6(qname, qtype, try_list)
else
-- not a valid IPv6 address, or a bad type (non ipv6)
-- return a "server error"
add_status_to_try_list(try_list, "bad IPv6")
try_status(try_list, "bad IPv6")
record = {
errcode = 3,
errstr = "name error",
Expand All @@ -979,12 +999,12 @@ local function check_ipv4(qname, qtype, try_list)

local record = cachelookup(qname, qtype)
if record then
add_status_to_try_list(try_list, "cached")
try_status(try_list, "cached")
return record, nil, try_list
end

if qtype == _M.TYPE_A then
add_status_to_try_list(try_list, "validated")
try_status(try_list, "validated")
record = {{
address = qname,
type = _M.TYPE_A,
Expand All @@ -996,7 +1016,7 @@ local function check_ipv4(qname, qtype, try_list)
else
-- bad query type for this ipv4 address
-- return a "server error"
add_status_to_try_list(try_list, "bad IPv4")
try_status(try_list, "bad IPv4")
record = {
errcode = 3,
errstr = "name error",
Expand Down Expand Up @@ -1140,7 +1160,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
records = nil
-- luacheck: pop
err = "recursion detected"
add_status_to_try_list(try_list, "recursion detected")
try_status(try_list, "recursion detected")
return nil, err, try_list
end
end
Expand All @@ -1152,14 +1172,14 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
-- luacheck: push no unused
records = nil
-- luacheck: pop
try_list = add_status_to_try_list(try_list, "stale")
try_list = try_status(try_list, "stale")

else
-- a valid non-stale record
-- check for CNAME records, and dereferencing the CNAME
if (records[1] or EMPTY).type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then
opts.qtype = nil
add_status_to_try_list(try_list, "dereferencing")
try_status(try_list, "dereferencing")
return resolve(records[1].cname, opts, dnsCacheOnly, try_list)
end

Expand Down Expand Up @@ -1207,10 +1227,8 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
end

if not records then -- luacheck: ignore
-- An error has occurred, terminate the lookup process. We don't want to try other record types because
-- that would potentially cause us to respond with wrong answers (i.e. the contents of an A record if the
-- query for the SRV record failed due to a network error).
goto failed
-- nothing to do, an error
-- fall through to the next entry in our search sequence

elseif records.errcode then
-- dns error: fall through to the next entry in our search sequence
Expand Down Expand Up @@ -1269,7 +1287,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
if records[1].type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then
-- dereference CNAME
opts.qtype = nil
add_status_to_try_list(try_list, "dereferencing")
try_status(try_list, "dereferencing")
return resolve(records[1].cname, opts, dnsCacheOnly, try_list)
end

Expand All @@ -1278,9 +1296,8 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list)
end

-- we had some error, record it in the status list
add_status_to_try_list(try_list, err)
try_status(try_list, err)
end
::failed::

-- we failed, clear cache and return last error
if not dnsCacheOnly then
Expand Down Expand Up @@ -1490,7 +1507,7 @@ local function toip(qname, port, dnsCacheOnly, try_list)
local entry = rec[roundRobinW(rec)]
-- our SRV entry might still contain a hostname, so recurse, with found port number
local srvport = (entry.port ~= 0 and entry.port) or port -- discard port if it is 0
add_status_to_try_list(try_list, "dereferencing SRV")
try_status(try_list, "dereferencing SRV")
return toip(entry.target, srvport, dnsCacheOnly, try_list)
else
-- must be A or AAAA
Expand Down
Loading

0 comments on commit 76f4a85

Please sign in to comment.