diff --git a/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml b/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml deleted file mode 100644 index 359666786c56..000000000000 --- a/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml +++ /dev/null @@ -1,3 +0,0 @@ -message: Update the DNS client to follow configured timeouts in a more predictable manner -type: bugfix -scope: Core diff --git a/kong/resty/dns/client.lua b/kong/resty/dns/client.lua index c5bfc55821ff..b361a25c020a 100644 --- a/kong/resty/dns/client.lua +++ b/kong/resty/dns/client.lua @@ -360,9 +360,9 @@ end -- @param self the try_list to add to -- @param status string with current status, added to the list for the current try -- @return the try_list -local function add_status_to_try_list(self, status) - local try_list = self[#self].msg - try_list[#try_list + 1] = status +local function try_status(self, status) + local status_list = self[#self].msg + status_list[#status_list + 1] = status return self end @@ -383,7 +383,8 @@ end -- @section resolving -local resolve_max_wait +local poolMaxWait +local poolMaxRetry --- Initialize the client. Can be called multiple times. When called again it -- will clear the cache. @@ -637,7 +638,8 @@ _M.init = function(options) config = options -- store it in our module level global - resolve_max_wait = options.timeout / 1000 * options.retrans -- maximum time to wait for the dns resolver to hit its timeouts + poolMaxRetry = 1 -- do one retry, dns resolver is already doing 'retrans' number of retries on top + poolMaxWait = options.timeout / 1000 * options.retrans -- default is to wait for the dns resolver to hit its timeouts return true end @@ -670,7 +672,7 @@ local function parseAnswer(qname, qtype, answers, try_list) if (answer.type ~= qtype) or (answer.name ~= check_qname) then local key = answer.type..":"..answer.name - add_status_to_try_list(try_list, key .. " removed") + try_status(try_list, key .. " removed") local lst = others[key] if not lst then lst = {} @@ -708,7 +710,7 @@ local function individualQuery(qname, r_opts, try_list) return r, "failed to create a resolver: " .. err, try_list end - add_status_to_try_list(try_list, "querying") + try_status(try_list, "querying") local result result, err = r:query(qname, r_opts) @@ -735,7 +737,7 @@ local function executeQuery(premature, item) --[[ log(DEBUG, PREFIX, "Query executing: ", item.qname, ":", item.r_opts.qtype, " ", fquery(item)) --]] - add_status_to_try_list(item.try_list, "querying") + try_status(item.try_list, "querying") item.result, item.err = r:query(item.qname, item.r_opts) if item.result then --[[ @@ -777,7 +779,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (exists): ", key, " ", fquery(item)) --]] - add_status_to_try_list(try_list, "in progress (async)") + try_status(try_list, "in progress (async)") return item -- already in progress, return existing query end @@ -799,7 +801,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (scheduled): ", key, " ", fquery(item)) --]] - add_status_to_try_list(try_list, "scheduled") + try_status(try_list, "scheduled") return item end @@ -807,24 +809,33 @@ end -- schedules a sync query. -- This will be synchronized, so multiple calls (sync or async) might result in 1 query. --- The maximum delay would be `options.timeout * options.retrans`. +-- The `poolMaxWait` is how long a thread waits for another to complete the query. +-- The `poolMaxRetry` is how often we wait for another query to complete. +-- The maximum delay would be `poolMaxWait * poolMaxRetry`. -- @param qname the name to query for -- @param r_opts a table with the query options -- @param try_list the try_list object to add to -- @return `result + nil + try_list`, or `nil + err + try_list` in case of errors -local function syncQuery(qname, r_opts, try_list) +local function syncQuery(qname, r_opts, try_list, count) local key = qname..":"..r_opts.qtype local item = queue[key] + count = count or 1 -- if nothing is in progress, we start a new async query if not item then local err item, err = asyncQuery(qname, r_opts, try_list) + --[[ + log(DEBUG, PREFIX, "Query sync (new): ", key, " ", fquery(item)," count=", count) + --]] if not item then return item, err, try_list end else - add_status_to_try_list(try_list, "in progress (sync)") + --[[ + log(DEBUG, PREFIX, "Query sync (exists): ", key, " ", fquery(item)," count=", count) + --]] + try_status(try_list, "in progress (sync)") end local supported_semaphore_wait_phases = { @@ -849,7 +860,7 @@ local function syncQuery(qname, r_opts, try_list) end -- block and wait for the async query to complete - local ok, err = item.semaphore:wait(resolve_max_wait) + local ok, err = item.semaphore:wait(poolMaxWait) if ok and item.result then -- we were released, and have a query result from the -- other thread, so all is well, return it @@ -860,16 +871,25 @@ local function syncQuery(qname, r_opts, try_list) return item.result, item.err, try_list end - err = err or item.err or "unknown" - add_status_to_try_list(try_list, "error: "..err) + -- there was an error, either a semaphore timeout, or a lookup error + -- go retry + try_status(try_list, "try "..count.." error: "..(item.err or err or "unknown")) + if count > poolMaxRetry then + --[[ + log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retries exceeded. count=", count) + --]] + return nil, "dns lookup pool exceeded retries (" .. + tostring(poolMaxRetry) .. "): "..tostring(item.err or err), + try_list + end -- don't block on the same thread again, so remove it from the queue - if queue[key] == item then - queue[key] = nil - end + if queue[key] == item then queue[key] = nil end - -- there was an error, either a semaphore timeout, or a lookup error - return nil, err + --[[ + log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retrying. count=", count) + --]] + return syncQuery(qname, r_opts, try_list, count + 1) end -- will lookup a name in the cache, or alternatively query the nameservers. @@ -908,7 +928,7 @@ local function lookup(qname, r_opts, dnsCacheOnly, try_list) try_list = try_add(try_list, qname, r_opts.qtype, "cache-hit") if entry.expired then -- the cached record is stale but usable, so we do a refresh query in the background - add_status_to_try_list(try_list, "stale") + try_status(try_list, "stale") asyncQuery(qname, r_opts, try_list) end @@ -926,7 +946,7 @@ local function check_ipv6(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - add_status_to_try_list(try_list, "cached") + try_status(try_list, "cached") return record, nil, try_list end @@ -946,7 +966,7 @@ local function check_ipv6(qname, qtype, try_list) end if qtype == _M.TYPE_AAAA and check:match("^%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?$") then - add_status_to_try_list(try_list, "validated") + try_status(try_list, "validated") record = {{ address = qname, type = _M.TYPE_AAAA, @@ -958,7 +978,7 @@ local function check_ipv6(qname, qtype, try_list) else -- not a valid IPv6 address, or a bad type (non ipv6) -- return a "server error" - add_status_to_try_list(try_list, "bad IPv6") + try_status(try_list, "bad IPv6") record = { errcode = 3, errstr = "name error", @@ -979,12 +999,12 @@ local function check_ipv4(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - add_status_to_try_list(try_list, "cached") + try_status(try_list, "cached") return record, nil, try_list end if qtype == _M.TYPE_A then - add_status_to_try_list(try_list, "validated") + try_status(try_list, "validated") record = {{ address = qname, type = _M.TYPE_A, @@ -996,7 +1016,7 @@ local function check_ipv4(qname, qtype, try_list) else -- bad query type for this ipv4 address -- return a "server error" - add_status_to_try_list(try_list, "bad IPv4") + try_status(try_list, "bad IPv4") record = { errcode = 3, errstr = "name error", @@ -1140,7 +1160,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) records = nil -- luacheck: pop err = "recursion detected" - add_status_to_try_list(try_list, "recursion detected") + try_status(try_list, "recursion detected") return nil, err, try_list end end @@ -1152,14 +1172,14 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) -- luacheck: push no unused records = nil -- luacheck: pop - try_list = add_status_to_try_list(try_list, "stale") + try_list = try_status(try_list, "stale") else -- a valid non-stale record -- check for CNAME records, and dereferencing the CNAME if (records[1] or EMPTY).type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then opts.qtype = nil - add_status_to_try_list(try_list, "dereferencing") + try_status(try_list, "dereferencing") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1207,10 +1227,8 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end if not records then -- luacheck: ignore - -- An error has occurred, terminate the lookup process. We don't want to try other record types because - -- that would potentially cause us to respond with wrong answers (i.e. the contents of an A record if the - -- query for the SRV record failed due to a network error). - goto failed + -- nothing to do, an error + -- fall through to the next entry in our search sequence elseif records.errcode then -- dns error: fall through to the next entry in our search sequence @@ -1269,7 +1287,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) if records[1].type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then -- dereference CNAME opts.qtype = nil - add_status_to_try_list(try_list, "dereferencing") + try_status(try_list, "dereferencing") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1278,9 +1296,8 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end -- we had some error, record it in the status list - add_status_to_try_list(try_list, err) + try_status(try_list, err) end - ::failed:: -- we failed, clear cache and return last error if not dnsCacheOnly then @@ -1490,7 +1507,7 @@ local function toip(qname, port, dnsCacheOnly, try_list) local entry = rec[roundRobinW(rec)] -- our SRV entry might still contain a hostname, so recurse, with found port number local srvport = (entry.port ~= 0 and entry.port) or port -- discard port if it is 0 - add_status_to_try_list(try_list, "dereferencing SRV") + try_status(try_list, "dereferencing SRV") return toip(entry.target, srvport, dnsCacheOnly, try_list) else -- must be A or AAAA diff --git a/spec/01-unit/21-dns-client/02-client_spec.lua b/spec/01-unit/21-dns-client/02-client_spec.lua index 106e47fde1b3..586ab0edeb85 100644 --- a/spec/01-unit/21-dns-client/02-client_spec.lua +++ b/spec/01-unit/21-dns-client/02-client_spec.lua @@ -3,13 +3,6 @@ local tempfilename = require("pl.path").tmpname local pretty = require("pl.pretty").write --- Several DNS tests use the actual DNS to verify the client behavior against real name servers. It seems that --- even though we have a DNS mocking system, it is good to have some coverage against actual servers to ensure that --- we're not relying on mocked behavior. We use the domain name kong-gateway-testing.link, which is hosted in Route53 --- in the AWS sandbox, allowing Gateway developers to make additions if required. -local TEST_DOMAIN="kong-gateway-testing.link" - - -- empty records and not found errors should be identical, hence we -- define a constant for that error message local NOT_FOUND_ERROR = "dns server error: 3 name error" @@ -86,7 +79,7 @@ describe("[DNS client]", function() resolvConf = {"nameserver [fe80::1%enp0s20f0u1u1]"}, }) end) - local ip, port = client.toip(TEST_DOMAIN) + local ip, port = client.toip("thijsschreijer.nl") assert.is_nil(ip) assert.not_matches([[failed to parse host name "[fe80::1%enp0s20f0u1u1]": invalid IPv6 address]], port, nil, true) assert.matches([[failed to create a resolver: no nameservers specified]], port, nil, true) @@ -564,68 +557,13 @@ describe("[DNS client]", function() }, list) end) - for retrans in ipairs({1, 2}) do - for _, timeout in ipairs({1, 2}) do - it("correctly observes #timeout of " .. tostring(timeout) .. " seconds with " .. tostring(retrans) .. " retries", function() - -- KAG-2300 - https://github.com/Kong/kong/issues/10182 - -- If we encounter a timeout while talking to the DNS server, expect the total timeout to be close to the - -- configured timeout * retrans parameters - assert(client.init({ - resolvConf = { - "nameserver 198.51.100.0", - "domain one.com", - }, - timeout = timeout * 1000, - retrans = retrans, - hosts = { - "127.0.0.1 host" - } - })) - query_func = function(self, original_query_func, name, options) - ngx.sleep(5) - return nil - end - local start_time = ngx.now() - client.resolve("host1.one.com.") - local duration = ngx.now() - start_time - assert.truthy(duration < (timeout * retrans + 1)) - end) - end - end - - -- The domain name below needs to have both a SRV and an A record - local SRV_A_TEST_NAME = "timeouttest."..TEST_DOMAIN - - it("verify correctly set up test DNS entry", function() - assert(client.init({ timeout = 1000, retrans = 2 })) - local answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_SRV}) - assert.same(client.TYPE_SRV, answers[1].type) - answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_A}) - assert.same(client.TYPE_A, answers[1].type) - end) - - it("does not respond with incorrect answer on transient failure", function() - -- KAG-2300 - https://github.com/Kong/kong/issues/10182 - -- If we encounter a timeout while talking to the DNS server, don't keep trying with other record types - assert(client.init({ timeout = 1000, retrans = 2 })) - query_func = function(self, original_query_func, name, options) - if options.qtype == client.TYPE_SRV then - ngx.sleep(10) - else - return original_query_func(self, name, options) - end - end - local answers = client.resolve(SRV_A_TEST_NAME) - assert.is_nil(answers) - end) - end) it("fetching a record without nameservers errors", function() assert(client.init({ resolvConf = {} })) - local host = TEST_DOMAIN + local host = "thijsschreijer.nl" local typ = client.TYPE_A local answers, err, _ = client.resolve(host, { qtype = typ }) @@ -636,7 +574,7 @@ describe("[DNS client]", function() it("fetching a TXT record", function() assert(client.init()) - local host = "txttest."..TEST_DOMAIN + local host = "txttest.thijsschreijer.nl" local typ = client.TYPE_TXT local answers, err, try_list = client.resolve(host, { qtype = typ }) @@ -649,7 +587,7 @@ describe("[DNS client]", function() it("fetching a CNAME record", function() assert(client.init()) - local host = "smtp."..TEST_DOMAIN + local host = "smtp.thijsschreijer.nl" local typ = client.TYPE_CNAME local answers = assert(client.resolve(host, { qtype = typ })) @@ -661,7 +599,7 @@ describe("[DNS client]", function() it("fetching a CNAME record FQDN", function() assert(client.init()) - local host = "smtp."..TEST_DOMAIN + local host = "smtp.thijsschreijer.nl" local typ = client.TYPE_CNAME local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -673,7 +611,7 @@ describe("[DNS client]", function() it("expire and touch times", function() assert(client.init()) - local host = "txttest."..TEST_DOMAIN + local host = "txttest.thijsschreijer.nl" local typ = client.TYPE_TXT local answers, _, _ = assert(client.resolve(host, { qtype = typ })) @@ -729,7 +667,7 @@ describe("[DNS client]", function() it("fetching multiple A records", function() assert(client.init()) - local host = "atest."..TEST_DOMAIN + local host = "atest.thijsschreijer.nl" local typ = client.TYPE_A local answers = assert(client.resolve(host, { qtype = typ })) @@ -743,7 +681,7 @@ describe("[DNS client]", function() it("fetching multiple A records FQDN", function() assert(client.init()) - local host = "atest."..TEST_DOMAIN + local host = "atest.thijsschreijer.nl" local typ = client.TYPE_A local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -772,20 +710,20 @@ describe("[DNS client]", function() This does not affect client side code, as the result is always the final A record. --]] - local host = "smtp."..TEST_DOMAIN + local host = "smtp.thijsschreijer.nl" local typ = client.TYPE_A local answers, _, _ = assert(client.resolve(host)) -- check first CNAME local key1 = client.TYPE_CNAME..":"..host local entry1 = lrucache:get(key1) - assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.'..TEST_DOMAIN + assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.thijsschreijer.nl' assert.are.equal(client.TYPE_CNAME, entry1[1].type) -- and that is a CNAME -- check second CNAME local key2 = client.TYPE_CNAME..":"..entry1[1].cname local entry2 = lrucache:get(key2) - assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.'..TEST_DOMAIN + assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.thijsschreijer.nl' assert.are.equal(client.TYPE_CNAME, entry2[1].type) -- and that is also a CNAME -- check second target to match final record @@ -807,7 +745,7 @@ describe("[DNS client]", function() it("fetching multiple SRV records (un-typed)", function() assert(client.init()) - local host = "srvtest."..TEST_DOMAIN + local host = "srvtest.thijsschreijer.nl" local typ = client.TYPE_SRV -- un-typed lookup @@ -825,7 +763,7 @@ describe("[DNS client]", function() assert(client.init({ search = {}, })) local lrucache = client.getcache() - local host = "cname2srv."..TEST_DOMAIN + local host = "cname2srv.thijsschreijer.nl" local typ = client.TYPE_SRV -- un-typed lookup @@ -855,7 +793,7 @@ describe("[DNS client]", function() }, })) - local host = "srvtest."..TEST_DOMAIN + local host = "srvtest.thijsschreijer.nl" local typ = client.TYPE_A --> the entry is SRV not A local answers, err, _ = client.resolve(host, {qtype = typ}) @@ -871,7 +809,7 @@ describe("[DNS client]", function() }, })) - local host = "IsNotHere."..TEST_DOMAIN + local host = "IsNotHere.thijsschreijer.nl" local answers, err, _ = client.resolve(host) assert.is_nil(answers) @@ -1161,7 +1099,7 @@ describe("[DNS client]", function() describe("toip() function", function() it("A/AAAA-record, round-robin",function() assert(client.init({ search = {}, })) - local host = "atest."..TEST_DOMAIN + local host = "atest.thijsschreijer.nl" local answers = assert(client.resolve(host)) answers.last_index = nil -- make sure to clean local ips = {} @@ -1365,12 +1303,11 @@ describe("[DNS client]", function() assert.is_number(port) assert.is_not.equal(0, port) end) - it("port passing if SRV port=0",function() assert(client.init({ search = {}, })) local ip, port, host - host = "srvport0."..TEST_DOMAIN + host = "srvport0.thijsschreijer.nl" ip, port = client.toip(host, 10) assert.is_string(ip) assert.is_number(port) @@ -1380,7 +1317,6 @@ describe("[DNS client]", function() assert.is_string(ip) assert.is_nil(port) end) - it("recursive SRV pointing to itself",function() assert(client.init({ resolvConf = { @@ -1389,7 +1325,7 @@ describe("[DNS client]", function() }, })) local ip, record, port, host, err, _ - host = "srvrecurse."..TEST_DOMAIN + host = "srvrecurse.thijsschreijer.nl" -- resolve SRV specific should return the record including its -- recursive entry @@ -1544,7 +1480,7 @@ describe("[DNS client]", function() --empty/error responses should be cached for a configurable time local emptyTtl = 0.1 local staleTtl = 0.1 - local qname = "really.really.really.does.not.exist."..TEST_DOMAIN + local qname = "really.really.really.does.not.exist.thijsschreijer.nl" assert(client.init({ emptyTtl = emptyTtl, staleTtl = staleTtl, @@ -1712,7 +1648,7 @@ describe("[DNS client]", function() -- starting resolving coroutine.yield(coroutine.running()) local result, _, _ = client.resolve( - TEST_DOMAIN, + "thijsschreijer.nl", { qtype = client.TYPE_A } ) table.insert(results, result) @@ -1756,7 +1692,7 @@ describe("[DNS client]", function() })) -- insert a stub thats waits and returns a fixed record - local name = TEST_DOMAIN + local name = "thijsschreijer.nl" query_func = function() local ip = "1.4.2.3" local entry = { @@ -1802,7 +1738,7 @@ describe("[DNS client]", function() -- all results are equal, as they all will wait for the first response for i = 1, 10 do - assert.equal("timeout", results[i]) + assert.equal("dns lookup pool exceeded retries (1): timeout", results[i]) end end) end) @@ -1819,7 +1755,7 @@ describe("[DNS client]", function() -- insert a stub thats waits and returns a fixed record local call_count = 0 - local name = TEST_DOMAIN + local name = "thijsschreijer.nl" query_func = function() local ip = "1.4.2.3" local entry = { diff --git a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua index f1b83355ab7a..75dcea922b97 100644 --- a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua +++ b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua @@ -95,18 +95,10 @@ resolver.query = function(self, name, options, tries) end if not mocks_only then - -- No mock, so invoke original resolver. Note that if the original resolver fails (i.e. because an - -- invalid domain name like example.com was used), we return an empty result set instead of passing - -- the error up to the caller. This is done so that if the mock contains "A" records (which would - -- be the most common case), the initial query for a SRV record does not fail, but appear not to have - -- yielded any results. This will make dns/client.lua try finding an A record next. - local records, err, tries = old_query(self, name, options, tries) - if records then - return records, err, tries - end + -- no mock, so invoke original resolver + local a, b, c = old_query(self, name, options, tries) + return a, b, c end - - return {}, nil, tries end do