From e5fde8c63663639df256eaa6d95fdf2ff33468e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20H=C3=BCbner?= Date: Fri, 11 Aug 2023 16:41:54 +0200 Subject: [PATCH] fix(dns): fix retry and timeout handling (#11386) - Stop retrying in dns/client.lua, let the resolver handle this. This change also makes it possible to disable retries, which previously was not possible - Be more faithful to the timeouts set by the user. Previously, the timeout configured was used only for the ultimate request sent to the DNS server, but asynchronous requests allowed longer timeouts which was not transparent. - When the DNS server fails, stop trying other query types. Previously, the behavior was such that after an (intermediate) failure to query for one record type (say "SRV"), the client would try the next record type (say "A") and succeed with that. It would then return the contents of the "A" record even if the "SRV" record pointed to a different address. - Change domain names used for testing the DNS client into the kong-gateway-testing.link zone, which is controlled by the Kong Gateway team. Fixes https://github.com/Kong/kong/issues/10182 KAG-2300 --- .../fix_dns_retry_and_timeout_handling.yml | 3 + kong/resty/dns/client.lua | 95 +++++++-------- spec/01-unit/21-dns-client/02-client_spec.lua | 110 ++++++++++++++---- .../lua-resty-dns/resty/dns/resolver.lua | 14 ++- 4 files changed, 140 insertions(+), 82 deletions(-) create mode 100644 changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml diff --git a/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml b/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml new file mode 100644 index 00000000000..359666786c5 --- /dev/null +++ b/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml @@ -0,0 +1,3 @@ +message: Update the DNS client to follow configured timeouts in a more predictable manner +type: bugfix +scope: Core diff --git a/kong/resty/dns/client.lua b/kong/resty/dns/client.lua index e5bfafc76c4..9bc3a1761ef 100644 --- a/kong/resty/dns/client.lua +++ b/kong/resty/dns/client.lua @@ -365,9 +365,9 @@ end -- @param self the try_list to add to -- @param status string with current status, added to the list for the current try -- @return the try_list -local function try_status(self, status) - local status_list = self[#self].msg - status_list[#status_list + 1] = status +local function add_status_to_try_list(self, status) + local try_list = self[#self].msg + try_list[#try_list + 1] = status return self end @@ -388,8 +388,7 @@ end -- @section resolving -local poolMaxWait -local poolMaxRetry +local resolve_max_wait --- Initialize the client. Can be called multiple times. When called again it -- will clear the cache. @@ -643,8 +642,7 @@ _M.init = function(options) config = options -- store it in our module level global - poolMaxRetry = 1 -- do one retry, dns resolver is already doing 'retrans' number of retries on top - poolMaxWait = options.timeout / 1000 * options.retrans -- default is to wait for the dns resolver to hit its timeouts + resolve_max_wait = options.timeout / 1000 * options.retrans -- maximum time to wait for the dns resolver to hit its timeouts return true end @@ -677,7 +675,7 @@ local function parseAnswer(qname, qtype, answers, try_list) if (answer.type ~= qtype) or (answer.name ~= check_qname) then local key = answer.type..":"..answer.name - try_status(try_list, key .. " removed") + add_status_to_try_list(try_list, key .. " removed") local lst = others[key] if not lst then lst = {} @@ -714,7 +712,7 @@ local function individualQuery(qname, r_opts, try_list) return r, "failed to create a resolver: " .. err, try_list end - try_status(try_list, "querying") + add_status_to_try_list(try_list, "querying") local result result, err = r:query(qname, r_opts) @@ -747,7 +745,7 @@ local function executeQuery(premature, item) --[[ log(DEBUG, PREFIX, "Query executing: ", item.qname, ":", item.r_opts.qtype, " ", fquery(item)) --]] - try_status(item.try_list, "querying") + add_status_to_try_list(item.try_list, "querying") item.result, item.err = r:query(item.qname, item.r_opts) if item.result then --[[ @@ -791,7 +789,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (exists): ", key, " ", fquery(item)) --]] - try_status(try_list, "in progress (async)") + add_status_to_try_list(try_list, "in progress (async)") return item -- already in progress, return existing query end @@ -813,7 +811,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (scheduled): ", key, " ", fquery(item)) --]] - try_status(try_list, "scheduled") + add_status_to_try_list(try_list, "scheduled") return item end @@ -821,33 +819,24 @@ end -- schedules a sync query. -- This will be synchronized, so multiple calls (sync or async) might result in 1 query. --- The `poolMaxWait` is how long a thread waits for another to complete the query. --- The `poolMaxRetry` is how often we wait for another query to complete. --- The maximum delay would be `poolMaxWait * poolMaxRetry`. +-- The maximum delay would be `options.timeout * options.retrans`. -- @param qname the name to query for -- @param r_opts a table with the query options -- @param try_list the try_list object to add to -- @return `result + nil + try_list`, or `nil + err + try_list` in case of errors -local function syncQuery(qname, r_opts, try_list, count) +local function syncQuery(qname, r_opts, try_list) local key = qname..":"..r_opts.qtype local item = queue[key] - count = count or 1 -- if nothing is in progress, we start a new async query if not item then local err item, err = asyncQuery(qname, r_opts, try_list) - --[[ - log(DEBUG, PREFIX, "Query sync (new): ", key, " ", fquery(item)," count=", count) - --]] if not item then return item, err, try_list end else - --[[ - log(DEBUG, PREFIX, "Query sync (exists): ", key, " ", fquery(item)," count=", count) - --]] - try_status(try_list, "in progress (sync)") + add_status_to_try_list(try_list, "in progress (sync)") end local supported_semaphore_wait_phases = { @@ -872,7 +861,7 @@ local function syncQuery(qname, r_opts, try_list, count) end -- block and wait for the async query to complete - local ok, err = item.semaphore:wait(poolMaxWait) + local ok, err = item.semaphore:wait(resolve_max_wait) if ok and item.result then -- we were released, and have a query result from the -- other thread, so all is well, return it @@ -883,25 +872,16 @@ local function syncQuery(qname, r_opts, try_list, count) return item.result, item.err, try_list end - -- there was an error, either a semaphore timeout, or a lookup error - -- go retry - try_status(try_list, "try "..count.." error: "..(item.err or err or "unknown")) - if count > poolMaxRetry then - --[[ - log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retries exceeded. count=", count) - --]] - return nil, "dns lookup pool exceeded retries (" .. - tostring(poolMaxRetry) .. "): "..tostring(item.err or err), - try_list - end + err = err or item.err or "unknown" + add_status_to_try_list(try_list, "error: "..err) -- don't block on the same thread again, so remove it from the queue - if queue[key] == item then queue[key] = nil end + if queue[key] == item then + queue[key] = nil + end - --[[ - log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retrying. count=", count) - --]] - return syncQuery(qname, r_opts, try_list, count + 1) + -- there was an error, either a semaphore timeout, or a lookup error + return nil, err end -- will lookup a name in the cache, or alternatively query the nameservers. @@ -944,7 +924,7 @@ local function lookup(qname, r_opts, dnsCacheOnly, try_list) try_list = try_add(try_list, qname, r_opts.qtype, "cache-hit") if entry.expired then -- the cached record is stale but usable, so we do a refresh query in the background - try_status(try_list, "stale") + add_status_to_try_list(try_list, "stale") asyncQuery(qname, r_opts, try_list) end @@ -962,7 +942,7 @@ local function check_ipv6(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - try_status(try_list, "cached") + add_status_to_try_list(try_list, "cached") return record, nil, try_list end @@ -982,7 +962,7 @@ local function check_ipv6(qname, qtype, try_list) end if qtype == _M.TYPE_AAAA and check:match("^%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?$") then - try_status(try_list, "validated") + add_status_to_try_list(try_list, "validated") record = {{ address = qname, type = _M.TYPE_AAAA, @@ -994,7 +974,7 @@ local function check_ipv6(qname, qtype, try_list) else -- not a valid IPv6 address, or a bad type (non ipv6) -- return a "server error" - try_status(try_list, "bad IPv6") + add_status_to_try_list(try_list, "bad IPv6") record = { errcode = 103, errstr = clientErrors[103], @@ -1015,12 +995,12 @@ local function check_ipv4(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - try_status(try_list, "cached") + add_status_to_try_list(try_list, "cached") return record, nil, try_list end if qtype == _M.TYPE_A then - try_status(try_list, "validated") + add_status_to_try_list(try_list, "validated") record = {{ address = qname, type = _M.TYPE_A, @@ -1032,7 +1012,7 @@ local function check_ipv4(qname, qtype, try_list) else -- bad query type for this ipv4 address -- return a "server error" - try_status(try_list, "bad IPv4") + add_status_to_try_list(try_list, "bad IPv4") record = { errcode = 102, errstr = clientErrors[102], @@ -1178,7 +1158,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) records = nil -- luacheck: pop err = "recursion detected" - try_status(try_list, err) + add_status_to_try_list(try_list, err) return nil, err, try_list end end @@ -1190,14 +1170,14 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) -- luacheck: push no unused records = nil -- luacheck: pop - try_list = try_status(try_list, "stale") + try_list = add_status_to_try_list(try_list, "stale") else -- a valid non-stale record -- check for CNAME records, and dereferencing the CNAME if (records[1] or EMPTY).type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then opts.qtype = nil - try_status(try_list, "dereferencing CNAME") + add_status_to_try_list(try_list, "dereferencing CNAME") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1245,8 +1225,10 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end if not records then -- luacheck: ignore - -- nothing to do, an error - -- fall through to the next entry in our search sequence + -- An error has occurred, terminate the lookup process. We don't want to try other record types because + -- that would potentially cause us to respond with wrong answers (i.e. the contents of an A record if the + -- query for the SRV record failed due to a network error). + goto failed elseif records.errcode then -- dns error: fall through to the next entry in our search sequence @@ -1305,7 +1287,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) if records[1].type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then -- dereference CNAME opts.qtype = nil - try_status(try_list, "dereferencing CNAME") + add_status_to_try_list(try_list, "dereferencing CNAME") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1314,8 +1296,9 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end -- we had some error, record it in the status list - try_status(try_list, err) + add_status_to_try_list(try_list, err) end + ::failed:: -- we failed, clear cache and return last error if not dnsCacheOnly then @@ -1525,7 +1508,7 @@ local function toip(qname, port, dnsCacheOnly, try_list) local entry = rec[roundRobinW(rec)] -- our SRV entry might still contain a hostname, so recurse, with found port number local srvport = (entry.port ~= 0 and entry.port) or port -- discard port if it is 0 - try_status(try_list, "dereferencing SRV") + add_status_to_try_list(try_list, "dereferencing SRV") return toip(entry.target, srvport, dnsCacheOnly, try_list) else -- must be A or AAAA diff --git a/spec/01-unit/21-dns-client/02-client_spec.lua b/spec/01-unit/21-dns-client/02-client_spec.lua index 37256b6402b..42e20a716bc 100644 --- a/spec/01-unit/21-dns-client/02-client_spec.lua +++ b/spec/01-unit/21-dns-client/02-client_spec.lua @@ -3,6 +3,13 @@ local tempfilename = require("pl.path").tmpname local pretty = require("pl.pretty").write +-- Several DNS tests use the actual DNS to verify the client behavior against real name servers. It seems that +-- even though we have a DNS mocking system, it is good to have some coverage against actual servers to ensure that +-- we're not relying on mocked behavior. We use the domain name kong-gateway-testing.link, which is hosted in Route53 +-- in the AWS sandbox, allowing Gateway developers to make additions if required. +local TEST_DOMAIN="kong-gateway-testing.link" + + -- empty records and not found errors should be identical, hence we -- define a constant for that error message local NOT_FOUND_ERROR = "dns server error: 3 name error" @@ -81,7 +88,7 @@ describe("[DNS client]", function() resolvConf = {"nameserver [fe80::1%enp0s20f0u1u1]"}, }) end) - local ip, port = client.toip("thijsschreijer.nl") + local ip, port = client.toip(TEST_DOMAIN) assert.is_nil(ip) assert.not_matches([[failed to parse host name "[fe80::1%enp0s20f0u1u1]": invalid IPv6 address]], port, nil, true) assert.matches([[failed to create a resolver: no nameservers specified]], port, nil, true) @@ -559,13 +566,68 @@ describe("[DNS client]", function() }, list) end) + for retrans in ipairs({1, 2}) do + for _, timeout in ipairs({1, 2}) do + it("correctly observes #timeout of " .. tostring(timeout) .. " seconds with " .. tostring(retrans) .. " retries", function() + -- KAG-2300 - https://github.com/Kong/kong/issues/10182 + -- If we encounter a timeout while talking to the DNS server, expect the total timeout to be close to the + -- configured timeout * retrans parameters + assert(client.init({ + resolvConf = { + "nameserver 198.51.100.0", + "domain one.com", + }, + timeout = timeout * 1000, + retrans = retrans, + hosts = { + "127.0.0.1 host" + } + })) + query_func = function(self, original_query_func, name, options) + ngx.sleep(5) + return nil + end + local start_time = ngx.now() + client.resolve("host1.one.com.") + local duration = ngx.now() - start_time + assert.truthy(duration < (timeout * retrans + 1)) + end) + end + end + + -- The domain name below needs to have both a SRV and an A record + local SRV_A_TEST_NAME = "timeouttest."..TEST_DOMAIN + + it("verify correctly set up test DNS entry", function() + assert(client.init({ timeout = 1000, retrans = 2 })) + local answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_SRV}) + assert.same(client.TYPE_SRV, answers[1].type) + answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_A}) + assert.same(client.TYPE_A, answers[1].type) + end) + + it("does not respond with incorrect answer on transient failure", function() + -- KAG-2300 - https://github.com/Kong/kong/issues/10182 + -- If we encounter a timeout while talking to the DNS server, don't keep trying with other record types + assert(client.init({ timeout = 1000, retrans = 2 })) + query_func = function(self, original_query_func, name, options) + if options.qtype == client.TYPE_SRV then + ngx.sleep(10) + else + return original_query_func(self, name, options) + end + end + local answers = client.resolve(SRV_A_TEST_NAME) + assert.is_nil(answers) + end) + end) it("fetching a record without nameservers errors", function() assert(client.init({ resolvConf = {} })) - local host = "thijsschreijer.nl" + local host = TEST_DOMAIN local typ = client.TYPE_A local answers, err, _ = client.resolve(host, { qtype = typ }) @@ -576,7 +638,7 @@ describe("[DNS client]", function() it("fetching a TXT record", function() assert(client.init()) - local host = "txttest.thijsschreijer.nl" + local host = "txttest."..TEST_DOMAIN local typ = client.TYPE_TXT local answers, err, try_list = client.resolve(host, { qtype = typ }) @@ -589,7 +651,7 @@ describe("[DNS client]", function() it("fetching a CNAME record", function() assert(client.init()) - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_CNAME local answers = assert(client.resolve(host, { qtype = typ })) @@ -601,7 +663,7 @@ describe("[DNS client]", function() it("fetching a CNAME record FQDN", function() assert(client.init()) - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_CNAME local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -613,7 +675,7 @@ describe("[DNS client]", function() it("expire and touch times", function() assert(client.init()) - local host = "txttest.thijsschreijer.nl" + local host = "txttest."..TEST_DOMAIN local typ = client.TYPE_TXT local answers, _, _ = assert(client.resolve(host, { qtype = typ })) @@ -669,7 +731,7 @@ describe("[DNS client]", function() it("fetching multiple A records", function() assert(client.init()) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local typ = client.TYPE_A local answers = assert(client.resolve(host, { qtype = typ })) @@ -683,7 +745,7 @@ describe("[DNS client]", function() it("fetching multiple A records FQDN", function() assert(client.init()) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local typ = client.TYPE_A local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -712,20 +774,20 @@ describe("[DNS client]", function() This does not affect client side code, as the result is always the final A record. --]] - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_A local answers, _, _ = assert(client.resolve(host)) -- check first CNAME local key1 = client.TYPE_CNAME..":"..host local entry1 = lrucache:get(key1) - assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.thijsschreijer.nl' + assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.'..TEST_DOMAIN assert.are.equal(client.TYPE_CNAME, entry1[1].type) -- and that is a CNAME -- check second CNAME local key2 = client.TYPE_CNAME..":"..entry1[1].cname local entry2 = lrucache:get(key2) - assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.thijsschreijer.nl' + assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.'..TEST_DOMAIN assert.are.equal(client.TYPE_CNAME, entry2[1].type) -- and that is also a CNAME -- check second target to match final record @@ -747,7 +809,7 @@ describe("[DNS client]", function() it("fetching multiple SRV records (un-typed)", function() assert(client.init()) - local host = "srvtest.thijsschreijer.nl" + local host = "srvtest."..TEST_DOMAIN local typ = client.TYPE_SRV -- un-typed lookup @@ -765,7 +827,7 @@ describe("[DNS client]", function() assert(client.init({ search = {}, })) local lrucache = client.getcache() - local host = "cname2srv.thijsschreijer.nl" + local host = "cname2srv."..TEST_DOMAIN local typ = client.TYPE_SRV -- un-typed lookup @@ -795,7 +857,7 @@ describe("[DNS client]", function() search = {}, })) - local host = "srvtest.thijsschreijer.nl" + local host = "srvtest."..TEST_DOMAIN local typ = client.TYPE_A --> the entry is SRV not A local answers, err, _ = client.resolve(host, {qtype = typ}) @@ -811,7 +873,7 @@ describe("[DNS client]", function() search = {}, })) - local host = "IsNotHere.thijsschreijer.nl" + local host = "IsNotHere."..TEST_DOMAIN local answers, err, _ = client.resolve(host) assert.is_nil(answers) @@ -1101,7 +1163,7 @@ describe("[DNS client]", function() describe("toip() function", function() it("A/AAAA-record, round-robin",function() assert(client.init({ search = {}, })) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local answers = assert(client.resolve(host)) answers.last_index = nil -- make sure to clean local ips = {} @@ -1305,11 +1367,12 @@ describe("[DNS client]", function() assert.is_number(port) assert.is_not.equal(0, port) end) + it("port passing if SRV port=0",function() assert(client.init({ search = {}, })) local ip, port, host - host = "srvport0.thijsschreijer.nl" + host = "srvport0."..TEST_DOMAIN ip, port = client.toip(host, 10) assert.is_string(ip) assert.is_number(port) @@ -1319,10 +1382,11 @@ describe("[DNS client]", function() assert.is_string(ip) assert.is_nil(port) end) + it("recursive SRV pointing to itself",function() assert(client.init({ search = {}, })) local ip, record, port, host, err, _ - host = "srvrecurse.thijsschreijer.nl" + host = "srvrecurse."..TEST_DOMAIN -- resolve SRV specific should return the record including its -- recursive entry @@ -1477,7 +1541,7 @@ describe("[DNS client]", function() --empty/error responses should be cached for a configurable time local emptyTtl = 0.1 local staleTtl = 0.1 - local qname = "really.really.really.does.not.exist.thijsschreijer.nl" + local qname = "really.really.really.does.not.exist."..TEST_DOMAIN assert(client.init({ emptyTtl = emptyTtl, staleTtl = staleTtl, @@ -1645,7 +1709,7 @@ describe("[DNS client]", function() -- starting resolving coroutine.yield(coroutine.running()) local result, _, _ = client.resolve( - "thijsschreijer.nl", + TEST_DOMAIN, { qtype = client.TYPE_A } ) table.insert(results, result) @@ -1689,7 +1753,7 @@ describe("[DNS client]", function() })) -- insert a stub thats waits and returns a fixed record - local name = "thijsschreijer.nl" + local name = TEST_DOMAIN query_func = function() local ip = "1.4.2.3" local entry = { @@ -1735,7 +1799,7 @@ describe("[DNS client]", function() -- all results are equal, as they all will wait for the first response for i = 1, 10 do - assert.equal("dns lookup pool exceeded retries (1): timeout", results[i]) + assert.equal("timeout", results[i]) end end) end) @@ -1752,7 +1816,7 @@ describe("[DNS client]", function() -- insert a stub thats waits and returns a fixed record local call_count = 0 - local name = "thijsschreijer.nl" + local name = TEST_DOMAIN query_func = function() local ip = "1.4.2.3" local entry = { diff --git a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua index f6d56e71b86..cc59178fe09 100644 --- a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua +++ b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua @@ -95,10 +95,18 @@ resolver.query = function(self, name, options, tries) end if not mocks_only then - -- no mock, so invoke original resolver - local a, b, c = old_query(self, name, options, tries) - return a, b, c + -- No mock, so invoke original resolver. Note that if the original resolver fails (i.e. because an + -- invalid domain name like example.com was used), we return an empty result set instead of passing + -- the error up to the caller. This is done so that if the mock contains "A" records (which would + -- be the most common case), the initial query for a SRV record does not fail, but appear not to have + -- yielded any results. This will make dns/client.lua try finding an A record next. + local records, err, tries = old_query(self, name, options, tries) + if records then + return records, err, tries + end end + + return {}, nil, tries end -- do