diff --git a/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml b/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml new file mode 100644 index 00000000000..359666786c5 --- /dev/null +++ b/changelog/unreleased/kong/fix_dns_retry_and_timeout_handling.yml @@ -0,0 +1,3 @@ +message: Update the DNS client to follow configured timeouts in a more predictable manner +type: bugfix +scope: Core diff --git a/kong/resty/dns/client.lua b/kong/resty/dns/client.lua index 3efa99312c6..bc498c9ddcb 100644 --- a/kong/resty/dns/client.lua +++ b/kong/resty/dns/client.lua @@ -365,9 +365,9 @@ end -- @param self the try_list to add to -- @param status string with current status, added to the list for the current try -- @return the try_list -local function try_status(self, status) - local status_list = self[#self].msg - status_list[#status_list + 1] = status +local function add_status_to_try_list(self, status) + local try_list = self[#self].msg + try_list[#try_list + 1] = status return self end @@ -388,8 +388,7 @@ end -- @section resolving -local poolMaxWait -local poolMaxRetry +local resolve_max_wait --- Initialize the client. Can be called multiple times. When called again it -- will clear the cache. @@ -643,8 +642,7 @@ _M.init = function(options) config = options -- store it in our module level global - poolMaxRetry = 1 -- do one retry, dns resolver is already doing 'retrans' number of retries on top - poolMaxWait = options.timeout / 1000 * options.retrans -- default is to wait for the dns resolver to hit its timeouts + resolve_max_wait = options.timeout / 1000 * options.retrans -- maximum time to wait for the dns resolver to hit its timeouts return true end @@ -677,7 +675,7 @@ local function parseAnswer(qname, qtype, answers, try_list) if (answer.type ~= qtype) or (answer.name ~= check_qname) then local key = answer.type..":"..answer.name - try_status(try_list, key .. " removed") + add_status_to_try_list(try_list, key .. " removed") local lst = others[key] if not lst then lst = {} @@ -714,7 +712,7 @@ local function individualQuery(qname, r_opts, try_list) return r, "failed to create a resolver: " .. err, try_list end - try_status(try_list, "querying") + add_status_to_try_list(try_list, "querying") local result result, err = r:query(qname, r_opts) @@ -747,7 +745,7 @@ local function executeQuery(premature, item) --[[ log(DEBUG, PREFIX, "Query executing: ", item.qname, ":", item.r_opts.qtype, " ", fquery(item)) --]] - try_status(item.try_list, "querying") + add_status_to_try_list(item.try_list, "querying") item.result, item.err = r:query(item.qname, item.r_opts) if item.result then --[[ @@ -772,7 +770,9 @@ local function executeQuery(premature, item) item.semaphore = nil ngx.sleep(0) -- 3) destroy the resolver -- ditto in individualQuery - r:destroy() + if r then + r:destroy() + end end @@ -791,7 +791,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (exists): ", key, " ", fquery(item)) --]] - try_status(try_list, "in progress (async)") + add_status_to_try_list(try_list, "in progress (async)") return item -- already in progress, return existing query end @@ -813,7 +813,7 @@ local function asyncQuery(qname, r_opts, try_list) --[[ log(DEBUG, PREFIX, "Query async (scheduled): ", key, " ", fquery(item)) --]] - try_status(try_list, "scheduled") + add_status_to_try_list(try_list, "scheduled") return item end @@ -821,33 +821,24 @@ end -- schedules a sync query. -- This will be synchronized, so multiple calls (sync or async) might result in 1 query. --- The `poolMaxWait` is how long a thread waits for another to complete the query. --- The `poolMaxRetry` is how often we wait for another query to complete. --- The maximum delay would be `poolMaxWait * poolMaxRetry`. +-- The maximum delay would be `options.timeout * options.retrans`. -- @param qname the name to query for -- @param r_opts a table with the query options -- @param try_list the try_list object to add to -- @return `result + nil + try_list`, or `nil + err + try_list` in case of errors -local function syncQuery(qname, r_opts, try_list, count) +local function syncQuery(qname, r_opts, try_list) local key = qname..":"..r_opts.qtype local item = queue[key] - count = count or 1 -- if nothing is in progress, we start a new async query if not item then local err item, err = asyncQuery(qname, r_opts, try_list) - --[[ - log(DEBUG, PREFIX, "Query sync (new): ", key, " ", fquery(item)," count=", count) - --]] if not item then return item, err, try_list end else - --[[ - log(DEBUG, PREFIX, "Query sync (exists): ", key, " ", fquery(item)," count=", count) - --]] - try_status(try_list, "in progress (sync)") + add_status_to_try_list(try_list, "in progress (sync)") end local supported_semaphore_wait_phases = { @@ -872,7 +863,7 @@ local function syncQuery(qname, r_opts, try_list, count) end -- block and wait for the async query to complete - local ok, err = item.semaphore:wait(poolMaxWait) + local ok, err = item.semaphore:wait(resolve_max_wait) if ok and item.result then -- we were released, and have a query result from the -- other thread, so all is well, return it @@ -883,25 +874,16 @@ local function syncQuery(qname, r_opts, try_list, count) return item.result, item.err, try_list end - -- there was an error, either a semaphore timeout, or a lookup error - -- go retry - try_status(try_list, "try "..count.." error: "..(item.err or err or "unknown")) - if count > poolMaxRetry then - --[[ - log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retries exceeded. count=", count) - --]] - return nil, "dns lookup pool exceeded retries (" .. - tostring(poolMaxRetry) .. "): "..tostring(item.err or err), - try_list - end + err = err or item.err or "unknown" + add_status_to_try_list(try_list, "error: "..err) -- don't block on the same thread again, so remove it from the queue - if queue[key] == item then queue[key] = nil end + if queue[key] == item then + queue[key] = nil + end - --[[ - log(DEBUG, PREFIX, "Query sync (fail): ", key, " ", fquery(item)," retrying. count=", count) - --]] - return syncQuery(qname, r_opts, try_list, count + 1) + -- there was an error, either a semaphore timeout, or a lookup error + return nil, err end -- will lookup a name in the cache, or alternatively query the nameservers. @@ -944,7 +926,7 @@ local function lookup(qname, r_opts, dnsCacheOnly, try_list) try_list = try_add(try_list, qname, r_opts.qtype, "cache-hit") if entry.expired then -- the cached record is stale but usable, so we do a refresh query in the background - try_status(try_list, "stale") + add_status_to_try_list(try_list, "stale") asyncQuery(qname, r_opts, try_list) end @@ -962,7 +944,7 @@ local function check_ipv6(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - try_status(try_list, "cached") + add_status_to_try_list(try_list, "cached") return record, nil, try_list end @@ -982,7 +964,7 @@ local function check_ipv6(qname, qtype, try_list) end if qtype == _M.TYPE_AAAA and check:match("^%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?:%x%x?%x?%x?$") then - try_status(try_list, "validated") + add_status_to_try_list(try_list, "validated") record = {{ address = qname, type = _M.TYPE_AAAA, @@ -994,7 +976,7 @@ local function check_ipv6(qname, qtype, try_list) else -- not a valid IPv6 address, or a bad type (non ipv6) -- return a "server error" - try_status(try_list, "bad IPv6") + add_status_to_try_list(try_list, "bad IPv6") record = { errcode = 103, errstr = clientErrors[103], @@ -1015,12 +997,12 @@ local function check_ipv4(qname, qtype, try_list) local record = cachelookup(qname, qtype) if record then - try_status(try_list, "cached") + add_status_to_try_list(try_list, "cached") return record, nil, try_list end if qtype == _M.TYPE_A then - try_status(try_list, "validated") + add_status_to_try_list(try_list, "validated") record = {{ address = qname, type = _M.TYPE_A, @@ -1032,7 +1014,7 @@ local function check_ipv4(qname, qtype, try_list) else -- bad query type for this ipv4 address -- return a "server error" - try_status(try_list, "bad IPv4") + add_status_to_try_list(try_list, "bad IPv4") record = { errcode = 102, errstr = clientErrors[102], @@ -1178,7 +1160,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) records = nil -- luacheck: pop err = "recursion detected" - try_status(try_list, err) + add_status_to_try_list(try_list, err) return nil, err, try_list end end @@ -1190,14 +1172,14 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) -- luacheck: push no unused records = nil -- luacheck: pop - try_list = try_status(try_list, "stale") + try_list = add_status_to_try_list(try_list, "stale") else -- a valid non-stale record -- check for CNAME records, and dereferencing the CNAME if (records[1] or EMPTY).type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then opts.qtype = nil - try_status(try_list, "dereferencing CNAME") + add_status_to_try_list(try_list, "dereferencing CNAME") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1245,8 +1227,10 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end if not records then -- luacheck: ignore - -- nothing to do, an error - -- fall through to the next entry in our search sequence + -- An error has occurred, terminate the lookup process. We don't want to try other record types because + -- that would potentially cause us to respond with wrong answers (i.e. the contents of an A record if the + -- query for the SRV record failed due to a network error). + goto failed elseif records.errcode then -- dns error: fall through to the next entry in our search sequence @@ -1305,7 +1289,7 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) if records[1].type == _M.TYPE_CNAME and qtype ~= _M.TYPE_CNAME then -- dereference CNAME opts.qtype = nil - try_status(try_list, "dereferencing CNAME") + add_status_to_try_list(try_list, "dereferencing CNAME") return resolve(records[1].cname, opts, dnsCacheOnly, try_list) end @@ -1314,8 +1298,9 @@ local function resolve(qname, r_opts, dnsCacheOnly, try_list) end -- we had some error, record it in the status list - try_status(try_list, err) + add_status_to_try_list(try_list, err) end + ::failed:: -- we failed, clear cache and return last error if not dnsCacheOnly then @@ -1525,7 +1510,7 @@ local function toip(qname, port, dnsCacheOnly, try_list) local entry = rec[roundRobinW(rec)] -- our SRV entry might still contain a hostname, so recurse, with found port number local srvport = (entry.port ~= 0 and entry.port) or port -- discard port if it is 0 - try_status(try_list, "dereferencing SRV") + add_status_to_try_list(try_list, "dereferencing SRV") return toip(entry.target, srvport, dnsCacheOnly, try_list) else -- must be A or AAAA diff --git a/spec/01-unit/21-dns-client/02-client_spec.lua b/spec/01-unit/21-dns-client/02-client_spec.lua index 37256b6402b..42e20a716bc 100644 --- a/spec/01-unit/21-dns-client/02-client_spec.lua +++ b/spec/01-unit/21-dns-client/02-client_spec.lua @@ -3,6 +3,13 @@ local tempfilename = require("pl.path").tmpname local pretty = require("pl.pretty").write +-- Several DNS tests use the actual DNS to verify the client behavior against real name servers. It seems that +-- even though we have a DNS mocking system, it is good to have some coverage against actual servers to ensure that +-- we're not relying on mocked behavior. We use the domain name kong-gateway-testing.link, which is hosted in Route53 +-- in the AWS sandbox, allowing Gateway developers to make additions if required. +local TEST_DOMAIN="kong-gateway-testing.link" + + -- empty records and not found errors should be identical, hence we -- define a constant for that error message local NOT_FOUND_ERROR = "dns server error: 3 name error" @@ -81,7 +88,7 @@ describe("[DNS client]", function() resolvConf = {"nameserver [fe80::1%enp0s20f0u1u1]"}, }) end) - local ip, port = client.toip("thijsschreijer.nl") + local ip, port = client.toip(TEST_DOMAIN) assert.is_nil(ip) assert.not_matches([[failed to parse host name "[fe80::1%enp0s20f0u1u1]": invalid IPv6 address]], port, nil, true) assert.matches([[failed to create a resolver: no nameservers specified]], port, nil, true) @@ -559,13 +566,68 @@ describe("[DNS client]", function() }, list) end) + for retrans in ipairs({1, 2}) do + for _, timeout in ipairs({1, 2}) do + it("correctly observes #timeout of " .. tostring(timeout) .. " seconds with " .. tostring(retrans) .. " retries", function() + -- KAG-2300 - https://github.com/Kong/kong/issues/10182 + -- If we encounter a timeout while talking to the DNS server, expect the total timeout to be close to the + -- configured timeout * retrans parameters + assert(client.init({ + resolvConf = { + "nameserver 198.51.100.0", + "domain one.com", + }, + timeout = timeout * 1000, + retrans = retrans, + hosts = { + "127.0.0.1 host" + } + })) + query_func = function(self, original_query_func, name, options) + ngx.sleep(5) + return nil + end + local start_time = ngx.now() + client.resolve("host1.one.com.") + local duration = ngx.now() - start_time + assert.truthy(duration < (timeout * retrans + 1)) + end) + end + end + + -- The domain name below needs to have both a SRV and an A record + local SRV_A_TEST_NAME = "timeouttest."..TEST_DOMAIN + + it("verify correctly set up test DNS entry", function() + assert(client.init({ timeout = 1000, retrans = 2 })) + local answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_SRV}) + assert.same(client.TYPE_SRV, answers[1].type) + answers = client.resolve(SRV_A_TEST_NAME, { qtype = client.TYPE_A}) + assert.same(client.TYPE_A, answers[1].type) + end) + + it("does not respond with incorrect answer on transient failure", function() + -- KAG-2300 - https://github.com/Kong/kong/issues/10182 + -- If we encounter a timeout while talking to the DNS server, don't keep trying with other record types + assert(client.init({ timeout = 1000, retrans = 2 })) + query_func = function(self, original_query_func, name, options) + if options.qtype == client.TYPE_SRV then + ngx.sleep(10) + else + return original_query_func(self, name, options) + end + end + local answers = client.resolve(SRV_A_TEST_NAME) + assert.is_nil(answers) + end) + end) it("fetching a record without nameservers errors", function() assert(client.init({ resolvConf = {} })) - local host = "thijsschreijer.nl" + local host = TEST_DOMAIN local typ = client.TYPE_A local answers, err, _ = client.resolve(host, { qtype = typ }) @@ -576,7 +638,7 @@ describe("[DNS client]", function() it("fetching a TXT record", function() assert(client.init()) - local host = "txttest.thijsschreijer.nl" + local host = "txttest."..TEST_DOMAIN local typ = client.TYPE_TXT local answers, err, try_list = client.resolve(host, { qtype = typ }) @@ -589,7 +651,7 @@ describe("[DNS client]", function() it("fetching a CNAME record", function() assert(client.init()) - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_CNAME local answers = assert(client.resolve(host, { qtype = typ })) @@ -601,7 +663,7 @@ describe("[DNS client]", function() it("fetching a CNAME record FQDN", function() assert(client.init()) - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_CNAME local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -613,7 +675,7 @@ describe("[DNS client]", function() it("expire and touch times", function() assert(client.init()) - local host = "txttest.thijsschreijer.nl" + local host = "txttest."..TEST_DOMAIN local typ = client.TYPE_TXT local answers, _, _ = assert(client.resolve(host, { qtype = typ })) @@ -669,7 +731,7 @@ describe("[DNS client]", function() it("fetching multiple A records", function() assert(client.init()) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local typ = client.TYPE_A local answers = assert(client.resolve(host, { qtype = typ })) @@ -683,7 +745,7 @@ describe("[DNS client]", function() it("fetching multiple A records FQDN", function() assert(client.init()) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local typ = client.TYPE_A local answers = assert(client.resolve(host .. ".", { qtype = typ })) @@ -712,20 +774,20 @@ describe("[DNS client]", function() This does not affect client side code, as the result is always the final A record. --]] - local host = "smtp.thijsschreijer.nl" + local host = "smtp."..TEST_DOMAIN local typ = client.TYPE_A local answers, _, _ = assert(client.resolve(host)) -- check first CNAME local key1 = client.TYPE_CNAME..":"..host local entry1 = lrucache:get(key1) - assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.thijsschreijer.nl' + assert.are.equal(host, entry1[1].name) -- the 1st record is the original 'smtp.'..TEST_DOMAIN assert.are.equal(client.TYPE_CNAME, entry1[1].type) -- and that is a CNAME -- check second CNAME local key2 = client.TYPE_CNAME..":"..entry1[1].cname local entry2 = lrucache:get(key2) - assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.thijsschreijer.nl' + assert.are.equal(entry1[1].cname, entry2[1].name) -- the 2nd is the middle 'thuis.'..TEST_DOMAIN assert.are.equal(client.TYPE_CNAME, entry2[1].type) -- and that is also a CNAME -- check second target to match final record @@ -747,7 +809,7 @@ describe("[DNS client]", function() it("fetching multiple SRV records (un-typed)", function() assert(client.init()) - local host = "srvtest.thijsschreijer.nl" + local host = "srvtest."..TEST_DOMAIN local typ = client.TYPE_SRV -- un-typed lookup @@ -765,7 +827,7 @@ describe("[DNS client]", function() assert(client.init({ search = {}, })) local lrucache = client.getcache() - local host = "cname2srv.thijsschreijer.nl" + local host = "cname2srv."..TEST_DOMAIN local typ = client.TYPE_SRV -- un-typed lookup @@ -795,7 +857,7 @@ describe("[DNS client]", function() search = {}, })) - local host = "srvtest.thijsschreijer.nl" + local host = "srvtest."..TEST_DOMAIN local typ = client.TYPE_A --> the entry is SRV not A local answers, err, _ = client.resolve(host, {qtype = typ}) @@ -811,7 +873,7 @@ describe("[DNS client]", function() search = {}, })) - local host = "IsNotHere.thijsschreijer.nl" + local host = "IsNotHere."..TEST_DOMAIN local answers, err, _ = client.resolve(host) assert.is_nil(answers) @@ -1101,7 +1163,7 @@ describe("[DNS client]", function() describe("toip() function", function() it("A/AAAA-record, round-robin",function() assert(client.init({ search = {}, })) - local host = "atest.thijsschreijer.nl" + local host = "atest."..TEST_DOMAIN local answers = assert(client.resolve(host)) answers.last_index = nil -- make sure to clean local ips = {} @@ -1305,11 +1367,12 @@ describe("[DNS client]", function() assert.is_number(port) assert.is_not.equal(0, port) end) + it("port passing if SRV port=0",function() assert(client.init({ search = {}, })) local ip, port, host - host = "srvport0.thijsschreijer.nl" + host = "srvport0."..TEST_DOMAIN ip, port = client.toip(host, 10) assert.is_string(ip) assert.is_number(port) @@ -1319,10 +1382,11 @@ describe("[DNS client]", function() assert.is_string(ip) assert.is_nil(port) end) + it("recursive SRV pointing to itself",function() assert(client.init({ search = {}, })) local ip, record, port, host, err, _ - host = "srvrecurse.thijsschreijer.nl" + host = "srvrecurse."..TEST_DOMAIN -- resolve SRV specific should return the record including its -- recursive entry @@ -1477,7 +1541,7 @@ describe("[DNS client]", function() --empty/error responses should be cached for a configurable time local emptyTtl = 0.1 local staleTtl = 0.1 - local qname = "really.really.really.does.not.exist.thijsschreijer.nl" + local qname = "really.really.really.does.not.exist."..TEST_DOMAIN assert(client.init({ emptyTtl = emptyTtl, staleTtl = staleTtl, @@ -1645,7 +1709,7 @@ describe("[DNS client]", function() -- starting resolving coroutine.yield(coroutine.running()) local result, _, _ = client.resolve( - "thijsschreijer.nl", + TEST_DOMAIN, { qtype = client.TYPE_A } ) table.insert(results, result) @@ -1689,7 +1753,7 @@ describe("[DNS client]", function() })) -- insert a stub thats waits and returns a fixed record - local name = "thijsschreijer.nl" + local name = TEST_DOMAIN query_func = function() local ip = "1.4.2.3" local entry = { @@ -1735,7 +1799,7 @@ describe("[DNS client]", function() -- all results are equal, as they all will wait for the first response for i = 1, 10 do - assert.equal("dns lookup pool exceeded retries (1): timeout", results[i]) + assert.equal("timeout", results[i]) end end) end) @@ -1752,7 +1816,7 @@ describe("[DNS client]", function() -- insert a stub thats waits and returns a fixed record local call_count = 0 - local name = "thijsschreijer.nl" + local name = TEST_DOMAIN query_func = function() local ip = "1.4.2.3" local entry = { diff --git a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua index f6d56e71b86..cc59178fe09 100644 --- a/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua +++ b/spec/fixtures/mocks/lua-resty-dns/resty/dns/resolver.lua @@ -95,10 +95,18 @@ resolver.query = function(self, name, options, tries) end if not mocks_only then - -- no mock, so invoke original resolver - local a, b, c = old_query(self, name, options, tries) - return a, b, c + -- No mock, so invoke original resolver. Note that if the original resolver fails (i.e. because an + -- invalid domain name like example.com was used), we return an empty result set instead of passing + -- the error up to the caller. This is done so that if the mock contains "A" records (which would + -- be the most common case), the initial query for a SRV record does not fail, but appear not to have + -- yielded any results. This will make dns/client.lua try finding an A record next. + local records, err, tries = old_query(self, name, options, tries) + if records then + return records, err, tries + end end + + return {}, nil, tries end -- do