From dbe8d947e7d6a6c12f8464fc2f08276be1cd392f Mon Sep 17 00:00:00 2001
From: Jun Ouyang
Date: Wed, 1 Mar 2023 17:07:55 +0800
Subject: [PATCH] fix(runloop): use upstream status code in passive health check (#10325)

The passive unhealthy health check on HTTP status codes should only mark
a target as "down" (not reachable or proxiable) based on the HTTP status
code returned by the target itself; it should not be affected by the
final HTTP status code set by a Kong plugin, which is what is returned
to the client.

So we change the passive health check implementation to use the Nginx
variable `$upstream_status`, which cannot be changed by any plugin.

FTI-4841
Fix #10281
---
 CHANGELOG.md                              |  4 +-
 kong/runloop/handler.lua                  |  7 +-
 .../10-balancer/01-healthchecks_spec.lua  | 66 +++++++++++++++++++
 3 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 12dd9d42c2ea..b82cec089747 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,13 +86,15 @@
   [#10352](https://github.com/Kong/kong/pull/10352)
 - Fix an issue where validation to regex routes may be skipped when the old-fashioned config is used for DB-less Kong.
   [#10348](https://github.com/Kong/kong/pull/10348)
+- Fix an issue where the balancer passive healthcheck would use the wrong status code when a Kong plugin
+  changed the status code from the upstream in the `header_filter` phase.
+  [#10325](https://github.com/Kong/kong/pull/10325)
 
 ### Dependencies
 
 - Bumped lua-resty-session from 4.0.2 to 4.0.3
   [#10338](https://github.com/Kong/kong/pull/10338)
 
-
 ## 3.2.0
 
 ### Breaking Changes

diff --git a/kong/runloop/handler.lua b/kong/runloop/handler.lua
index e5ce9271741f..3a7a94345091 100644
--- a/kong/runloop/handler.lua
+++ b/kong/runloop/handler.lua
@@ -11,7 +11,7 @@ local certificate = require "kong.runloop.certificate"
 local concurrency = require "kong.concurrency"
 local lrucache = require "resty.lrucache"
 local marshall = require "kong.cache.marshall"
-local ktls = require("resty.kong.tls")
+local ktls = require "resty.kong.tls"
 
 local PluginsIterator = require "kong.runloop.plugins_iterator"
 local instrumentation = require "kong.tracing.instrumentation"
@@ -1511,7 +1511,10 @@ return {
       -- Report HTTP status for health checks
       local balancer_data = ctx.balancer_data
       if balancer_data and balancer_data.balancer_handle then
-        local status = ngx.status
+        -- https://nginx.org/en/docs/http/ngx_http_upstream_module.html#variables
+        -- $upstream_status may hold several comma/colon-separated codes, one per
+        -- upstream attempt, so take the last three characters as the final status
+        local status = tonumber(sub(var.upstream_status or "", -3)) or ngx.status
         if status == 504 then
           balancer_data.balancer.report_timeout(balancer_data.balancer_handle)
         else
diff --git a/spec/02-integration/05-proxy/10-balancer/01-healthchecks_spec.lua b/spec/02-integration/05-proxy/10-balancer/01-healthchecks_spec.lua
index 1759dcec081f..ad3d6f79b34d 100644
--- a/spec/02-integration/05-proxy/10-balancer/01-healthchecks_spec.lua
+++ b/spec/02-integration/05-proxy/10-balancer/01-healthchecks_spec.lua
@@ -1183,6 +1183,72 @@ for _, strategy in helpers.each_strategy() do
           end
         end)
 
+        it("perform passive health checks when the downstream status code is changed", function()
+
+          for nfails = 1, 3 do
+
+            bu.begin_testcase_setup(strategy, bp)
+            -- configure healthchecks
+            local upstream_name, upstream_id = bu.add_upstream(bp, {
+              healthchecks = bu.healthchecks_config {
+                passive = {
+                  unhealthy = {
+                    http_failures = nfails,
+                  }
+                }
+              }
+            })
+            local port1 = bu.add_target(bp, upstream_id, localhost)
+            local port2 = bu.add_target(bp, upstream_id, localhost)
+            local api_host, service_id = bu.add_api(bp, upstream_name)
+            bp.plugins:insert({
+              name = "pre-function",
+              service = { id = service_id },
+              config = {
+                header_filter = {
+                  [[
+                    ngx.exit(200)
+                  ]],
+                },
+              }
+            })
+
+            bu.end_testcase_setup(strategy, bp)
+
+            local requests = bu.SLOTS * 2 -- go round the balancer twice
+
+            -- setup target servers:
+            -- server2 will only respond for part of the test,
+            -- then server1 will take over.
+            local server2_oks = math.floor(requests / 4)
+            local server1 = https_server.new(port1, localhost)
+            local server2 = https_server.new(port2, localhost)
+            server1:start()
+            server2:start()
+
+            -- Go hit them with our test requests
+            local client_oks1, client_fails1 = bu.client_requests(bu.SLOTS, api_host)
+            assert(bu.direct_request(localhost, port2, "/unhealthy"))
+            local client_oks2, client_fails2 = bu.client_requests(bu.SLOTS, api_host)
+
+            local client_oks = client_oks1 + client_oks2
+            local client_fails = client_fails1 + client_fails2
+
+            -- collect server results (hit counts)
+            local count1 = server1:shutdown()
+            local count2 = server2:shutdown()
+
+            -- verify
+            assert.are.equal(requests - server2_oks - nfails, count1.ok)
+            assert.are.equal(server2_oks, count2.ok)
+            assert.are.equal(0, count1.fail)
+            assert.are.equal(nfails, count2.fail)
+
+            assert.are.equal(requests, client_oks)
+            assert.are.equal(0, client_fails)
+          end
+        end)
+
         it("threshold for health checks", function()
           local fixtures = {
             dns_mock = helpers.dns_mock.new()