From 9c58541258e8def4cf0b8f457dadf7176d49efc3 Mon Sep 17 00:00:00 2001 From: spacewander Date: Wed, 24 Feb 2021 09:33:28 +0800 Subject: [PATCH] fix(chash): ensure retry can try every node Previously the default number of retry is equal to the number of node, but the same node will be tried again according to its weight. Also ensure the same picker will be used in the whole request, especially during the retry. Signed-off-by: spacewander --- apisix/balancer.lua | 8 ++++-- apisix/balancer/chash.lua | 39 +++++++++++++++++++++++-- t/node/chash-balance.t | 60 +++++++++++++++++++++++++++++++++++++++ t/node/healthcheck.t | 2 +- 4 files changed, 104 insertions(+), 5 deletions(-) diff --git a/apisix/balancer.lua b/apisix/balancer.lua index 2540f2ff2d1c6..4be56fcc28aa2 100644 --- a/apisix/balancer.lua +++ b/apisix/balancer.lua @@ -161,8 +161,12 @@ local function pick_server(route, ctx) version = version .. "#" .. checker.status_ver end - local server_picker = lrucache_server_picker(key, version, - create_server_picker, up_conf, checker) + -- the same picker will be used in the whole request, especially during the retry + local server_picker = ctx.server_picker + if not server_picker then + server_picker = lrucache_server_picker(key, version, + create_server_picker, up_conf, checker) + end if not server_picker then return nil, "failed to fetch server picker" end diff --git a/apisix/balancer/chash.lua b/apisix/balancer/chash.lua index df1568a5dc5f4..e36bc48f9f832 100644 --- a/apisix/balancer/chash.lua +++ b/apisix/balancer/chash.lua @@ -22,6 +22,9 @@ local str_gsub = string.gsub local pairs = pairs +local CONSISTENT_POINTS = 160 -- points per server, taken from `resty.chash` + + local _M = {} @@ -62,27 +65,59 @@ end function _M.new(up_nodes, upstream) local str_null = str_char(0) + local nodes_count = 0 + local safe_limit = 0 local servers, nodes = {}, {} for serv, weight in pairs(up_nodes) do local id = str_gsub(serv, ":", str_null) + nodes_count = nodes_count + 1 + safe_limit = safe_limit + weight servers[id] = serv nodes[id] = weight end + safe_limit = safe_limit * CONSISTENT_POINTS local picker = resty_chash:new(nodes) return { upstream = upstream, get = function (ctx) local id - if ctx.balancer_try_count > 1 and ctx.chash_last_server_index then - id, ctx.chash_last_server_index = picker:next(ctx.chash_last_server_index) + if ctx.chash_last_server_index then + if ctx.balancer_tried_servers_count == nodes_count then + return nil, "all upstream servers tried" + end + + -- the 'safe_limit' is a best effort limit to prevent infinite loop caused by bug + for i = 1, safe_limit do + id, ctx.chash_last_server_index = picker:next(ctx.chash_last_server_index) + if not ctx.balancer_tried_servers[servers[id]] then + break + end + end else local chash_key = fetch_chash_hash_key(ctx, upstream) id, ctx.chash_last_server_index = picker:find(chash_key) end -- core.log.warn("chash id: ", id, " val: ", servers[id]) return servers[id] + end, + after_balance = function (ctx, before_retry) + if not before_retry then + if ctx.balancer_tried_servers then + core.tablepool.release("balancer_tried_servers", ctx.balancer_tried_servers) + ctx.balancer_tried_servers = nil + end + + return nil + end + + if not ctx.balancer_tried_servers then + ctx.balancer_tried_servers = core.tablepool.fetch("balancer_tried_servers", 0, 2) + end + + ctx.balancer_tried_servers[ctx.balancer_server] = true + ctx.balancer_tried_servers_count = (ctx.balancer_tried_servers_count or 0) + 1 end } end diff --git a/t/node/chash-balance.t b/t/node/chash-balance.t index 02eee5f5adc0b..dfde2aaffee51 100644 --- a/t/node/chash-balance.t +++ b/t/node/chash-balance.t @@ -496,3 +496,63 @@ GET /t --- error_code_like: ^(?:50\d)$ --- error_log failed to find valid upstream server, no valid upstream node + + + +=== TEST 13: set route(ensure retry can try every node) +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/server_port", + "upstream": { + "key": "arg_device_id", + "type": "chash", + "nodes": { + "127.0.0.1:1979": 1000, + "127.0.0.1:1980": 1 + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /t +--- response_body +passed +--- no_error_log +[error] + + + +=== TEST 14: hit routes +--- config + location /t { + content_by_lua_block { + local http = require "resty.http" + local uri = "http://127.0.0.1:" .. ngx.var.server_port + .. "/server_port?device_id=1" + + local httpc = http.new() + local res, err = httpc:request_uri(uri, {method = "GET"}) + if not res then + ngx.say(err) + return + end + + ngx.say(res.status) + } + } +--- request +GET /t +--- response_body +200 diff --git a/t/node/healthcheck.t b/t/node/healthcheck.t index d9bacb86fb9c5..9f77a4311090a 100644 --- a/t/node/healthcheck.t +++ b/t/node/healthcheck.t @@ -488,7 +488,7 @@ qr{.*http://127.0.0.1:1960/server_port.* .*http://127.0.0.1:1961/server_port.* .*http://127.0.0.1:1961/server_port.* .*http://127.0.0.1:1961/server_port.* -.*http://127.0.0.1:1960/server_port.*} +.*http://127.0.0.1:1961/server_port.*} --- timeout: 10