From d809dfc4d4d9c3e9c66ae8432d3a791dd5e751bf Mon Sep 17 00:00:00 2001
From: "zhuo.chen" <zhuo.chen@upai.com>
Date: Fri, 9 Jul 2021 14:37:08 +0800
Subject: [PATCH 1/3] add option to Limits the time during which a request can
 be passed to the next upstream

---
 apisix/balancer.lua                     |  8 ++++
 apisix/schema_def.lua                   |  4 ++
 docs/en/latest/admin-api.md             |  1 +
 docs/en/latest/plugins/traffic-split.md |  2 +-
 docs/zh/latest/admin-api.md             |  1 +
 docs/zh/latest/plugins/traffic-split.md |  2 +-
 t/admin/upstream.t                      | 32 ++++++++++++++++
 t/lib/server.lua                        |  8 ++++
 t/node/upstream-retries.t               | 49 +++++++++++++++++++++++++
 9 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/apisix/balancer.lua b/apisix/balancer.lua
index 87042844b5f2..d5336b993363 100644
--- a/apisix/balancer.lua
+++ b/apisix/balancer.lua
@@ -23,6 +23,7 @@ local enable_keepalive = balancer.enable_keepalive
 local set_more_tries   = balancer.set_more_tries
 local get_last_failure = balancer.get_last_failure
 local set_timeouts     = balancer.set_timeouts
+local ngx_now          = ngx.now
 
 
 local module_name = "balancer"
@@ -161,6 +162,9 @@ local function set_balancer_opts(route, ctx)
     end
 
     if retries > 0 then
+        if up_conf.retry_timeout and up_conf.retry_timeout > 0 then
+            ctx.proxy_retry_deadline = ngx_now() + up_conf.retry_timeout
+        end
         local ok, err = set_more_tries(retries)
         if not ok then
             core.log.error("could not set upstream retries: ", err)
@@ -293,6 +297,10 @@ function _M.run(route, ctx)
         set_balancer_opts(route, ctx)
 
     else
+        if ctx.proxy_retry_deadline and ctx.proxy_retry_deadline < ngx_now() then
+            core.log.error("proxy retry timeout, retry count: ", ctx.balancer_try_count or 0)
+            return core.response.exit(502)
+        end
         -- retry
         server, err = pick_server(route, ctx)
         if not server then
diff --git a/apisix/schema_def.lua b/apisix/schema_def.lua
index b196f7117e17..05501021b908 100644
--- a/apisix/schema_def.lua
+++ b/apisix/schema_def.lua
@@ -353,6 +353,10 @@ local upstream_schema = {
             type = "integer",
             minimum = 0,
         },
+        retry_timeout = {
+            type = "integer",
+            minimum = 0,
+        },
         timeout = timeout_def,
         tls = {
             type = "object",
diff --git a/docs/en/latest/admin-api.md b/docs/en/latest/admin-api.md
index 4ba3cdf407ba..6ff9ede2e15e 100644
--- a/docs/en/latest/admin-api.md
+++ b/docs/en/latest/admin-api.md
@@ -546,6 +546,7 @@ In addition to the basic complex equalization algorithm selection, APISIX's Upst
 |key             |optional|This option is only valid if the `type` is `chash`. Find the corresponding node `id` according to `hash_on` and `key`. When `hash_on` is set as `vars`, `key` is the required parameter, for now, it support nginx built-in variables like `uri, server_name, server_addr, request_uri, remote_port, remote_addr, query_string, host, hostname, arg_***`, `arg_***` is arguments in the request line, [Nginx variables list](http://nginx.org/en/docs/varindex.html). When `hash_on` is set as `header`, `key` is the required parameter, and `header name` is customized. When `hash_on` is set to `cookie`, `key` is the required parameter, and `cookie name` is customized. When `hash_on` is set to `consumer`, `key` does not need to be set. In this case, the `key` adopted by the hash algorithm is the `consumer_name` authenticated. If the specified `hash_on` and `key` can not fetch values, it will be fetch `remote_addr` by default.|
 |checks          |optional|Configure the parameters of the health check. For details, refer to [health-check](health-check.md).|
 |retries         |optional|Pass the request to the next upstream using the underlying Nginx retry mechanism, the retry mechanism is enabled by default and set the number of retries according to the number of available backend nodes. If `retries` option is explicitly set, it will override the default value. `0` means disable retry mechanism.|
+|retry_timeout   |optional|Limits the time during which a request can be passed to the next upstream. `0` means disable retry timeout mechanism.|
 |timeout         |optional| Set the timeout for connecting, sending and receiving messages. |
 |name            |optional|Identifies upstream names|
 |desc            |optional|upstream usage scenarios, and more.|
diff --git a/docs/en/latest/plugins/traffic-split.md b/docs/en/latest/plugins/traffic-split.md
index 4882da1e123f..1d5456bf582b 100644
--- a/docs/en/latest/plugins/traffic-split.md
+++ b/docs/en/latest/plugins/traffic-split.md
@@ -59,7 +59,7 @@ Note: The ratio between each upstream may not so accurate since the drawback of
 | weighted_upstreams.weight      | integer       | optional    | weight = 1   |  | The traffic is divided according to the `weight` value, and the roundrobin algorithm is used to divide multiple `weight`. |
 
 Currently, in the configuration of `weighted_upstreams.upstream`, the unsupported fields are:
-service_name, discovery_type, checks, retries, desc, scheme, labels, create_time and update_time. But you can use `weighted_upstreams.upstream_id` to bind the `upstream` object to achieve their functions.
+service_name, discovery_type, checks, retries, retry_timeout, desc, scheme, labels, create_time and update_time. But you can use `weighted_upstreams.upstream_id` to bind the `upstream` object to achieve their functions.
 
 The traffic-split plugin is mainly composed of two parts: `match` and `weighted_upstreams`. `match` is a custom conditional rule, and `weighted_upstreams` is upstream configuration information. If you configure `match` and `weighted_upstreams` information, then after the `match` rule is verified, it will be based on the `weight` value in `weighted_upstreams`; the ratio of traffic between each upstream in the plugin will be guided, otherwise, all traffic will be directly Reach the `upstream` configured on `route` or `service`. Of course, you can also configure only the `weighted_upstreams` part, which will directly guide the traffic ratio between each upstream in the plugin based on the `weight` value in `weighted_upstreams`.
 
diff --git a/docs/zh/latest/admin-api.md b/docs/zh/latest/admin-api.md
index b8f2bfe8e361..c3d1fd6a714c 100644
--- a/docs/zh/latest/admin-api.md
+++ b/docs/zh/latest/admin-api.md
@@ -551,6 +551,7 @@ APISIX 的 Upstream 除了基本的负载均衡算法选择外，还支持对上
 | key            | 条件必需                           | 匹配类型       | 该选项只有类型是 `chash` 才有效。根据 `key` 来查找对应的 node `id`，相同的 `key` 在同一个对象中，永远返回相同 id，目前支持的 Nginx 内置变量有 `uri, server_name, server_addr, request_uri, remote_port, remote_addr, query_string, host, hostname, arg_***`，其中 `arg_***` 是来自 URL 的请求参数，[Nginx 变量列表](http://nginx.org/en/docs/varindex.html) |                                                  |
 | checks         | 可选                               | health_checker | 配置健康检查的参数，详细可参考[health-check](health-check.md)                                                                                                                                                                                                                                                                                               |                                                  |
 | retries        | 可选                               | 整型           | 使用底层的 Nginx 重试机制将请求传递给下一个上游，默认启用重试且次数为后端可用的 node 数量。如果指定了具体重试次数，它将覆盖默认值。`0` 代表不启用重试机制。                                                                                                                                                                                                 |                                                  |
+| retry_timeout  | 可选                               | 整型           | 限制请求传递给下一个上游的时间。`0` 代表不启用重试机制。                                                                                                                                                                                                 |                                                  |
 | timeout        | 可选                               | 超时时间对象   | 设置连接、发送消息、接收消息的超时时间                                                                                                                                                                                                                                                                                                                      |                                                  |
 | hash_on        | 可选                               | 辅助           | `hash_on` 支持的类型有 `vars`（Nginx 内置变量），`header`（自定义 header），`cookie`，`consumer`，默认值为 `vars`                                                                                                                                                                                                                                           |
 | name           | 可选                               | 辅助           | 标识上游服务名称、使用场景等。                                                                                                                                                                                                                                                                                                                              |                                                  |
diff --git a/docs/zh/latest/plugins/traffic-split.md b/docs/zh/latest/plugins/traffic-split.md
index 86e576d06f84..c7172e65d240 100644
--- a/docs/zh/latest/plugins/traffic-split.md
+++ b/docs/zh/latest/plugins/traffic-split.md
@@ -59,7 +59,7 @@ traffic-split 插件使用户可以逐步引导各个上游之间的流量百分
 | weighted_upstreams.weight      | integer | 可选   |   weight = 1     |        | 根据 `weight` 值做流量划分，多个 weight 之间使用 roundrobin 算法划分。|
 
 目前在 `weighted_upstreams.upstream` 的配置中，不支持的字段有：
-service_name、discovery_type、checks、retries、desc、scheme、labels、create_time 和 update_time。但是你可以通过 `weighted_upstreams.upstream_id` 绑定 `upstream` 对象来实现他们。
+service_name、discovery_type、checks、retries、retry_timeout、desc、scheme、labels、create_time 和 update_time。但是你可以通过 `weighted_upstreams.upstream_id` 绑定 `upstream` 对象来实现他们。
 
 traffic-split 插件主要由 `match` 和 `weighted_upstreams` 两部分组成，`match` 是自定义的条件规则，`weighted_upstreams` 是 upstream 的配置信息。如果配置 `match` 和 `weighted_upstreams` 信息，那么在 `match` 规则校验通过后，会根据 `weighted_upstreams` 中的 `weight` 值；引导插件中各个 upstream 之间的流量比例，否则，所有流量直接到达 `route` 或 `service` 上配置的 `upstream`。当然你也可以只配置 `weighted_upstreams` 部分，这样会直接根据 `weighted_upstreams` 中的 `weight` 值，引导插件中各个 upstream 之间的流量比例。
 
diff --git a/t/admin/upstream.t b/t/admin/upstream.t
index cebbb165de61..88d80bd66cdf 100644
--- a/t/admin/upstream.t
+++ b/t/admin/upstream.t
@@ -1978,3 +1978,35 @@ GET /t
 [delete] code: 200 message: passed
 --- no_error_log
 [error]
+
+
+
+=== TEST 57: retry_timeout is -1 (INVALID)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/upstreams/a-b-c-ABC_0123',
+                ngx.HTTP_PUT,
+                [[{
+                    "nodes": {
+                        "127.0.0.1:8080": 1,
+                        "127.0.0.1:8090": 1
+                    },
+                    "retry_timeout": -1,
+                    "type": "roundrobin"
+                }]]
+            )
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.print(body)
+        }
+    }
+--- request
+GET /t
+--- error_code: 400
+--- response_body
+{"error_msg":"invalid configuration: property \"retry_timeout\" validation failed: expected -1 to be greater than 0"}
+--- no_error_log
+[error]
diff --git a/t/lib/server.lua b/t/lib/server.lua
index e7621044489d..72f6123f496a 100644
--- a/t/lib/server.lua
+++ b/t/lib/server.lua
@@ -417,5 +417,13 @@ function _M.server_error()
     error("500 Internal Server Error")
 end
 
+function _M.retry_error()
+    local sleep = ngx.var.http_x_test_sleep
+    if sleep ~= nil then
+        ngx.sleep(tonumber(sleep))
+    end
+
+    ngx.exit(ngx.ERROR)
+end
 
 return _M
diff --git a/t/node/upstream-retries.t b/t/node/upstream-retries.t
index 7b3f79b96436..6ced7bb41f39 100644
--- a/t/node/upstream-retries.t
+++ b/t/node/upstream-retries.t
@@ -267,3 +267,52 @@ qr/proxy request to 127.0.0.1:1
 proxy request to 127.0.0.2:1
 |proxy request to 127.0.0.2:1
 proxy request to 127.0.0.1:1/
+
+
+
+=== TEST 13: stop proxy to next upstream by retry_timeout
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                        "upstream": {
+                            "nodes": {
+                                "127.0.0.1:1980": 100,
+                                "127.0.0.1:1981": 100,
+                                "127.0.0.1:1982": 100
+                            },
+                            "retries": 10,
+                            "retry_timeout": 2,
+                            "type": "roundrobin"
+                        },
+                        "uri": "/retry_error"
+                }]]
+                )
+
+            if code ~= 200 then
+                ngx.say(body)
+                return
+            end
+            local http = require "resty.http"
+            local httpc = http.new()
+            local uri = "http://127.0.0.1:" .. ngx.var.server_port
+                        .. "/retry_error"
+            local res, err = httpc:request_uri(uri, {headers={
+                ["X-Test-Sleep"] = "1"
+            }})
+            if not res then
+                ngx.say(err)
+                return
+            end
+            ngx.status = res.status
+            ngx.say(res.status)
+        }
+    }
+--- request
+GET /t
+--- error_code: 502
+--- error_log eval
+qr/proxy retry timeout, retry count: 2/

From 079bd50d5f3ea725ba80f0abe555c274d32bb97d Mon Sep 17 00:00:00 2001
From: "zhuo.chen" <zhuo.chen@upai.com>
Date: Fri, 9 Jul 2021 16:49:08 +0800
Subject: [PATCH 2/3] change test case position

---
 t/admin/upstream.t        | 32 --------------------------------
 t/admin/upstream3.t       | 32 ++++++++++++++++++++++++++++++++
 t/lib/server.lua          | 15 +++++----------
 t/node/upstream-retries.t |  8 +++-----
 4 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/t/admin/upstream.t b/t/admin/upstream.t
index 88d80bd66cdf..cebbb165de61 100644
--- a/t/admin/upstream.t
+++ b/t/admin/upstream.t
@@ -1978,35 +1978,3 @@ GET /t
 [delete] code: 200 message: passed
 --- no_error_log
 [error]
-
-
-
-=== TEST 57: retry_timeout is -1 (INVALID)
---- config
-    location /t {
-        content_by_lua_block {
-            local t = require("lib.test_admin").test
-            local code, body = t('/apisix/admin/upstreams/a-b-c-ABC_0123',
-                ngx.HTTP_PUT,
-                [[{
-                    "nodes": {
-                        "127.0.0.1:8080": 1,
-                        "127.0.0.1:8090": 1
-                    },
-                    "retry_timeout": -1,
-                    "type": "roundrobin"
-                }]]
-            )
-            if code >= 300 then
-                ngx.status = code
-            end
-            ngx.print(body)
-        }
-    }
---- request
-GET /t
---- error_code: 400
---- response_body
-{"error_msg":"invalid configuration: property \"retry_timeout\" validation failed: expected -1 to be greater than 0"}
---- no_error_log
-[error]
diff --git a/t/admin/upstream3.t b/t/admin/upstream3.t
index 6dda9ad93673..2b542882938a 100644
--- a/t/admin/upstream3.t
+++ b/t/admin/upstream3.t
@@ -61,3 +61,35 @@ __DATA__
     }
 --- response_body
 {"action":"get","count":0,"node":{"dir":true,"key":"/apisix/upstreams","nodes":{}}}
+
+
+
+=== TEST 2: retry_timeout is -1 (INVALID)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/upstreams/a-b-c-ABC_0123',
+                ngx.HTTP_PUT,
+                [[{
+                    "nodes": {
+                        "127.0.0.1:8080": 1,
+                        "127.0.0.1:8090": 1
+                    },
+                    "retry_timeout": -1,
+                    "type": "roundrobin"
+                }]]
+            )
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.print(body)
+        }
+    }
+--- request
+GET /t
+--- error_code: 400
+--- response_body
+{"error_msg":"invalid configuration: property \"retry_timeout\" validation failed: expected -1 to be greater than 0"}
+--- no_error_log
+[error]
diff --git a/t/lib/server.lua b/t/lib/server.lua
index 72f6123f496a..9e393872dd43 100644
--- a/t/lib/server.lua
+++ b/t/lib/server.lua
@@ -356,7 +356,11 @@ end
 
 function _M.mysleep()
     ngx.sleep(tonumber(ngx.var.arg_seconds))
-    ngx.say(ngx.var.arg_seconds)
+    if ngx.var.arg_abort then
+        ngx.exit(ngx.ERROR)
+    else
+        ngx.say(ngx.var.arg_seconds)
+    end
 end
 
 
@@ -417,13 +421,4 @@ function _M.server_error()
     error("500 Internal Server Error")
 end
 
-function _M.retry_error()
-    local sleep = ngx.var.http_x_test_sleep
-    if sleep ~= nil then
-        ngx.sleep(tonumber(sleep))
-    end
-
-    ngx.exit(ngx.ERROR)
-end
-
 return _M
diff --git a/t/node/upstream-retries.t b/t/node/upstream-retries.t
index 6ced7bb41f39..89ff86c0795b 100644
--- a/t/node/upstream-retries.t
+++ b/t/node/upstream-retries.t
@@ -288,7 +288,7 @@ proxy request to 127.0.0.1:1/
                             "retry_timeout": 2,
                             "type": "roundrobin"
                         },
-                        "uri": "/retry_error"
+                        "uri": "/mysleep"
                 }]]
                 )
 
@@ -299,10 +299,8 @@ proxy request to 127.0.0.1:1/
             local http = require "resty.http"
             local httpc = http.new()
             local uri = "http://127.0.0.1:" .. ngx.var.server_port
-                        .. "/retry_error"
-            local res, err = httpc:request_uri(uri, {headers={
-                ["X-Test-Sleep"] = "1"
-            }})
+                        .. "/mysleep?abort=true&seconds=1"
+            local res, err = httpc:request_uri(uri)
             if not res then
                 ngx.say(err)
                 return

From 650adf28d39b4d0b271276ba21bc767dc9c41896 Mon Sep 17 00:00:00 2001
From: "zhuo.chen" <zhuo.chen@upai.com>
Date: Tue, 13 Jul 2021 09:52:07 +0800
Subject: [PATCH 3/3] resolve discussion

---
 docs/en/latest/admin-api.md | 2 +-
 docs/zh/latest/admin-api.md | 2 +-
 t/admin/upstream3.t         | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/en/latest/admin-api.md b/docs/en/latest/admin-api.md
index 6ff9ede2e15e..30e78c3ae62a 100644
--- a/docs/en/latest/admin-api.md
+++ b/docs/en/latest/admin-api.md
@@ -546,7 +546,7 @@ In addition to the basic complex equalization algorithm selection, APISIX's Upst
 |key             |optional|This option is only valid if the `type` is `chash`. Find the corresponding node `id` according to `hash_on` and `key`. When `hash_on` is set as `vars`, `key` is the required parameter, for now, it support nginx built-in variables like `uri, server_name, server_addr, request_uri, remote_port, remote_addr, query_string, host, hostname, arg_***`, `arg_***` is arguments in the request line, [Nginx variables list](http://nginx.org/en/docs/varindex.html). When `hash_on` is set as `header`, `key` is the required parameter, and `header name` is customized. When `hash_on` is set to `cookie`, `key` is the required parameter, and `cookie name` is customized. When `hash_on` is set to `consumer`, `key` does not need to be set. In this case, the `key` adopted by the hash algorithm is the `consumer_name` authenticated. If the specified `hash_on` and `key` can not fetch values, it will be fetch `remote_addr` by default.|
 |checks          |optional|Configure the parameters of the health check. For details, refer to [health-check](health-check.md).|
 |retries         |optional|Pass the request to the next upstream using the underlying Nginx retry mechanism, the retry mechanism is enabled by default and set the number of retries according to the number of available backend nodes. If `retries` option is explicitly set, it will override the default value. `0` means disable retry mechanism.|
-|retry_timeout   |optional|Limits the time during which a request can be passed to the next upstream. `0` means disable retry timeout mechanism.|
+|retry_timeout   |optional|Limits the amount of time that retries can be continued, and do not continue retries if the previous request and retry requests have taken too long. `0` means disable retry timeout mechanism.|
 |timeout         |optional| Set the timeout for connecting, sending and receiving messages. |
 |name            |optional|Identifies upstream names|
 |desc            |optional|upstream usage scenarios, and more.|
diff --git a/docs/zh/latest/admin-api.md b/docs/zh/latest/admin-api.md
index c3d1fd6a714c..cdbec67bddb7 100644
--- a/docs/zh/latest/admin-api.md
+++ b/docs/zh/latest/admin-api.md
@@ -551,7 +551,7 @@ APISIX 的 Upstream 除了基本的负载均衡算法选择外，还支持对上
 | key            | 条件必需                           | 匹配类型       | 该选项只有类型是 `chash` 才有效。根据 `key` 来查找对应的 node `id`，相同的 `key` 在同一个对象中，永远返回相同 id，目前支持的 Nginx 内置变量有 `uri, server_name, server_addr, request_uri, remote_port, remote_addr, query_string, host, hostname, arg_***`，其中 `arg_***` 是来自 URL 的请求参数，[Nginx 变量列表](http://nginx.org/en/docs/varindex.html) |                                                  |
 | checks         | 可选                               | health_checker | 配置健康检查的参数，详细可参考[health-check](health-check.md)                                                                                                                                                                                                                                                                                               |                                                  |
 | retries        | 可选                               | 整型           | 使用底层的 Nginx 重试机制将请求传递给下一个上游，默认启用重试且次数为后端可用的 node 数量。如果指定了具体重试次数，它将覆盖默认值。`0` 代表不启用重试机制。                                                                                                                                                                                                 |                                                  |
-| retry_timeout  | 可选                               | 整型           | 限制请求传递给下一个上游的时间。`0` 代表不启用重试机制。                                                                                                                                                                                                 |                                                  |
+| retry_timeout  | 可选                               | 整型           | 限制是否继续重试的时间，若之前的请求和重试请求花费太多时间就不再继续重试。`0` 代表不启用重试超时机制。                                                                                                                                                                                                 |                                                  |
 | timeout        | 可选                               | 超时时间对象   | 设置连接、发送消息、接收消息的超时时间                                                                                                                                                                                                                                                                                                                      |                                                  |
 | hash_on        | 可选                               | 辅助           | `hash_on` 支持的类型有 `vars`（Nginx 内置变量），`header`（自定义 header），`cookie`，`consumer`，默认值为 `vars`                                                                                                                                                                                                                                           |
 | name           | 可选                               | 辅助           | 标识上游服务名称、使用场景等。                                                                                                                                                                                                                                                                                                                              |                                                  |
diff --git a/t/admin/upstream3.t b/t/admin/upstream3.t
index 2b542882938a..fc24a4faf34d 100644
--- a/t/admin/upstream3.t
+++ b/t/admin/upstream3.t
@@ -91,5 +91,3 @@ GET /t
 --- error_code: 400
 --- response_body
 {"error_msg":"invalid configuration: property \"retry_timeout\" validation failed: expected -1 to be greater than 0"}
---- no_error_log
-[error]