Skip to content

Commit

Permalink
feat: enable etcd health-check (#4191)
Browse files Browse the repository at this point in the history
  • Loading branch information
Yiyiyimu authored Jun 30, 2021
1 parent 20d9dd2 commit 994f020
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 11 deletions.
1 change: 1 addition & 0 deletions apisix/cli/ngx_tpl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ http {
lua_shared_dict plugin-limit-count-redis-cluster-slot-lock 1m;
lua_shared_dict tracing_buffer 10m; # plugin: skywalking
lua_shared_dict plugin-api-breaker 10m;
lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
# for openid-connect and authz-keycloak plugin
lua_shared_dict discovery 1m; # cache for discovery metadata documents
Expand Down
46 changes: 44 additions & 2 deletions apisix/core/config_etcd.lua
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,15 @@ local tostring = tostring
local tonumber = tonumber
local xpcall = xpcall
local debug = debug
local string = string
local error = error
local rand = math.random
local constants = require("apisix.constants")
local health_check = require("resty.etcd.health_check")


local is_http = ngx.config.subsystem == "http"
local err_etcd_unhealthy_all = "has no healthy etcd endpoint available"
local created_obj = {}
local loaded_configuration = {}

Expand Down Expand Up @@ -146,7 +149,11 @@ local function waitdir(etcd_cli, key, modified_index, timeout)
end

if type(res.result) ~= "table" then
return nil, "failed to wait etcd dir"
err = "failed to wait etcd dir"
if res.error and res.error.message then
err = err .. ": " .. res.error.message
end
return nil, err
end
return etcd_apisix.watch_format(res)
end
Expand Down Expand Up @@ -529,6 +536,18 @@ local function _automatic_fetch(premature, self)
return
end

if not health_check.conf then
local _, err = health_check.init({
shm_name = "etcd_cluster_health_check",
fail_timeout = self.health_check_timeout,
max_fails = 3,
retry = true,
})
if err then
log.warn("fail to create health_check: " .. err)
end
end

local i = 0
while not exiting() and self.running and i <= 32 do
i = i + 1
Expand All @@ -545,7 +564,25 @@ local function _automatic_fetch(premature, self)

local ok, err = sync_data(self)
if err then
if err ~= "timeout" and err ~= "Key not found"
if string.find(err, err_etcd_unhealthy_all) then
local reconnected = false
while err and not reconnected and i <= 32 do
local backoff_duration, backoff_factor, backoff_step = 1, 2, 6
for _ = 1, backoff_step do
i = i + 1
ngx_sleep(backoff_duration)
_, err = sync_data(self)
if not err or not string.find(err, err_etcd_unhealthy_all) then
log.warn("reconnected to etcd")
reconnected = true
break
end
backoff_duration = backoff_duration * backoff_factor
log.error("no healthy etcd endpoint available, next retry after "
.. backoff_duration .. "s")
end
end
elseif err ~= "timeout" and err ~= "Key not found"
and self.last_err ~= err then
log.error("failed to fetch data from etcd: ", err, ", ",
tostring(self))
Expand Down Expand Up @@ -594,6 +631,10 @@ function _M.new(key, opts)
if not resync_delay or resync_delay < 0 then
resync_delay = 5
end
local health_check_timeout = etcd_conf.health_check_timeout
if not health_check_timeout or health_check_timeout < 0 then
health_check_timeout = 10
end

local automatic = opts and opts.automatic
local item_schema = opts and opts.item_schema
Expand All @@ -618,6 +659,7 @@ function _M.new(key, opts)
last_err = nil,
last_err_time = nil,
resync_delay = resync_delay,
health_check_timeout = health_check_timeout,
timeout = timeout,
single_item = single_item,
filter = filter_fun,
Expand Down
1 change: 1 addition & 0 deletions conf/config-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ etcd:
prefix: "/apisix" # apisix configurations prefix
timeout: 30 # 30 seconds
#resync_delay: 5 # when sync failed and a rest is needed, resync after the configured seconds plus 50% random jitter
#health_check_timeout: 10 # unhealthy etcd nodes are retried after the configured number of seconds
#user: root # root username for etcd
#password: 5tHkHhYkjr6cQY # root password for etcd
tls:
Expand Down
2 changes: 1 addition & 1 deletion rockspec/apisix-master-0.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = {
"lua-resty-ctxdump = 0.1-0",
"lua-resty-dns-client = 5.2.0",
"lua-resty-template = 2.0",
"lua-resty-etcd = 1.5.0",
"lua-resty-etcd = 1.5.3",
"lua-resty-balancer = 0.02rc5",
"lua-resty-ngxvar = 0.5.2",
"lua-resty-jit-uuid = 0.0.7",
Expand Down
1 change: 1 addition & 0 deletions t/APISIX.pm
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ _EOC_
lua_shared_dict discovery 1m; # plugin authz-keycloak
lua_shared_dict plugin-api-breaker 10m;
lua_capture_error_log 1m; # plugin error-log-logger
lua_shared_dict etcd_cluster_health_check 10m; # etcd health check
proxy_ssl_name \$upstream_host;
proxy_ssl_server_name on;
Expand Down
69 changes: 69 additions & 0 deletions t/cli/docker-compose-etcd-cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Three-member etcd cluster (etcd0, etcd1, etcd2) for the CLI health-check test.
# Every member lists the same three peers in ETCD_INITIAL_CLUSTER and shares
# the cluster token "etcd-cluster". Each container's client port 2379 and
# peer port 2380 are published on distinct host ports (2379x / 2380x) so all
# three members can be reached side by side through 127.0.0.1.
version: "3.7"

services:
# Member 0: client API on host port 23790, peer traffic on host port 23800.
etcd0:
image: "gcr.io/etcd-development/etcd:v3.4.15"
container_name: etcd0
ports:
- "23800:2380"
- "23790:2379"
environment:
# NOTE(review): ALLOW_NONE_AUTHENTICATION looks like a Bitnami-image
# setting; confirm whether the gcr.io etcd image reads it at all.
- ALLOW_NONE_AUTHENTICATION=yes
- ETCD_NAME=etcd0
- ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
- ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
# Advertised client URL uses the published host port, not the container port.
- ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23790
- ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd0:2380
- ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
- ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
- ETCD_INITIAL_CLUSTER_STATE=new

# Member 1: client API on host port 23791, peer traffic on host port 23801.
etcd1:
image: "gcr.io/etcd-development/etcd:v3.4.15"
container_name: etcd1
ports:
- "23801:2380"
- "23791:2379"
environment:
- ALLOW_NONE_AUTHENTICATION=yes
- ETCD_NAME=etcd1
- ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
- ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
- ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23791
- ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd1:2380
- ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
- ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
- ETCD_INITIAL_CLUSTER_STATE=new

# Member 2: client API on host port 23792, peer traffic on host port 23802.
etcd2:
image: "gcr.io/etcd-development/etcd:v3.4.15"
container_name: etcd2
ports:
- "23802:2380"
- "23792:2379"
environment:
- ALLOW_NONE_AUTHENTICATION=yes
- ETCD_NAME=etcd2
- ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380
- ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
- ETCD_ADVERTISE_CLIENT_URLS=http://127.0.0.1:23792
- ETCD_INITIAL_ADVERTISE_PEER_URLS=http://etcd2:2380
- ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster
- ETCD_INITIAL_CLUSTER=etcd0=http://etcd0:2380,etcd1=http://etcd1:2380,etcd2=http://etcd2:2380
- ETCD_INITIAL_CLUSTER_STATE=new
92 changes: 92 additions & 0 deletions t/cli/test_etcd_healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

. ./t/cli/common.sh

# Container names of the 3-node etcd cluster declared in
# t/cli/docker-compose-etcd-cluster.yaml.
ETCD_NAME_0=etcd0
ETCD_NAME_1=etcd1
ETCD_NAME_2=etcd2
# Written to etcd.health_check_timeout in conf/config.yaml below. The recovery
# check in the second scenario waits this many seconds after the outage begins
# before it expects the Admin API to answer again.
HEALTH_CHECK_RETRY_TIMEOUT=10

# Print the HTTP status code returned by a GET on the Admin API route list.
# curl prints "000" when no HTTP response was received at all.
admin_routes_status() {
    curl -o /dev/null -s -w '%{http_code}' http://127.0.0.1:9080/apisix/admin/routes -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1'
}

# Point APISIX at the three etcd endpoints published on the host by the
# compose file.
echo '
etcd:
  host:
    - "http://127.0.0.1:23790"
    - "http://127.0.0.1:23791"
    - "http://127.0.0.1:23792"
  health_check_timeout: '"$HEALTH_CHECK_RETRY_TIMEOUT"'
' > conf/config.yaml

docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d

# Scenario 1: APISIX must keep serving while a single etcd node is down.
make init && make run

docker stop "${ETCD_NAME_0}"
code=$(admin_routes_status)
# Compare as strings: an empty or non-numeric status then counts as a failure
# instead of making `[` error out and silently skip this branch.
if [ "$code" != "200" ]; then
    echo "failed: apisix got effect when one etcd node out of a cluster disconnected"
    exit 1
fi
docker start "${ETCD_NAME_0}"

docker stop "${ETCD_NAME_1}"
code=$(admin_routes_status)
if [ "$code" != "200" ]; then
    echo "failed: apisix got effect when one etcd node out of a cluster disconnected"
    exit 1
fi
docker start "${ETCD_NAME_1}"

make stop

echo "passed: apisix not got effected when one etcd node disconnected"

# Scenario 2: with every etcd node down the Admin API must fail, and APISIX
# must recover once the nodes are back and the health-check timeout elapsed.
make init && make run

docker stop "${ETCD_NAME_0}" && docker stop "${ETCD_NAME_1}" && docker stop "${ETCD_NAME_2}"

# Earliest time at which the health check retries the unhealthy nodes.
# Computed with shell arithmetic rather than `date -d` so it does not depend
# on an unset variable or on GNU-only date parsing.
sleep_till=$(( $(date +%s) + HEALTH_CHECK_RETRY_TIMEOUT ))

code=$(admin_routes_status)
if [ "$code" = "200" ]; then
    echo "failed: apisix not got effect when all etcd nodes disconnected"
    exit 1
fi

docker start "${ETCD_NAME_0}" && docker start "${ETCD_NAME_1}" && docker start "${ETCD_NAME_2}"

# Sleep until the etcd health check is due to probe the nodes again.
current_time=$(date +%s)
sleep_seconds=$(( sleep_till - current_time ))
if [ "$sleep_seconds" -gt 0 ]; then
    sleep "$sleep_seconds"
fi

code=$(admin_routes_status)
if [ "$code" != "200" ]; then
    echo "failed: apisix could not recover when etcd node recover"
    exit 1
fi

make stop

echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected"
16 changes: 8 additions & 8 deletions t/core/config_etcd.t
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ etcd:
--- request
GET /t
--- grep_error_log eval
qr{failed to fetch data from etcd: connection refused, etcd key: .*routes}
qr{connection refused}
--- grep_error_log_out eval
qr/(failed to fetch data from etcd: connection refused, etcd key: .*routes\n){1,}/
qr/(connection refused){1,}/



Expand All @@ -68,9 +68,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
failed to fetch data from etcd: handshake failed
handshake failed
--- grep_error_log_out eval
qr/(failed to fetch data from etcd: handshake failed){1,}/
qr/(handshake failed){1,}/



Expand All @@ -92,9 +92,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
failed to fetch data from etcd: closed
closed
--- grep_error_log_out eval
qr/(failed to fetch data from etcd: closed){1,}/
qr/(closed){1,}/



Expand All @@ -116,9 +116,9 @@ etcd:
--- request
GET /t
--- grep_error_log chop
failed to fetch data from etcd: 18: self signed certificate
18: self signed certificate
--- grep_error_log_out eval
qr/(failed to fetch data from etcd: 18: self signed certificate){1,}/
qr/(18: self signed certificate){1,}/



Expand Down

0 comments on commit 994f020

Please sign in to comment.