Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle empty NODE_ID in Elasticsearch PreStop hook #7892

Merged
merged 34 commits into from
Jul 4, 2024
Merged
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
482f23b
fix: cleanup pre-stop-hook-script.sh
BobVanB Jun 11, 2024
fb62e1d
fix: update the exit default from delayed_exit
BobVanB Jun 11, 2024
9561a9f
fix: use spaces inside shellscript
BobVanB Jun 11, 2024
1442ef1
fix: use spaces inside shellscript
BobVanB Jun 11, 2024
18f1b87
fix: cleanup grep for NODE_ID
BobVanB Jun 11, 2024
46d08b3
fix: spaces before param
BobVanB Jun 11, 2024
8f53e76
fix: update retry to 7
BobVanB Jun 11, 2024
c3a8fd9
fix: temp return expanded param for basic auth (wrong)
BobVanB Jun 11, 2024
99a6d51
fix: update retry to 8
BobVanB Jun 11, 2024
9e22058
fix: add working version with alot of debug info
BobVanB Jun 12, 2024
452fb48
fix: cleanup debug code
BobVanB Jun 12, 2024
8f1cea9
fix: remove comments
BobVanB Jun 12, 2024
0e8c5c4
fix: update retry to 8
BobVanB Jun 12, 2024
6ee066b
fix; update set flags
BobVanB Jun 12, 2024
2954da3
fix: remove comments
BobVanB Jun 12, 2024
d8db45d
fix: use simpel definition for argument parameter
BobVanB Jun 12, 2024
ecf5746
Merge branch 'main' into pre-stop-hook-script
BobVanB Jun 13, 2024
48e005c
Merge branch 'main' into pre-stop-hook-script
BobVanB Jun 13, 2024
03d6bc0
fix: correct BASIC_AUTH array
BobVanB Jun 13, 2024
cf587e6
fix: remove shellcheck disabled comment
BobVanB Jun 14, 2024
97f12ed
fix: prevent globbing
BobVanB Jun 14, 2024
cb2bf9e
fix: globbing message
BobVanB Jun 14, 2024
29234cd
fix: smaller footprint
BobVanB Jun 14, 2024
a74bc1d
fix: smaller footprint
BobVanB Jun 14, 2024
acdf865
Update pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
BobVanB Jun 27, 2024
2e6ea64
Update pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
BobVanB Jun 27, 2024
b85c7e8
Update pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
BobVanB Jun 27, 2024
a56d570
Update pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
BobVanB Jun 27, 2024
4cc0bd0
Merge branch 'main' into pre-stop-hook-script
BobVanB Jun 28, 2024
5f955e6
fix: return grep to original
BobVanB Jun 29, 2024
e7c69da
Update pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
BobVanB Jul 2, 2024
94ca9f9
Merge branch 'main' into pre-stop-hook-script
BobVanB Jul 3, 2024
3285537
Variabilize and comment magic number
thbkrkr Jul 3, 2024
953cf00
Merge branch 'main' into pre-stop-hook-script
BobVanB Jul 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 45 additions & 35 deletions pkg/controller/elasticsearch/nodespec/lifecycle_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ var preStopHookScriptTemplate = template.Must(template.New("pre-stop").Parse(`#!

set -uo pipefail

# This script will wait for up to $PRE_STOP_ADDITIONAL_WAIT_SECONDS before allowing termination of the Pod
# This script will wait for up to $PRE_STOP_ADDITIONAL_WAIT_SECONDS before allowing termination of the Pod
# This slows down the process shutdown and allows changes to the pool to be made gracefully, without blackholing traffic when DNS
# still contains the IP that is already inactive.
# still contains the IP that is already inactive.
# As this runs in parallel to grace period after which process is SIGKILLed,
# it should be set to allow enough time for the process to gracefully terminate.
# It allows kube-proxy to refresh its rules and remove the terminating Pod IP.
Expand Down Expand Up @@ -69,20 +69,23 @@ global_dns_error_cnt=0

# request performs an HTTP call with curl and reports whether it succeeded.
# Arguments: all arguments are passed through to curl unchanged; the third
#            argument is used as the URL in the failure log line.
# Globals:   resp_body (read; file the response body is written to),
#            global_dns_error_cnt (incremented on curl exit code 6, reset to 0
#            on success).
# Returns:   0 when curl exits 0 and the HTTP status is in 200-299; otherwise
#            curl's exit code, or 1 if curl itself exited 0.
# NOTE(review): the near-identical adjacent lines in this function look like the
# before/after sides of a rendered diff; confirm against the real file.
function request() {
local status exit
# the body goes to the $resp_body file; only the HTTP status code is captured in $status
status=$(curl -k -sS -o "$resp_body" -w "%{http_code}" "$@")
status=$(curl -k -sS -o "${resp_body}" -w "%{http_code}" "$@")
exit=$?
if [ "$exit" -ne 0 ] || [ "$status" -lt 200 ] || [ "$status" -gt 299 ]; then
# track curl DNS errors separately
if [ "$exit" -eq 6 ]; then ((global_dns_error_cnt++)); fi
# make sure we have a non-zero exit code in the presence of errors
if [ "$exit" -eq 0 ]; then exit=1; fi
log "$status" "$3" #by convention the third arg contains the URL
log "$status" "$3" #by convention the third arg contains the URL
return $exit
fi
# a successful request clears the DNS error counter
global_dns_error_cnt=0
return 0
}

# number of retries, chosen so that retrying does not last longer than the default terminationGracePeriodSeconds (0 + 1 + 2 + 4 + 8 + 16 + 32 + 64 < 180s)
retries_count=8

function retry() {
local retries=$1
shift
Expand Down Expand Up @@ -113,25 +116,25 @@ function log() {
}

# error_exit logs the given message arguments and then terminates the script
# through delayed_exit with status 1.
# Arguments: the message to log.
# NOTE(review): the two log lines look like the before/after sides of a rendered
# diff ("$@" passes each argument separately, "$*" joins them into one string);
# confirm against the real file.
function error_exit() {
log "$@"
log "$*"
delayed_exit 1
}

# delayed_exit exits the script, first sleeping for whatever is left of
# PRE_STOP_ADDITIONAL_WAIT_SECONDS.
# Arguments: $1 - optional exit status; 0 is used when it is not set.
# Globals:   script_start (read), PRE_STOP_ADDITIONAL_WAIT_SECONDS (read).
# NOTE(review): presumably duration returns the seconds elapsed since the given
# start value; its definition is not shown here, so verify.
# NOTE(review): the near-identical adjacent lines look like the before/after
# sides of a rendered diff; confirm against the real file.
function delayed_exit() {
local elapsed
elapsed=$(duration "$script_start")
elapsed=$(duration "${script_start}")
local remaining=$((PRE_STOP_ADDITIONAL_WAIT_SECONDS - elapsed))
# the wait budget is already used up: exit immediately
if (( remaining < 0 )); then
exit ${1-0}
exit "${1-0}"
fi
log "delaying termination for $remaining seconds"
log "delaying termination for ${remaining} seconds"
sleep $remaining
exit ${1-0}
exit "${1-0}"
}

function supports_node_shutdown() {
local version="$1"
version=${version#[vV]}
version="${version#[vV]}"
major="${version%%\.*}"
minor="${version#*.}"
minor="${minor%.*}"
Expand All @@ -146,62 +149,69 @@ function supports_node_shutdown() {
# Determine the Elasticsearch version from the labels file and stop early
# (through delayed_exit, status 0) when supports_node_shutdown rejects it.
# version stays empty when the labels file does not exist.
# {{.LabelsFile}} and {{.VersionLabelName}} are template placeholders filled in
# before the script runs.
# NOTE(review): the near-identical adjacent lines look like the before/after
# sides of a rendered diff; confirm against the real file.
version=""
if [[ -f "{{.LabelsFile}}" ]]; then
# get Elasticsearch version from the downward API
# keep the second '='-separated field of the matching line
version=$(grep "{{.VersionLabelName}}" {{.LabelsFile}} | cut -d '=' -f 2)
version=$(grep "{{.VersionLabelName}}" "{{.LabelsFile}}" | cut -d '=' -f 2)
# remove quotes
version=$(echo "${version}" | tr -d '"')
fi

# if ES version does not support node shutdown exit early
if ! supports_node_shutdown "$version"; then
delayed_exit
delayed_exit
fi

# Build the curl basic-auth arguments from the password file, or give up on
# shutdown orchestration (delayed_exit, status 0) when the file is missing.
# Also sets ES_URL to the templated service URL.
# NOTE(review): the paired lines look like the before/after sides of a rendered
# diff. One side builds BASIC_AUTH as a single string, the other as an array so
# that "-u" and the credentials stay two separate words. Confirm against the
# real file.
# setup basic auth if credentials are available
if [ -f "{{.PreStopUserPasswordPath}}" ]; then
# $(<file) reads the file contents without spawning cat
PROBE_PASSWORD=$(<{{.PreStopUserPasswordPath}})
BASIC_AUTH="-u {{.PreStopUserName}}:${PROBE_PASSWORD}"
PROBE_PASSWORD=$(<"{{.PreStopUserPasswordPath}}")
BASIC_AUTH=("-u" "{{.PreStopUserName}}:${PROBE_PASSWORD}")
else
# typically the case on upgrades from versions that did not have this script yet and the necessary volume mounts are missing
log "no API credentials available, will not attempt node shutdown orchestration from pre-stop hook"
delayed_exit
fi

ES_URL={{.ServiceURL}}
ES_URL="{{.ServiceURL}}"

# Fetch the node list (id and name columns, full IDs) into the $resp_body file,
# then set NODE_ID to the first space-separated field of the line containing
# $POD_NAME. Either step failing ends the script through error_exit.
# Because the script sets pipefail, a grep that finds no match makes the
# NODE_ID assignment fail in the "if !" form below.
# NOTE(review): grep treats $POD_NAME as a pattern and matches substrings, so a
# pod name that is a prefix of another (e.g. "...-1" vs "...-10") could yield
# more than one ID; verify whether an exact match on the name column is needed.
# NOTE(review): the paired old-style/new-style statements look like the
# before/after sides of a rendered diff; confirm against the real file.
log "retrieving node ID"
retry 10 request -X GET "$ES_URL/_cat/nodes?full_id=true&h=id,name" $BASIC_AUTH
if [ "$?" -ne 0 ]; then
error_exit "failed to retrieve node ID"
if ! retry "$retries_count" request -X GET "${ES_URL}/_cat/nodes?full_id=true&h=id,name" "${BASIC_AUTH[@]}"
then
error_exit "failed to retrieve nodes"
fi

NODE_ID=$(grep "$POD_NAME" "$resp_body" | cut -f 1 -d ' ')
if ! NODE_ID="$(grep "${POD_NAME}" "${resp_body}" | cut -f 1 -d ' ')"
then
error_exit "failed to extract node id"
fi

# Query the shutdown status of this node. If the response has a line that does
# not contain an empty "nodes" array, a shutdown request already exists: log
# that it is managed by the ECK operator and finish through delayed_exit.
# NOTE(review): the paired statements look like the before/after sides of a
# rendered diff, and the two lines after the "if ! request" line are page text
# rather than script content; confirm against the real file.
# check if there is an ongoing shutdown request
request -X GET $ES_URL/_nodes/"$NODE_ID"/shutdown $BASIC_AUTH
if ! request -X GET "${ES_URL}/_nodes/${NODE_ID}/shutdown" "${BASIC_AUTH[@]}"
BobVanB marked this conversation as resolved.
Show resolved Hide resolved
then
error_exit "failed to retrieve shutdown status"
fi

# grep -v -q succeeds when at least one line does NOT match '"nodes":[]'
if grep -q -v '"nodes":\[\]' "$resp_body"; then
log "shutdown managed by ECK operator"
delayed_exit
log "shutdown managed by ECK operator"
delayed_exit
fi

# Register a shutdown for this node: PUT a JSON body with "type" taken from
# $shutdown_type and "reason" set to "pre-stop hook" to the node's shutdown
# endpoint, retrying through retry. A final failure ends the script through
# error_exit.
# NOTE(review): the old-style and new-style statements are interleaved here,
# including inside the multi-line -d string, which looks like the before/after
# sides of a rendered diff; confirm against the real file.
log "initiating node shutdown"
retry 10 request -X PUT $ES_URL/_nodes/"$NODE_ID"/shutdown $BASIC_AUTH -H 'Content-Type: application/json' -d"
if ! retry "$retries_count" request -X PUT "${ES_URL}/_nodes/${NODE_ID}/shutdown" "${BASIC_AUTH[@]}" -H 'Content-Type: application/json' -d"
{
\"type\": \"$shutdown_type\",
\"type\": \"${shutdown_type}\",
\"reason\": \"pre-stop hook\"
}
"
if [ "$?" -ne 0 ]; then
error_exit "failed to call node shutdown API"
}"
then
error_exit "failed to call node shutdown API"
fi

# Poll the node's shutdown status every 10 seconds. The loop ends once a request
# succeeds and the response has a line containing neither IN_PROGRESS nor
# STALLED. The loop has no retry limit of its own. Afterwards the script
# finishes through delayed_exit (status 0).
# NOTE(review): the two loop bodies look like the before/after sides of a
# rendered diff; confirm against the real file.
while :
do
log "waiting for node shutdown to complete"
request -X GET $ES_URL/_nodes/"$NODE_ID"/shutdown $BASIC_AUTH
if [ "$?" -eq 0 ] && grep -q -v 'IN_PROGRESS\|STALLED' "$resp_body"; then
break
fi
sleep 10
do
log "waiting for node shutdown to complete"
if request -X GET "${ES_URL}/_nodes/${NODE_ID}/shutdown" "${BASIC_AUTH[@]}" &&
grep -q -v 'IN_PROGRESS\|STALLED' "$resp_body"
then
break
fi
sleep 10
done

delayed_exit
Expand Down