Skip to content

K8s: Improve Node checks for liveness probe and preStop hook #2661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions charts/selenium-grid/configs/node/nodePreStop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,15 @@ function signal_hub_to_drain_node() {
BASIC_AUTH="$(echo -en "${SE_ROUTER_USERNAME}:${SE_ROUTER_PASSWORD}" | base64 -w0)"
if [ -n "${grid_url}" ]; then
if [ "${grid_check}" = "401" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD."
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check env vars SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD are given."
elif [ "${grid_check}" = "404" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint could not be found. Please check the endpoint ${grid_url}"
elif [ "${grid_check}" = "200" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint is reachable. Signaling Hub/Router to drain node"
curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}"
else
echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint returns ${grid_check}. Skip signaling upstream."
fi
echo "$(date -u +"${ts_format}") [${probe_name}] - Signaling Hub/Router to drain node"
curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}"
else
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} ignores to send drain request to upstream."
fi
Expand All @@ -63,28 +66,38 @@ function signal_node_to_drain() {
}

if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status > ${tmp_node_file}; then
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "")
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || echo "")
if [ -n "${NODE_ID}" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Current Node ID is: ${NODE_ID}"
signal_hub_to_drain_node
echo
fi
signal_node_to_drain
# Wait for the current session to be finished if any
check_attempts=0
while true; do
# Attempt the cURL request and capture the exit status
endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file} -w "%{http_code}")
endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}" -w "%{http_code}")
endpoint_status=$?
echo "$(date -u +"${ts_format}") [${probe_name}] - Fetch the Node status via cURL with exit status: ${endpoint_status}, HTTP code: ${endpoint_http_code}"

SLOT_HAS_SESSION=$(jq -e ".value.node.slots[]|select(.session != null).id.id" ${tmp_node_file} | tr -d '"' || "")
if [ -z "${SLOT_HAS_SESSION}" ]; then
SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0)
if [ "${SLOT_HAS_SESSION}" -eq 0 ] && [ "${endpoint_http_code}" = "200" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no session running. Node is ready to be terminated."
echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat ${tmp_node_file} || "")"
echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat "${tmp_node_file}" || echo "")"
echo
exit 0
elif [ "${endpoint_http_code}" != "200" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code}, attempt one more time to confirm the status."
check_attempts=$((check_attempts+1))
if [ ${check_attempts} -ge 3 ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code} for serveral times. Assume that Node is ready to be terminated."
exit 0
fi
sleep 2;
else
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ${probe_name} is waiting for current session on slot ${SLOT_HAS_SESSION} to be finished. Node details: message: $(jq -r '.value.message' ${tmp_node_file} || "unknown"), availability: $(jq -r '.value.node.availability' ${tmp_node_file} || "unknown")"
check_attempts=0
echo "$(date -u +"${ts_format}") [${probe_name}] - Node is waiting for ${SLOT_HAS_SESSION} session(s) to be finished. Node details: message: $(jq -r '.value.message' "${tmp_node_file}" || echo "unknown"), availability: $(jq -r '.value.node.availability' "${tmp_node_file}" || echo "unknown")"
sleep 2;
fi

Expand Down
18 changes: 12 additions & 6 deletions charts/selenium-grid/configs/node/nodeProbe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ function help_message() {
echo "$(date -u +"${ts_format}") [${probe_name}] - If you believe Node is registered successfully but probe still report this message and fail for a long time. Workaround by set 'global.seleniumGrid.defaultNodeStartupProbe' to 'httpGet' and report us an issue for Chart improvement with your scenario."
}

if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file}; then
NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "")
NODE_STATUS=$(jq -r '.value.node.availability' ${tmp_node_file} || "")
if curl --noproxy "*" -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}"; then
NODE_ID=$(jq -r '.value.node.nodeId' "${tmp_node_file}" || echo "")
NODE_STATUS=$(jq -r '.value.node.availability' "${tmp_node_file}" || echo "")
SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0)
if [ -n "${NODE_ID}" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}"
echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}. Number of ongoing sessions: ${SLOT_HAS_SESSION}"
else
echo "$(date -u +"${ts_format}") [${probe_name}] - Wait for the Node to report its status"
exit 1
Expand All @@ -52,15 +53,20 @@ if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE
echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} will not work as expected."
fi

curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o ${tmp_grid_file}
GRID_NODE_ID=$(jq -e ".value.nodes[].id|select(. == \"${NODE_ID}\")" ${tmp_grid_file} | tr -d '"' || "")
endpoint_http_code=$(curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o "${tmp_grid_file}" -w "%{http_code}")
GRID_NODE_ID=$(jq -e ".value.nodes[]?.id|select(. == \"${NODE_ID}\")" "${tmp_grid_file}" | tr -d '"' || echo "")
if [ -n "${GRID_NODE_ID}" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Grid responds a matched Node ID: ${GRID_NODE_ID}"
elif [ "${endpoint_http_code}" != "200" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Grid endpoint returns ${endpoint_http_code}. Skip checking upstream."
fi

if [ -n "${NODE_ID}" ] && [ -n "${GRID_NODE_ID}" ] && [ "${NODE_ID}" = "${GRID_NODE_ID}" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is found in the Grid. Node is ready."
exit 0
elif [ -n "${NODE_ID}" ] && [ "${endpoint_http_code}" != "200" ]; then
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} report its status, but could not double check ID in Hub. Assume that Node is ready."
exit 0
else
echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is not found in the Grid. Node is not ready."
exit 1
Expand Down
Loading