From 8e454ba43d9c3cc63a8ec66d41f2d795798a3070 Mon Sep 17 00:00:00 2001 From: Viet Nguyen Duc Date: Tue, 18 Feb 2025 15:56:37 +0700 Subject: [PATCH] K8s: Improve Node checks for liveness probe and preStop hook Signed-off-by: Viet Nguyen Duc --- .../selenium-grid/configs/node/nodePreStop.sh | 31 +++++++++++++------ .../selenium-grid/configs/node/nodeProbe.sh | 18 +++++++---- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/charts/selenium-grid/configs/node/nodePreStop.sh b/charts/selenium-grid/configs/node/nodePreStop.sh index e3dfaead4e..cdeb49c391 100644 --- a/charts/selenium-grid/configs/node/nodePreStop.sh +++ b/charts/selenium-grid/configs/node/nodePreStop.sh @@ -46,12 +46,15 @@ function signal_hub_to_drain_node() { BASIC_AUTH="$(echo -en "${SE_ROUTER_USERNAME}:${SE_ROUTER_PASSWORD}" | base64 -w0)" if [ -n "${grid_url}" ]; then if [ "${grid_check}" = "401" ]; then - echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD." + echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router requires authentication. Please check env vars SE_ROUTER_USERNAME and SE_ROUTER_PASSWORD are given." elif [ "${grid_check}" = "404" ]; then echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint could not be found. Please check the endpoint ${grid_url}" + elif [ "${grid_check}" = "200" ]; then + echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint is reachable. Signaling Hub/Router to drain node" + curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}" + else + echo "$(date -u +"${ts_format}") [${probe_name}] - Hub/Router endpoint returns ${grid_check}. Skip signaling upstream." fi - echo "$(date -u +"${ts_format}") [${probe_name}] - Signaling Hub/Router to drain node" - curl --noproxy "*" -m ${max_time} -k -X POST -H "Authorization: Basic ${BASIC_AUTH}" ${grid_url}/se/grid/distributor/node/${NODE_ID}/drain --header "${HEADERS}" else echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} ignores to send drain request to upstream." fi @@ -63,7 +66,7 @@ function signal_node_to_drain() { } if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status > ${tmp_node_file}; then - NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "") + NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || echo "") if [ -n "${NODE_ID}" ]; then echo "$(date -u +"${ts_format}") [${probe_name}] - Current Node ID is: ${NODE_ID}" signal_hub_to_drain_node @@ -71,20 +74,30 @@ if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE fi signal_node_to_drain # Wait for the current session to be finished if any + check_attempts=0 while true; do # Attempt the cURL request and capture the exit status - endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file} -w "%{http_code}") + endpoint_http_code=$(curl --noproxy "*" --retry ${retry_time} -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}" -w "%{http_code}") endpoint_status=$? echo "$(date -u +"${ts_format}") [${probe_name}] - Fetch the Node status via cURL with exit status: ${endpoint_status}, HTTP code: ${endpoint_http_code}" - SLOT_HAS_SESSION=$(jq -e ".value.node.slots[]|select(.session != null).id.id" ${tmp_node_file} | tr -d '"' || "") - if [ -z "${SLOT_HAS_SESSION}" ]; then + SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0) + if [ "${SLOT_HAS_SESSION}" -eq 0 ] && [ "${endpoint_http_code}" = "200" ]; then echo "$(date -u +"${ts_format}") [${probe_name}] - There is no session running. Node is ready to be terminated." - echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat ${tmp_node_file} || "")" + echo "$(date -u +"${ts_format}") [${probe_name}] - $(cat "${tmp_node_file}" || echo "")" echo exit 0 + elif [ "${endpoint_http_code}" != "200" ]; then + echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code}, attempt one more time to confirm the status." + check_attempts=$((check_attempts+1)) + if [ ${check_attempts} -ge 3 ]; then + echo "$(date -u +"${ts_format}") [${probe_name}] - Node endpoint returned status ${endpoint_http_code} for serveral times. Assume that Node is ready to be terminated." + exit 0 + fi + sleep 2; else - echo "$(date -u +"${ts_format}") [${probe_name}] - Node ${probe_name} is waiting for current session on slot ${SLOT_HAS_SESSION} to be finished. Node details: message: $(jq -r '.value.message' ${tmp_node_file} || "unknown"), availability: $(jq -r '.value.node.availability' ${tmp_node_file} || "unknown")" + check_attempts=0 + echo "$(date -u +"${ts_format}") [${probe_name}] - Node is waiting for ${SLOT_HAS_SESSION} session(s) to be finished. Node details: message: $(jq -r '.value.message' "${tmp_node_file}" || echo "unknown"), availability: $(jq -r '.value.node.availability' "${tmp_node_file}" || echo "unknown")" sleep 2; fi diff --git a/charts/selenium-grid/configs/node/nodeProbe.sh b/charts/selenium-grid/configs/node/nodeProbe.sh index c1ef578ce4..e1fbe23702 100644 --- a/charts/selenium-grid/configs/node/nodeProbe.sh +++ b/charts/selenium-grid/configs/node/nodeProbe.sh @@ -27,11 +27,12 @@ function help_message() { echo "$(date -u +"${ts_format}") [${probe_name}] - If you believe Node is registered successfully but probe still report this message and fail for a long time. Workaround by set 'global.seleniumGrid.defaultNodeStartupProbe' to 'httpGet' and report us an issue for Chart improvement with your scenario." } -if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status -o ${tmp_node_file}; then - NODE_ID=$(jq -r '.value.node.nodeId' ${tmp_node_file} || "") - NODE_STATUS=$(jq -r '.value.node.availability' ${tmp_node_file} || "") +if curl --noproxy "*" -m ${max_time} -sfk "${SE_SERVER_PROTOCOL}://127.0.0.1:${SE_NODE_PORT}/status" -o "${tmp_node_file}"; then + NODE_ID=$(jq -r '.value.node.nodeId' "${tmp_node_file}" || echo "") + NODE_STATUS=$(jq -r '.value.node.availability' "${tmp_node_file}" || echo "") + SLOT_HAS_SESSION=$(jq -r '[.value.node.slots[]? | select(.session != null)] | length' "${tmp_node_file}" || echo 0) if [ -n "${NODE_ID}" ]; then - echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}" + echo "$(date -u +"${ts_format}") [${probe_name}] - Node responds the ID: ${NODE_ID} with status: ${NODE_STATUS}. Number of ongoing sessions: ${SLOT_HAS_SESSION}" else echo "$(date -u +"${ts_format}") [${probe_name}] - Wait for the Node to report its status" exit 1 @@ -52,15 +53,20 @@ if curl --noproxy "*" -m ${max_time} -sfk ${SE_SERVER_PROTOCOL}://127.0.0.1:${SE echo "$(date -u +"${ts_format}") [${probe_name}] - There is no configured HUB/ROUTER host or SE_NODE_GRID_URL isn't set. ${probe_name} will not work as expected." fi - curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o ${tmp_grid_file} - GRID_NODE_ID=$(jq -e ".value.nodes[].id|select(. == \"${NODE_ID}\")" ${tmp_grid_file} | tr -d '"' || "") + endpoint_http_code=$(curl --noproxy "*" -m ${max_time} -H "Authorization: Basic ${BASIC_AUTH}" -sfk "${grid_url}/status" -o "${tmp_grid_file}" -w "%{http_code}") + GRID_NODE_ID=$(jq -e ".value.nodes[]?.id|select(. == \"${NODE_ID}\")" "${tmp_grid_file}" | tr -d '"' || echo "") if [ -n "${GRID_NODE_ID}" ]; then echo "$(date -u +"${ts_format}") [${probe_name}] - Grid responds a matched Node ID: ${GRID_NODE_ID}" + elif [ "${endpoint_http_code}" != "200" ]; then + echo "$(date -u +"${ts_format}") [${probe_name}] - Grid endpoint returns ${endpoint_http_code}. Skip checking upstream." fi if [ -n "${NODE_ID}" ] && [ -n "${GRID_NODE_ID}" ] && [ "${NODE_ID}" = "${GRID_NODE_ID}" ]; then echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is found in the Grid. Node is ready." exit 0 + elif [ -n "${NODE_ID}" ] && [ "${endpoint_http_code}" != "200" ]; then + echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} report its status, but could not double check ID in Hub. Assume that Node is ready." + exit 0 else echo "$(date -u +"${ts_format}") [${probe_name}] - Node ID: ${NODE_ID} is not found in the Grid. Node is not ready." exit 1