Skip to content

Commit

Permalink
[CLOUD-2261] handling txn recovery by querying api
Browse files Browse the repository at this point in the history
  • Loading branch information
ochaloup committed May 15, 2018
1 parent e91bb65 commit 5e9de06
Show file tree
Hide file tree
Showing 11 changed files with 417 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.idea/
.project

14 changes: 11 additions & 3 deletions os-eap-migration/added/launch/openshift-migrate-common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ function runMigration() {
local instanceDir=$1
local count=$2

export NODE_NAME="${NODE_NAME:-node}-${count}"
local countedNodeName="node-${count}"
export NODE_NAME="${NODE_NAME:-$countedNodeName}"
cp -f ${STANDALONE_XML_COPY} ${STANDALONE_XML}

source $JBOSS_HOME/bin/launch/configure.sh
Expand All @@ -29,8 +30,15 @@ function runMigration() {
local success=false
local message="Finished, migration pod has been terminated"
${JBOSS_HOME}/bin/readinessProbe.sh
local probeStatus=$?

if [ $? -eq 0 ] ; then
if [ $probeStatus -eq 0 ] && [ "$(type -t probePodLog)" = 'function' ]; then
# -- checking if server.log is clean from errors (only if function of the particular name exists)
probePodLog # calling function from partitionPV.sh
probeStatus=$?
fi

if [ $probeStatus -eq 0 ] ; then
echo "$(date): Server started, checking for transactions"
local startTime=$(date +'%s')
local endTime=$((startTime + ${RECOVERY_TIMEOUT} + 1))
Expand Down Expand Up @@ -66,7 +74,7 @@ function runMigration() {
if [ "${success}" = "true" ] ; then
message="Finished, recovery terminated successfully"
else
message="Finished, Recovery DID NOT complete, check log for details. Recovery will be reattempted."
message="Finished, Recovery DID NOT complete, check log for details. Recovery will be reattempted."
fi
fi

Expand Down
2 changes: 1 addition & 1 deletion os-eap-migration/module.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: EAP common migration scripts

modules:
install:
- name: os-partition
- name: os-eap-txn-partition

envs:
- name: "RECOVERY_TIMEOUT"
Expand Down
20 changes: 13 additions & 7 deletions os-eap-node-name/added/launch/openshift-node-name.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
# Resolves POD_NAME, preferring an explicitly provided value.
# Fallback order: POD_NAME env var -> container_uuid (set by docker)
# -> HOSTNAME (OpenShift sets the pod id as host name).
# TODO: should this fail when no pod name could be determined?
function init_pod_name() {
  if [ -z "${POD_NAME}" ]; then
    # docker sets up container_uuid
    POD_NAME="${container_uuid}"
  fi
  if [ -z "${POD_NAME}" ]; then
    # openshift sets up the node id as host name
    POD_NAME="${HOSTNAME}"
  fi
}

function init_node_name() {
if [ -z "${JBOSS_NODE_NAME}" ] ; then
if [ -n "${NODE_NAME}" ]; then
JBOSS_NODE_NAME="${NODE_NAME}"
elif [ -n "${container_uuid}" ]; then
JBOSS_NODE_NAME="${container_uuid}"
else
JBOSS_NODE_NAME="${HOSTNAME}"
fi
init_pod_name

JBOSS_NODE_NAME="${POD_NAME}"

# CLOUD-427: truncate to 23 characters max (from the end backwards)
if [ ${#JBOSS_NODE_NAME} -gt 23 ]; then
Expand Down
2 changes: 1 addition & 1 deletion os-eap-probes/added/readinessProbe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ LOG=/tmp/readiness-log

COUNT=30
SLEEP=5
DEBUG=false
DEBUG=${SCRIPT_DEBUG:-false}
PROBE_IMPL=probe.eap.dmr.EapProbe

if [ $# -gt 0 ] ; then
Expand Down
202 changes: 202 additions & 0 deletions os-eap-txn-partition/added/partitionPV.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# Provides init_pod_name (sets POD_NAME) used by the functions in this file.
source ${JBOSS_HOME}/bin/launch/openshift-node-name.sh
# When script debugging is enabled, forward '-l debug' to every query.py API call.
[ "${SCRIPT_DEBUG}" = "true" ] && DEBUG_QUERY_API_PARAM="-l debug"

# parameters
# - needle to search in array
# - array passed as: "${ARRAY_VAR[@]}"
# Tests whether a value is a member of a list.
# Parameters:
#   $1   - value to search for
#   $2.. - list elements (pass an array as: "${ARRAY_VAR[@]}")
# Returns 0 when the value is found, 1 otherwise.
function arrContains() {
  local needle="$1"
  shift
  local candidate
  for candidate in "$@"; do
    if [[ "$candidate" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}

# parameters
# - base directory
# Guards an application pod's startup against a transaction-recovery
# ("migration") pod that may still be cleaning this pod's data directory,
# then starts the server on the per-pod directory of the shared volume.
# Parameters:
#   $1 - base directory holding one subdirectory per application pod
# Globals: POD_NAME (set via init_pod_name), SERVER_DATA_DIR (exported to runServer)
# Never returns: exits the shell with the server's exit status (255 remapped to 254).
function partitionPV() {
  local podsDir="$1"
  local applicationPodDir

  mkdir -p "${podsDir}"

  init_pod_name
  # NOTE(review): 'local applicationPodDir' is declared a second time here;
  # harmless in bash but the re-declaration above is redundant.
  local applicationPodDir="${podsDir}/${POD_NAME}"

  # NOTE(review): waitCounter is never incremented or read below — confirm intent.
  local waitCounter=0
  # 2) while any file matching, sleep
  while true; do
    local isRecoveryInProgress=false
    # is there an existing RECOVERY descriptor that means a recovery is in progress
    # (a marker file named <POD_NAME>-RECOVERY-<recoveryPodName> created by migratePV)
    find "${podsDir}" -maxdepth 1 -type f -name "${POD_NAME}-RECOVERY-*" 2>/dev/null | grep -q .
    [ $? -eq 0 ] && isRecoveryInProgress=true

    # we are free to start the app container
    if ! $isRecoveryInProgress; then
      break
    fi

    if $isRecoveryInProgress; then
      echo "Waiting to start pod ${POD_NAME} as recovery process '$(echo ${podsDir}/${POD_NAME}-RECOVERY-*)' is currently cleaning data directory."
    fi

    sleep 1
    echo "`date`: waiting for recovery process to clean the environment for the pod to start"
  done

  # 3) create /pods/<applicationPodName>
  SERVER_DATA_DIR="${applicationPodDir}/serverData"
  mkdir -p "${SERVER_DATA_DIR}"

  # one-time data-dir initialization, tracked by a sibling marker file
  if [ ! -f "${SERVER_DATA_DIR}/../data_initialized" ]; then
    init_data_dir ${SERVER_DATA_DIR}
    touch "${SERVER_DATA_DIR}/../data_initialized"
  fi

  # 4) launch EAP with node name as pod name
  NODE_NAME="${POD_NAME}" runServer "${SERVER_DATA_DIR}" &

  PID=$!

  # forward TERM to the server process so the pod shuts down cleanly
  trap "echo Received TERM of pid ${PID} of pod name ${POD_NAME}; kill -TERM $PID" TERM

  wait $PID 2>/dev/null
  STATUS=$?
  trap - TERM
  # second wait reaps the child if the first wait was interrupted by the TERM trap
  wait $PID 2>/dev/null

  echo "Server terminated with status $STATUS ($(kill -l $STATUS 2>/dev/null))"

  # 255 is reserved (e.g. by ssh/exec conventions); remap to 254
  if [ "$STATUS" -eq 255 ] ; then
    echo "Server returned 255, changing to 254"
    STATUS=254
  fi

  exit $STATUS
}


# parameters
# - base directory
# - migration pause between cycles
# Recovery-pod main loop: periodically scans the shared volume for data
# directories of dead application pods, runs transaction recovery on each,
# and garbage-collects markers left behind by failed recovery pods.
# Parameters:
#   $1 - base directory holding one subdirectory per application pod
#   $2 - pause (seconds) between scan cycles, default 30
# Globals: POD_NAME (via init_pod_name), STATUS/PID/SERVER_DATA_DIR (working vars),
#          MIGRATION_PAUSE, MIGRATED. Liveness is queried via the OpenShift API
#          (queryapi/query.py). Runs forever; never returns.
function migratePV() {
  local podsDir="$1"
  local applicationPodDir
  MIGRATION_PAUSE="${2:-30}"
  # NOTE(review): MIGRATED is initialized but never read or updated here — confirm intent.
  MIGRATED=false

  init_pod_name
  local recoveryPodName="${POD_NAME}"

  while true ; do

    # 1) Periodically, for each /pods/<applicationPodName>
    for applicationPodDir in "${podsDir}"/*; do
      # check if the found file is type of directory, if not directory move to the next item
      [ ! -d "$applicationPodDir" ] && continue

      # 1.a) create /pods/<applicationPodName>-RECOVERY-<recoveryPodName>
      # (marker tells partitionPV to hold the app pod's start while we work)
      local applicationPodName="$(basename ${applicationPodDir})"
      touch "${podsDir}/${applicationPodName}-RECOVERY-${recoveryPodName}"
      STATUS=42 # expecting there could be error on getting living pods

      # 1.a.i) if <applicationPodName> is not in the cluster
      echo "examining existence of living pod for directory: '${applicationPodDir}'"
      unset LIVING_PODS
      LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM}))
      # on API failure skip this pod; the marker stays in place (STATUS is 42, see 1.b)
      [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue
      STATUS=-1 # here we have data about living pods and the recovery marker can be removed if the pod is living
      if ! arrContains ${applicationPodName} "${LIVING_PODS[@]}"; then

        (
          # 1.a.ii) run recovery until empty (including orphan checks and empty object store hierarchy deletion)
          SERVER_DATA_DIR="${applicationPodDir}/serverData"
          JBOSS_NODE_NAME="$applicationPodName" runMigration "${SERVER_DATA_DIR}" &

          PID=$!

          trap "echo Received TERM ; kill -TERM $PID" TERM

          wait $PID 2>/dev/null
          STATUS=$?
          trap - TERM
          # re-wait to reap the child if the first wait was interrupted by TERM
          wait $PID 2>/dev/null

          echo "Migration terminated with status $STATUS ($(kill -l $STATUS))"

          if [ "$STATUS" -eq 255 ] ; then
            echo "Server returned 255, changing to 254"
            STATUS=254
          fi
          # subshell exit code carries the migration result to the outer wait
          exit $STATUS
        ) &

        PID=$!

        trap "kill -TERM $PID" TERM

        wait $PID 2>/dev/null
        STATUS=$?
        trap - TERM
        wait $PID 2>/dev/null

        if [ $STATUS -eq 0 ]; then
          # 1.a.iii) Delete /pods/<applicationPodName> when recovery was succesful
          echo "`date`: Migration succesfully finished for application directory ${applicationPodDir} thus removing it by recovery pod ${recoveryPodName}"
          rm -rf "${applicationPodDir}"
        fi
      fi

      # 1.b.) Deleting the recovery marker
      if [ $STATUS -eq 0 ] || [ $STATUS -eq -1 ]; then
        # STATUS is 0: we are free from in-doubt transactions, -1: there is a running pod of the same name (do the recovery on his own if needed)
        rm -f "${podsDir}/${applicationPodName}-RECOVERY-${recoveryPodName}"
      fi

      # 2) Periodically, for files /pods/<applicationPodName>-RECOVERY-<recoveryPodName>, for failed recovery pods
      # NOTE(review): this garbage-collection loop is nested inside the per-pod
      # loop above, so it runs once per application pod directory rather than
      # once per cycle — presumably intended at cycle level; confirm.
      for recoveryPodFilePathToCheck in "${podsDir}/"*-RECOVERY-*; do
        local recoveryPodFileToCheck="$(basename ${recoveryPodFilePathToCheck})"
        # strip everything up to and including 'RECOVERY-' to get the owner pod name
        local recoveryPodNameToCheck=${recoveryPodFileToCheck#*RECOVERY-}

        unset LIVING_PODS
        LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM}))
        [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue

        if ! arrContains ${recoveryPodNameToCheck} "${LIVING_PODS[@]}"; then
          # recovery pod is dead, garbage collecting
          rm -f "${recoveryPodFilePathToCheck}"
        fi
      done

    done

    echo "`date`: Finished Migration Check cycle, pausing for ${MIGRATION_PAUSE} seconds before resuming"
    sleep "${MIGRATION_PAUSE}"
  done
}

# parameters
# - pod name (optional)
# Fetches a pod's server log through the OpenShift API (queryapi/query.py)
# and checks it for 'ERROR' entries.
# Parameters:
#   $1 - pod name to probe (optional; defaults to POD_NAME via init_pod_name)
# Returns:
#   0 when the log was retrieved and contains no 'ERROR' string,
#   1 when the API call failed or the log contains errors
function probePodLog() {
  init_pod_name
  local podNameToProbe=${1:-$POD_NAME}

  # Declaration split from assignment: 'local v=$(cmd)' returns the status of
  # 'local' (always 0), which previously masked a failing query.py call.
  local logOutput
  logOutput=$(python ${JBOSS_HOME}/bin/queryapi/query.py -q log ${podNameToProbe})
  local probeStatus=$?

  if [ $probeStatus -ne 0 ]; then
    echo "Cannot contact OpenShift API to get log for pod ${podNameToProbe}"
    return 1
  fi

  # Search the whole log verbatim. The previous 'printf $logOutput' word-split
  # the log and used it as the printf FORMAT string, so only the first
  # whitespace-separated token was ever inspected; -q keeps matches off stdout.
  if printf '%s\n' "$logOutput" | grep -q 'ERROR'; then # ERROR string was found in the log output
    echo "Server at ${NAMESPACE}/${podNameToProbe} started with errors"
    return 1
  fi

  return 0
}
Loading

0 comments on commit 5e9de06

Please sign in to comment.