handlejob.sh
#!/bin/bash
set -e
#set -x
kubectl delete -f job.yaml || true
kubectl apply -f job.yaml
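# Possible hardening (untested sketch): the delete above could be made
# idempotent and block until the old Job is fully removed, avoiding a name
# clash with a Job that is still terminating:
#   kubectl delete -f job.yaml --ignore-not-found --wait=true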
namespace=testjob
jobName=testjob
timeoutMinuteStartContainer=30 # minutes
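# The name/namespace above are assumed to match what job.yaml declares; a
# possible sketch to derive them from the applied object instead of hardcoding:
#   jobName=$(kubectl get -f job.yaml -o jsonpath='{.metadata.name}')
#   namespace=$(kubectl get -f job.yaml -o jsonpath='{.metadata.namespace}')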
# this clunky implementation exists because '--pod-running-timeout' is not working...
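# If '--pod-running-timeout' behaved as documented, a one-liner sketch like
# this might replace the polling loop below (untested assumption):
#   kubectl -n ${namespace} logs --follow --pod-running-timeout=30m job/${jobName}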
[[ "${jobName}" == "" ]] && exit 0
function yellAndExit1(){
  echo " Namespace ${namespace} will not be deleted to allow debugging"
  echo " Attempting to get as much info as possible before exiting 1"
  set -x
  kubectl -n ${namespace} get job/${jobName} -o yaml || true
  kubectl -n ${namespace} describe job/${jobName} || true
  kubectl -n ${namespace} describe pod -l "job-name=${jobName}" || true
  exit 1
}
# unique identifier for that Job, so we can query resources for that specific Job without worrying about any previous run
jobUid=$(kubectl -n ${namespace} get job/${jobName} -o jsonpath='{.spec.selector.matchLabels.controller-uid}')
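# Note (assumption): on Kubernetes >= 1.27 Job Pods also carry the prefixed
# label 'batch.kubernetes.io/controller-uid'; the legacy 'controller-uid'
# label queried below should still be present for compatibility.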
echo " Keep trying to get logs until backoffLimit has been reached (or Job succeed)"
while true; do
  echo " Wait for the most recently created Pod to not be 'Pending' so logs can be fetched without errors"
  finaldate=$(date -d "+${timeoutMinuteStartContainer} minutes" +%s)
  ready=false
  while [[ ${ready} != "true" ]]; do
    # compare epoch seconds so the timeout also works across day/year boundaries
    if (( $(date +%s) > finaldate )); then
      echo " Err: Timeout waiting for pod to start"
      yellAndExit1
    fi
    echo "... waiting"
    sleep 1
    # check if the Job is finally active, or maybe already done
    jobPodPhases=$(kubectl -n ${namespace} get pod -l controller-uid=${jobUid} --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1:].status.phase}')
    # a Pod phase can be 'Pending', 'Running', 'Failed' or 'Succeeded'; an empty value means no Pod exists yet
    [[ "${jobPodPhases}" != "" && "${jobPodPhases}" != "Pending" ]] && ready=true
  done
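  # A possible alternative on kubectl >= 1.23 (assumption): wait on a jsonpath
  # expression instead of hand-polling, though it could miss a Pod that jumps
  # straight from 'Pending' to 'Succeeded' or 'Failed':
  #   kubectl -n ${namespace} wait --for=jsonpath='{.status.phase}'=Running \
  #     --timeout=${timeoutMinuteStartContainer}m pod -l controller-uid=${jobUid}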
echo " Job is either 'Running' 'Failed' 'Succeeded'"
echo " Attempt to fetch logs"
echo "-----------------------------"
echo ""
kubectl -n ${namespace} logs --timestamps=true --follow pod/"$(kubectl -n ${namespace} get pod -l controller-uid=${jobUid} --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1:].metadata.name}')" || true
echo ""
echo "-----------------------------"
echo " Job replica is done, checking Job status"
# not elegant but the safest way to get the overall Job status as .failed and conditions start to get tricky
# to look into as long as more than backofflimit is not 0
# give 5s to Kubernetes to have time to update the job status
#kubectl -n ${namespace} wait --for=condition=complete --timeout=60s job/${jobName} && complete=true || true
kubectl -n ${namespace} wait --for=condition=complete --timeout=1s job/${jobName} 2> /dev/null && complete=true || true
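  # A possible alternative (sketch): read the condition straight from the Job
  # status instead of using a 1s 'kubectl wait' as a one-shot poll:
  #   kubectl -n ${namespace} get job/${jobName} \
  #     -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}'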
if [[ "${complete}" == "true" ]]; then
echo " Job final state is 'complete', it ended with sucess"
exit 0
else
if [[ $(kubectl -n ${namespace} get job/${jobName} -o jsonpath='{.spec.backoffLimit}') != $(kubectl -n ${namespace} get job/${jobName} -o jsonpath='{.status.failed}') ]]; then
echo " Job replica failed, loop to wait for the next replica"
else
echo " Job has reach its backoffLimit and its final state is not 'complete', it ended with failures"
yellAndExit1
fi
fi
done