-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcheck_autorestart_nodes_tmux.sh
209 lines (174 loc) · 8.12 KB
/
check_autorestart_nodes_tmux.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/bash
# This script is used to check nodes' statuses and to autorestart if necessary
#
# Exit script immediately on error:
set -e

# Source the general node config file, which should be in the same folder as the current script.
# Every expansion is quoted so that a script location containing whitespace still resolves.
# NOTE(review): nodes_config.sh is presumably what provides USE_KEYS, RESTAPI_KEYS and the
# colour variables (RED, CYAN, NC, ...) used below -- none of them is defined in this file.
scripts_folder="$(dirname "$(realpath "$0")")"
source "$scripts_folder/nodes_config.sh"

# Define criteria to restart or even reinstall node(s)
SLEEP_SECS=20                            # check every ... seconds
MIN_ERD_NUM_CONNECTED_PEERS_T1_SECS=60   # check erd_num_connected_peers after T1 seconds
MIN_ERD_NUM_CONNECTED_PEERS_AFTER_T1=5   # minimum erd_num_connected_peers after T1 seconds
# The lines below are for later....
#MIN_ERD_NUM_CONNECTED_PEERS_T2_SECS=120 # check erd_num_connected_peers after T2 seconds
#MIN_ERD_NUM_CONNECTED_PEERS_AFTER_T2=10 # minimum erd_num_connected_peers after T2 seconds
#MIN_ERD_NUM_CONNECTED_PEERS_MAX_TRIES=3 # reinstall node after ... restart attempts

# Initialization
use_rest_api=0
# Space-separated list of the indices of USE_KEYS; it is iterated unquoted on purpose
# (word splitting turns it back into one word per index).
list_node_index="${!USE_KEYS[@]}"
list_node_length="${#USE_KEYS[@]}"
for i in $list_node_index; do
  if [[ "${RESTAPI_KEYS[i]^^}" == "YES" ]]; then
    use_rest_api=1
  fi
done

# If use of REST-API is not enabled, do not monitor nodes and exit monitoring script
if [[ "$use_rest_api" -eq 0 ]]; then
  printf "${RED}In nodes_config.sh, RESTAPI_KEYS was set to \'no\' for all $list_node_length nodes!${NC}\n"
  printf "${RED}Node monitoring is therefore not available for your nodes. Exiting monitoring script.${NC}\n"
  exit
fi

keypress=''

# Info message
printf "\n${CYAN}This monitoring script will check the nodes' statuses every $SLEEP_SECS seconds.${NC}"
printf "\n${CYAN}Press q to stop this script, press i for node uptime info.${NC}\n"

# Enable exiting script with a single keystroke
if [ -t 0 ]; then
  SAVED_STTY="$(stty --save)"
  # Put the terminal back on every exit path, not only the ones that go through
  # exit_script: without this, an abort caused by `set -e` would leave the user's
  # terminal in no-echo / non-canonical mode.
  trap 'stty "$SAVED_STTY"' EXIT
  # No echo, no line buffering, and reads return immediately even with no input.
  stty -echo -icanon -icrnl time 0 min 0
fi
# Define functions
exit_script() {
  # Leave the script, first handing the terminal back in the state that was
  # saved in SAVED_STTY. That state only exists when stdin is a terminal, so
  # in every other case there is nothing to restore.
  if ! test -t 0; then
    exit
  fi
  stty "$SAVED_STTY"
  exit
}
initialize_clock() {
  # (Re)start the uptime clock of one node.
  # Arguments: $1 - index of the node
  # Globals:   begin[] (written) - start time as seconds since the epoch
  #            diff[]  (written) - elapsed seconds, reset to 0
  local idx="$1"
  local started_at
  started_at="$(date +%s)"
  begin[idx]="$started_at"
  diff[idx]=0
}
check_node_process() {
  # Restart a node when no "node" process is found on its rest-api port.
  # Arguments: $1 - index of the node; its port is 8080 + index
  local idx="$1"
  local rest_api_port=$((8080 + idx))
  local listening_pids

  # A failing lsof must not end the script, so errexit is suspended around
  # the lookup and switched back on right after it.
  set +e
  listening_pids="$(sudo lsof -t -i:$rest_api_port -c node -a)"
  set -e

  if [[ -n "$listening_pids" ]]; then
    return 0
  fi
  restart "$idx" "cannot find node process on rest-api port $rest_api_port"
}
check_erd_num_connected_peers() {
  # Restart a node that still has too few connected peers once its grace
  # period is over.
  # Arguments: $1 - index of the node
  #            $2 - the node's current erd_num_connected_peers value
  # Globals:   diff[] (read) - seconds elapsed on the node's clock
  local idx="$1"
  local peer_count="$2"
  local reason="after at least $MIN_ERD_NUM_CONNECTED_PEERS_T1_SECS seconds, erd_num_connected_peers < $MIN_ERD_NUM_CONNECTED_PEERS_AFTER_T1"

  # Only judge the peer count after the node has had T1 seconds to connect.
  if [[ ${diff[idx]} -ge $MIN_ERD_NUM_CONNECTED_PEERS_T1_SECS ]]; then
    if [[ $peer_count -lt $MIN_ERD_NUM_CONNECTED_PEERS_AFTER_T1 ]]; then
      restart "$idx" "$reason"
    fi
  fi
}
restart() {
  # (Re)start one node inside its own detached tmux session and reset its clock.
  #
  # Arguments: $1 - index of the node (session suffix is index+1, port is 8080+index)
  #            $2 - human readable reason, printed in the restart message
  # Globals:   USE_KEYS, NODE_FOLDER_PREFIX, SESSION_PREFIX, list_node_length,
  #            RED, NC (read); default_node_folder[] (written)
  # Calls exit_script when the node folder does not exist.
  local node_index="$1"
  local message="$2"
  # These used to be assigned without `local` and leaked into the caller's scope.
  local suffix rest_api_port session_name
  local -a node_pids=()

  # The reason is passed as an argument so that a '%' in it cannot be taken
  # for a format directive.
  printf "${RED}Restarting node %s/%s: %s${NC}\n" \
    "$((node_index + 1))" "$list_node_length" "$message"

  # Default node folder for USE_KEYS[node_index]
  default_node_folder[node_index]="$NODE_FOLDER_PREFIX${USE_KEYS[node_index]}"
  if [[ ! -d "${default_node_folder[node_index]}" ]]; then
    printf "${RED}Cannot find default node folder: %s! Exiting script.${NC}\n" \
      "${default_node_folder[node_index]}"
    exit_script
  fi

  printf -v suffix '%02d' "$((node_index + 1))"
  rest_api_port=$((8080 + node_index))

  # Run node in virtual tmux session: $session_name.
  # The user can switch to this session with: tmux a -t $session_name
  # For a single node, this will be: tmux a -t node-01
  # To detach from that session again: <Ctrl+b>, followed by <d>
  session_name="$SESSION_PREFIX$suffix"

  # Compare against the bare session names as fixed, whole-line strings, so that
  # e.g. "node-01" is not considered present just because "node-010" exists.
  # stderr is dropped because `tmux ls` complains when no server is running yet,
  # which simply means the session does not exist.
  if tmux ls -F '#{session_name}' 2>/dev/null | grep -Fxq -- "$session_name"; then
    tmux send-keys -t "$session_name" C-c
    # lsof -t may print several PIDs, one per line; collect them as separate
    # words. A failing lsof just leaves the array empty.
    mapfile -t node_pids < <(sudo lsof -t -i:"$rest_api_port" -c node -a)
    if (( ${#node_pids[@]} > 0 )); then
      # The processes may already be gone because of the C-c above; a failing
      # kill must not abort the monitoring script under `set -e`.
      sudo kill "${node_pids[@]}" || true
    fi
    tmux kill-session -t "$session_name"
  fi
  tmux new-session -d -s "$session_name"

  # Use rest-api-port by default
  tmux send -t "$session_name" "cd ${default_node_folder[node_index]}" ENTER
  tmux send -t "$session_name" "./node --rest-api-interface localhost:$rest_api_port" ENTER

  # Initialize clock for the restarted node
  initialize_clock "$node_index"
}
show_info() {
  # Print, for every node whose RESTAPI_KEYS entry is "yes" (any letter case),
  # how long it has been running according to its clock in diff[].
  # Globals: list_node_index, list_node_length, RESTAPI_KEYS, diff, CYAN, NC (read)
  # Outputs: one blank line followed by one line per monitored node
  #
  # The loop variable and the intermediate value are local, so calling this
  # function no longer overwrites the caller's global `i`.
  local idx elapsed
  echo
  for idx in $list_node_index; do
    [[ "${RESTAPI_KEYS[idx]^^}" == "YES" ]] || continue
    elapsed=${diff[idx]:-0}
    printf "${CYAN}Node %d/%d has run for %d days, %02d:%02d:%02d\n${NC}" \
      "$((idx + 1))" "$list_node_length" \
      "$((elapsed / 86400))" "$((elapsed / 3600 % 24))" \
      "$((elapsed / 60 % 60))" "$((elapsed % 60))"
  done
}
# Initialize clocks for all nodes
for i in $list_node_index; do
  initialize_clock "$i"
done

# Start monitoring: print one status row per monitored node, wait SLEEP_SECS
# seconds while polling the keyboard, check the node processes, and repeat
# until q/Q is pressed.
while [[ "$keypress" != "q" && "$keypress" != "Q" ]]; do
  header_printed=0
  for i in $list_node_index; do
    if [[ "${RESTAPI_KEYS[i]^^}" != "YES" ]]; then
      printf "\n${RED}RESTAPI_KEYS for node %d/%d was set to \'no\'. Node monitoring not available for this node.${NC}" \
        "$((i + 1))" "$list_node_length"
      continue
    fi

    # Check if rest-api-port is open
    rest_api_port=$((8080 + i))
    # Don't exit the script if curl fails, this is an exception.
    # `|| true` is used instead of `set +e && ... && set -e`: an assignment returns
    # the status of its command substitution, so a failing curl used to skip the
    # trailing `set -e` and leave errexit switched off.
    node_status[i]="$(curl --silent "http://localhost:$rest_api_port/node/status")" || true
    # The JSON is handed to jq quoted, so it is not word-split or glob-expanded.
    status_json="${node_status[i]}"

    # No usable answer from the node: nothing to print for it this round.
    if [[ -z "$(jq '.details.erd_app_version' <<<"$status_json")" ]]; then
      continue
    fi

    # Only printf header once
    if (( header_printed == 0 )); then
      printf "\n${GREEN} Node ${NC}|${GREEN} Sync ${NC}|${GREEN} initNodes Pk ${NC}|"
      printf "${GREEN} Typ ${NC}|${GREEN} Node Display Name ${NC}|${GREEN} Shard ${NC}|"
      printf "${GREEN} ConP ${NC}|${GREEN} Synch Block Nonce ${NC}|${GREEN} Consensus Round${NC}"
      header_printed=1
    fi

    erd_is_syncing_str[i]="OK"
    erd_is_syncing[i]="$(jq '.details.erd_is_syncing' <<<"$status_json")"
    if (( erd_is_syncing[i] != 0 )); then
      erd_is_syncing_str[i]="!!"
    fi
    erd_node_display_name[i]="$(jq '.details.erd_node_display_name' <<<"$status_json" | tr -d '"')"
    erd_public_key_block_sign[i]="$(jq '.details.erd_public_key_block_sign' <<<"$status_json" | tr -d '"')"
    erd_shard_id[i]="$(jq '.details.erd_shard_id' <<<"$status_json")"
    # Shard ids above 1000000 are displayed as "meta".
    if (( erd_shard_id[i] > 1000000 )); then
      erd_shard_id[i]="meta"
    fi
    erd_node_type[i]="$(jq '.details.erd_node_type' <<<"$status_json" | tr -d '"')"
    erd_num_connected_peers[i]="$(jq '.details.erd_num_connected_peers' <<<"$status_json")"
    erd_nonce[i]="$(jq '.details.erd_nonce' <<<"$status_json")"
    erd_probable_highest_nonce[i]="$(jq '.details.erd_probable_highest_nonce' <<<"$status_json")"
    erd_synchronized_round[i]="$(jq '.details.erd_synchronized_round' <<<"$status_json")"
    erd_current_round[i]="$(jq '.details.erd_current_round' <<<"$status_json")"

    printf "\n%2s/%-2s | %2s | %-12s | %-3s | %-17s | %5s | %4s | %8s/%-8s | %8s/%-8s" \
      "$((i + 1))" "$list_node_length" "${erd_is_syncing_str[i]}" "${erd_public_key_block_sign[i]:0:12}" \
      "${erd_node_type[i]:0:3}" "${erd_node_display_name[i]:0:17}" "${erd_shard_id[i]}" \
      "${erd_num_connected_peers[i]}" "${erd_nonce[i]}" "${erd_probable_highest_nonce[i]}" \
      "${erd_synchronized_round[i]}" "${erd_current_round[i]}"

    check_erd_num_connected_peers "$i" "${erd_num_connected_peers[i]}"
  done
  echo

  # Wait SLEEP_SECS seconds, one second at a time, so that a key press is
  # noticed within about a second and the node clocks keep advancing.
  for ((count = 1; count <= SLEEP_SECS; count++)); do
    # Whatever was typed since the previous poll.
    keypress="$(cat -v)"
    sleep 1
    now=$(date +%s)
    for i in $list_node_index; do
      diff[i]=$((now - begin[i]))
    done
    if [[ "$keypress" == "q" || "$keypress" == "Q" ]]; then
      break
    fi
    if [[ "$keypress" == "i" || "$keypress" == "I" ]]; then
      show_info
    fi
  done

  # Check node process only after the first sleep
  for i in $list_node_index; do
    if [[ "${RESTAPI_KEYS[i]^^}" == "YES" ]]; then
      check_node_process "$i"
    fi
  done
done

# Message upon exit
show_info
exit_script