forked from shawn-sterling/graphios
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraphios.py
executable file
·495 lines (418 loc) · 16.2 KB
/
graphios.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#!/usr/bin/python -tt
#
# Copyright (C) 2011 Shawn Sterling <shawn@systemtemplar.org>
#
# With contributions from:
#
# Juan Jose Presa <juanjop@gmail.com>
# Ranjib Dey <dey.ranjib@gmail.com>
# Ryan Davis <https://github.com/ryepup>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# graphios: this program will read nagios host and service perfdata, and
# send it to a carbon server.
#
# The latest version of this code will be found on my github page:
# https://github.com/shawn-sterling
import os
import sys
import re
import logging
import logging.handlers
import time
import socket
import cPickle as pickle
import struct
from optparse import OptionParser
############################################################
##### You will likely need to change some of the below #####
# Address of the carbon server that connect_carbon() connects to.
carbon_server = '10.10.101.54'
# Carbon pickle receiver port (normally 2004).
carbon_port = 2004
# Default nagios spool directory; can be overridden with --spool-directory.
spool_directory = '/var/spool/nagios/graphios'
# graphios log settings. log_file is the default for --log-file.
log_file = '/var/log/nagios/graphios.log'
# Passed as maxBytes to the rotating log handler set up in configure().
log_max_size = 25165824 # 24 MB
# Log level used unless -v/--verbose is given on the command line.
log_level = logging.INFO
#log_level = logging.DEBUG # DEBUG is quite verbose
# Seconds to sleep between two passes over the spool directory.
sleep_time = 15
# While carbon can't be reached, send_carbon() doubles sleep_time until it
# reaches sleep_max.
sleep_max = 480
# In test mode the metrics that would be sent are only written to the log
# at DEBUG level, nothing is sent to carbon, and no files are deleted from
# the spool directory. log_level must be DEBUG to see that output.
test_mode = False
##### You should stop changing things unless you know what you are doing #####
##############################################################################
# Command line interface. The defaults of --spool-directory and --log-file
# are the spool_directory and log_file settings defined above.
parser = OptionParser("""usage: %prog [options]
sends nagios performance data to carbon.
""")
parser.add_option('-v', "--verbose", action="store_true", dest="verbose",
help="sets logging to DEBUG level")
parser.add_option("--spool-directory", dest="spool_directory",
default=spool_directory,
help="where to look for nagios performance data")
parser.add_option("--log-file", dest="log_file",
default=log_file,
help="file to log to")
# Module-level socket used to talk to carbon. connect_carbon() rebinds this
# name to a freshly created socket before connecting.
sock = socket.socket()
# Logger shared by all functions in this file; its handler and level are
# set in configure().
log = logging.getLogger('log')
def configure(opts):
    """
    Applies the parsed command line options.

    Attaches a rotating file handler that writes to opts.log_file to the
    module logger, sets the logger to DEBUG when opts.verbose is set
    (otherwise to the configured log_level), and stores
    opts.spool_directory in the module-level spool_directory.
    """
    global spool_directory
    formatter = logging.Formatter(
        "%(asctime)s %(filename)s %(levelname)s %(message)s",
        "%B %d %H:%M:%S")
    handler = logging.handlers.RotatingFileHandler(
        opts.log_file, maxBytes=log_max_size, backupCount=4)
    handler.setFormatter(formatter)
    log.addHandler(handler)
    log.setLevel(logging.DEBUG if opts.verbose else log_level)
    spool_directory = opts.spool_directory
def connect_carbon():
    """
    Opens a new TCP connection to the carbon server.

    The module-level ``sock`` is closed and replaced by a new socket which
    is then connected to carbon_server:carbon_port.

    Returns True when the connection was established. On failure the
    error is logged as a warning, the new socket is closed again, and
    False is returned.
    """
    global sock
    # Release the socket we are about to replace; closing an already
    # closed socket is harmless.
    sock.close()
    sock = socket.socket()
    try:
        sock.connect((carbon_server, carbon_port))
    except Exception as e:
        log.warning("Can't connect to carbon: %s:%s %s" % (carbon_server,
                    carbon_port, e))
        # Don't keep a file descriptor around for a socket that never
        # connected.
        sock.close()
        return False
    return True
def send_carbon(carbon_list):
    """
    Sends a list of carbon metric lines to carbon as one pickle message.

    carbon_list is a list of "path value timestamp" strings; it is
    converted with convert_pickle() and written to the module socket.

    Returns True when the message was handed to the socket.

    If sending fails, the error is logged and we block until carbon can
    be reached again: after every failed reconnect we sleep sleep_time
    seconds, doubling sleep_time (never beyond sleep_max) each time.
    Once reconnected, sleep_time is reset to 15 and False is returned.
    The failed message is NOT resent, so the caller must keep its data.
    """
    global sleep_time
    message = convert_pickle(carbon_list)
    try:
        sock.sendall(message)
    except Exception as e:
        log.critical("Can't send message to carbon error:%s" % (e))
    else:
        # Log the readable metric lines rather than the pickled blob.
        log.debug("sending to carbon: %s" % carbon_list)
        return True
    while True:
        sock.close()
        if connect_carbon():
            # 15 is the shipped default of the sleep_time setting.
            sleep_time = 15
            return False
        if sleep_time < sleep_max:
            # Back off exponentially, but never sleep longer than sleep_max.
            sleep_time = min(sleep_time * 2, sleep_max)
            log.warning("Carbon not responding. Increasing "
                        "sleep_time to %s." % (sleep_time))
        else:
            log.warning("Carbon not responding. Sleeping %s" % (sleep_time))
        log.debug("sleeping %s" % (sleep_time))
        time.sleep(sleep_time)
def convert_pickle(carbon_list):
    """
    Converts a list of carbon metric lines into a carbon pickle message.

    Every entry of carbon_list is expected to be a whitespace separated
    "path value timestamp" string. Entries that do not consist of exactly
    three fields are logged and skipped, so one bad line cannot abort the
    whole batch.

    Returns the message to put on the wire: a 4 byte big-endian length
    header followed by the pickled list of (path, (timestamp, value))
    tuples.
    """
    pickle_list = []
    for metric in carbon_list:
        fields = metric.split()
        if len(fields) != 3:
            log.warning("skipping malformed metric: %r" % (metric,))
            continue
        path, value, timestamp = fields
        pickle_list.append((path, (timestamp, value)))
    # Protocol 2 is what the carbon documentation uses; it is readable by
    # every carbon version regardless of which python pickled the data.
    payload = pickle.dumps(pickle_list, protocol=2)
    header = struct.pack("!L", len(payload))
    return header + payload
def process_host_data(file_name, delete_after=0):
    """
    processes a file loaded with nagios host data, and sends info to
    a carbon server. If delete_after is 1, we delete the file when we
    are done with it.

    Every line of the file is a tab separated list of NAME::value
    fields. We use TIMET, HOSTNAME, HOSTPERFDATA, GRAPHITEPREFIX and
    GRAPHITEPOSTFIX; fields without a '::' are ignored. A prefix or
    postfix that still holds the unexpanded nagios macro
    ($_HOSTGRAPHITEPREFIX$ / $_HOSTGRAPHITEPOSTFIX$) counts as not set.

    When nagios has host perf data, that is from check_host_alive, which
    is the check_icmp plugin. The perf data looks like this:
    rta=1.066ms;5.000;10.000;0; pl=0%;5;10;; rtmax=4.368ms;;;; rtmin=0.196ms;;;;

    We send to graphite:
    (GRAPHITEPREFIX).HOSTNAME.(GRAPHITEPOSTFIX).perf value TIMET

    Which for me I set the prefix to:
    monitoring.domain.com.nagiosXX.pingto.
    I leave the postfix blank (for this example)

    So if i'm checking db01, from nagios host nagios01 my carbon lines
    would be:
    monitoring.domain.com.nagios01.pingto.db01.rta 1.066 timet
    monitoring.domain.com.nagios01.pingto.db01.pl 0 timet
    monitoring.domain.com.nagios01.pingto.db01.rtmax 4.368 timet
    monitoring.domain.com.nagios01.pingto.db01.rtmin 0.196 timet

    If the file can't be read the error is logged and we exit with
    status 2.
    """
    try:
        with open(file_name, "r") as f:
            file_array = f.readlines()
    except Exception as e:
        log.critical("Can't open file:%s error: %s" % (file_name, e))
        sys.exit(2)
    graphite_lines = []
    for line in file_array:
        host_name = ""
        timet = ""
        host_perf_data = ""
        graphite_postfix = ""
        graphite_prefix = ""
        # Drop the line terminator so it can't end up in the last field.
        for var in line.rstrip("\r\n").split('\t'):
            # Split on the first '::' only, the value itself may hold more.
            var_name, separator, value = var.partition('::')
            if not separator:
                continue
            if var_name == 'TIMET':
                timet = value
            elif var_name == 'HOSTNAME':
                host_name = value
            elif var_name == 'HOSTPERFDATA':
                host_perf_data = value
            elif var_name == 'GRAPHITEPOSTFIX':
                value = re.sub(r"\s", "", value)
                if value != "$_HOSTGRAPHITEPOSTFIX$":
                    graphite_postfix = value
            elif var_name == 'GRAPHITEPREFIX':
                value = re.sub(r"\s", "", value)
                if value != "$_HOSTGRAPHITEPREFIX$":
                    graphite_prefix = value
        if host_perf_data == "":
            continue
        carbon_string = build_carbon_metric(
            graphite_prefix, host_name, graphite_postfix)
        if carbon_string:
            graphite_lines.extend(process_host_perf_data(
                carbon_string, host_perf_data, timet))
    handle_file(file_name, graphite_lines, test_mode, delete_after)
def handle_file(file_name, graphite_lines, test_mode, delete_after):
    """
    Decides what happens to a processed spool file.

    In test mode the graphite lines (if any) are only logged at debug
    level; nothing is sent and nothing is removed.

    Otherwise the lines are sent to carbon. The file is removed only
    when delete_after is set and either the lines were sent successfully
    or there were no lines at all. If sending fails, a warning is logged
    and the file is kept. A failing removal is logged, never raised.
    """
    if test_mode:
        if graphite_lines:
            log.debug("graphite_lines:%s" % (graphite_lines))
        return
    if graphite_lines:
        if not send_carbon(graphite_lines):
            log.warning("message not sent to graphite, file not deleted.")
            return
        if delete_after:
            log.debug("removing file, %s" % (file_name))
    # From here on the file is either delivered or held no graphite data.
    if not delete_after:
        return
    try:
        os.remove(file_name)
    except Exception as e:
        log.critical("couldn't remove file %s error:%s" % (
            file_name, e))
def process_host_perf_data(carbon_string, perf_data, time):
    """
    given the nagios perfdata, and some variables we return a list of
    carbon formatted values. carbon_string should already have a trailing .

    perf_data is a whitespace separated list of perf strings; each one is
    run through process_perf_string(). Entries that yield no label or no
    value are skipped, because a line built from them could not be
    parsed back into "path value timestamp".
    """
    graphite_lines = []
    for perf in perf_data.split():
        (name, value) = process_perf_string(perf)
        if not name or not value:
            log.debug("skipping perf string without usable data: %r" % (
                perf,))
            continue
        new_line = "%s%s %s %s" % (carbon_string, name, value, time)
        log.debug("new line = %s" % (new_line))
        graphite_lines.append(new_line)
    return graphite_lines
def process_service_data(file_name, delete_after=0):
    """
    processes a file loaded with nagios service data, and sends info to
    a carbon server. If delete_after is 1, we delete the file when we
    are done with it.

    Every line of the file is a tab separated list of NAME::value
    fields. We use TIMET, HOSTNAME, SERVICEPERFDATA, GRAPHITEPREFIX and
    GRAPHITEPOSTFIX; SERVICECHECKCOMMAND and fields without a '::' are
    ignored. Any '/' in the perfdata is replaced by '_'. A prefix or
    postfix that still holds the unexpanded nagios macro
    ($_SERVICEGRAPHITEPREFIX$ / $_SERVICEGRAPHITEPOSTFIX$) counts as not
    set.

    here is what we send to carbon:
    (GRAPHITEPREFIX).HOSTNAME.(GRAPHITEPOSTFIX).perf value timet

    So, our example service will be
    'MySQL Connection Time' where the perfdata looks like this:
    connection_time=0.0213s;1;5

    Let's say this is checked on host db01, and it is run from nagios01.
    We set our graphiteprefix to be:
    monitoring.domain.com.nagios01.mysql
    the graphitepostfix in this case to be left blank

    Giving us a final carbon metric of:
    monitoring.domain.com.nagios01.mysql.db01.connection_time 0.0213 timet

    Or let's say you have a plugin that gives the perf 'load=3.4;5;6;;'
    In this case I want my carbon data to be:
    hostname.domain.com.nagios.load
    So I set the _graphitepostfix to 'domain.com.nagios'

    If the file can't be read the error is logged and we exit with
    status 2.
    """
    try:
        with open(file_name, "r") as f:
            file_array = f.readlines()
    except Exception as e:
        log.critical("Can't open file:%s error: %s" % (file_name, e))
        sys.exit(2)
    graphite_lines = []
    for line in file_array:
        host_name = ""
        timet = ""
        service_perf_data = ""
        graphite_postfix = ""
        graphite_prefix = ""
        # Drop the line terminator so it can't end up in the last field.
        for var in line.rstrip("\r\n").split('\t'):
            # Split on the first '::' only, the value itself may hold more.
            var_name, separator, value = var.partition('::')
            if not separator or var_name == 'SERVICECHECKCOMMAND':
                continue
            if var_name == 'TIMET':
                timet = value
            elif var_name == 'HOSTNAME':
                host_name = value
            elif var_name == 'SERVICEPERFDATA':
                service_perf_data = value.replace('/', '_')
            elif var_name == 'GRAPHITEPOSTFIX':
                value = re.sub(r"\s", "", value)
                if value != "$_SERVICEGRAPHITEPOSTFIX$":
                    graphite_postfix = value
            elif var_name == 'GRAPHITEPREFIX':
                value = re.sub(r"\s", "", value)
                if value != "$_SERVICEGRAPHITEPREFIX$":
                    graphite_prefix = value
        if "=" not in service_perf_data:
            # no perfdata to parse, so we're done
            continue
        carbon_string = build_carbon_metric(
            graphite_prefix, host_name, graphite_postfix)
        if carbon_string:
            graphite_lines.extend(process_service_perf_data(
                carbon_string, service_perf_data, timet))
    handle_file(file_name, graphite_lines, test_mode, delete_after)
def build_carbon_metric(graphite_prefix, host_name, graphite_postfix):
    """
    builds the metric path prefix to send to carbon, returns empty string
    if there's insufficient data and we shouldn't forward to carbon.

    The result is "(prefix.)host.(postfix.)" including the trailing dot,
    with every '.' of the host name replaced by '_'. An empty string is
    returned when both prefix and postfix are empty, or when the host
    name is empty.
    """
    if graphite_prefix == "" and graphite_postfix == "":
        # uncomment below if you are troubleshooting a weird plugin.
        # log.debug("can't find graphite prefix or postfix for host %s" % (
        #     host_name))
        return ""
    if host_name == "":
        # Only names that exist in this scope may be logged here.
        log.debug("can't find hostname for prefix '%s' postfix '%s'" % (
            graphite_prefix, graphite_postfix))
        return ""
    parts = []
    if graphite_prefix != "":
        parts.append(graphite_prefix)
    parts.append(host_name.replace('.', '_'))
    if graphite_postfix != "":
        parts.append(graphite_postfix)
    return ".".join(parts) + "."
def process_perf_string(nagios_perf_string):
    """
    given a single nagios perf string, returns a (label, value) tuple.

    Expected values:
    label=value[UOM];[warn];[crit];[min];[max]

    We want to scrape the label=value and get rid of everything else.

    UOM can be: s, %, B(kb,mb,tb), c

    graphios assumes that you have modified your plugin to always use
    the same value. If your plugin does not support this, you can use
    check_mp to force your units to be consistent. Graphios plain
    ignores the UOM.

    The value is the leading number of the value field, so an exponent
    such as 1.5e-05 is kept while the unit behind it is dropped. If the
    field does not start with a number (e.g. 'U'), value is ''.

    If the part before the first ';' does not contain exactly one '=',
    an error is logged and (None, None) is returned.
    """
    # Everything after the first ';' is thresholds and limits we ignore.
    label_value = nagios_perf_string.strip().split(';', 1)[0]
    parts = label_value.split('=')
    if len(parts) != 2:
        log.error('Bad perf string %s', nagios_perf_string)
        return (None, None)
    name, raw_value = parts
    number = re.match(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?',
                      raw_value.strip())
    if number:
        value = number.group(0)
    else:
        value = ''
    return name, value
def process_service_perf_data(carbon_string, perf_data, time):
    """
    given the nagios perfdata, and some variables we return a list of
    carbon formatted strings.

    If perf_data holds exactly one '=' it is treated as a single perf
    statement (so its label may contain spaces), otherwise it is split
    on whitespace into several statements. Each statement is run through
    process_perf_string(). Surrounding single quotes are removed from
    the label and whitespace inside it is replaced by '_', because a
    carbon metric path can't contain spaces. Statements that yield no
    label or no value are skipped.
    """
    # find out if this is 1 perf statement or many, by counting how many '='
    if perf_data.count('=') == 1:
        perf_list = [perf_data]
    else:
        perf_list = perf_data.split()
    graphite_lines = []
    for perf in perf_list:
        (name, value) = process_perf_string(perf)
        if name:
            name = "_".join(name.strip("'").split())
        if not name or not value:
            log.debug("skipping perf string without usable data: %r" % (
                perf,))
            continue
        new_line = "%s%s %s %s" % (carbon_string, name, value, time)
        log.debug("new line = %s" % (new_line))
        graphite_lines.append(new_line)
    return graphite_lines
def process_spool_dir(directory):
    """
    Walks the spool directory once and hands every perfdata file to the
    matching processor.

    Files named host-perfdata.<something> go to process_host_data and
    files named service-perfdata.<something> go to process_service_data,
    both with delete_after set to 1. The bare names 'host-perfdata' and
    'service-perfdata' and any other file are left alone.
    """
    for entry in os.listdir(directory):
        if entry in ("host-perfdata", "service-perfdata"):
            continue
        full_path = os.path.join(directory, entry)
        if entry.startswith('host-perfdata.'):
            process_host_data(full_path, 1)
        elif entry.startswith('service-perfdata.'):
            process_service_data(full_path, 1)
def main():
    """
    the main loop: connects to carbon, then processes the spool
    directory every sleep_time seconds until ctrl-c is pressed.

    The carbon socket is closed when the loop ends, whether through
    ctrl-c or an unexpected exception.
    """
    log.info("graphios startup.")
    try:
        # A failed first connect is fine: the first send will fail and
        # send_carbon() then keeps retrying.
        connect_carbon()
        while True:
            process_spool_dir(spool_directory)
            time.sleep(sleep_time)
    except KeyboardInterrupt:
        log.info("ctrl-c pressed. Exiting graphios.")
    finally:
        sock.close()
if __name__ == "__main__":
    # Parse the command line, set up logging and the spool directory,
    # then enter the processing loop.
    options, args = parser.parse_args()
    configure(options)
    main()