-
Notifications
You must be signed in to change notification settings - Fork 813
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This adds a new agent check for supervisord. The check reports whether supervisord is running or not and whether one or more processes are running or not. It also reports processes' uptime. The check works over the HTTP inet server and over unix sockets. See supervisord.yaml.example for more details.
- Loading branch information
Showing
4 changed files
with
630 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
from collections import defaultdict | ||
import errno | ||
import socket | ||
import time | ||
import xmlrpclib | ||
|
||
from checks import AgentCheck | ||
|
||
import supervisor.xmlrpc | ||
|
||
# Connection defaults applied when the instance configuration omits a value.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '9001'
DEFAULT_SOCKET_IP = 'http://127.0.0.1'

# Maps a supervisord process state name to the agent service-check status
# reported for it. Entries are grouped by the resulting status.
DD_STATUS = {
    # healthy
    'RUNNING': AgentCheck.OK,
    # indeterminate
    'STARTING': AgentCheck.UNKNOWN,
    'UNKNOWN': AgentCheck.UNKNOWN,
    # not serving
    'STOPPED': AgentCheck.CRITICAL,
    'STOPPING': AgentCheck.CRITICAL,
    'BACKOFF': AgentCheck.CRITICAL,
    'EXITED': AgentCheck.CRITICAL,
    'FATAL': AgentCheck.CRITICAL,
}

# Maps a service-check status to the value used in the ``status:<value>`` tag
# of the per-status process count gauge.
PROCESS_STATUS = {
    AgentCheck.OK: 'up',
    AgentCheck.CRITICAL: 'down',
    AgentCheck.UNKNOWN: 'unknown',
}

# Tag keys attached to emitted service checks and gauges.
SERVER_TAG = 'supervisord_server'
PROCESS_TAG = 'supervisord_process'


def FORMAT_TIME(x):
    """Render the epoch timestamp ``x`` as 'YYYY-MM-DD HH:MM:SS' in local time."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x))
|
||
|
||
class SupervisordCheck(AgentCheck):
    """Agent check reporting supervisord process state, counts and uptime.

    For each configured instance the check talks to supervisord over XML-RPC
    (inet HTTP server, or a unix socket when ``socket`` is configured) and
    emits:

    * ``supervisord.process.check``  - one service check per process,
    * ``supervisord.process.uptime`` - one gauge per process,
    * ``supervisord.process.count``  - one gauge per status (up/down/unknown),
    * ``supervisord.server.check``   - a CRITICAL service check when the
      connection fails with a socket error other than EACCES/ENOENT.

    NOTE(review): the server service check is only ever emitted as CRITICAL
    here; nothing in this class emits a matching OK - confirm whether that is
    intended.
    """

    def check(self, instance):
        """Run the check for a single supervisord instance.

        Args:
            instance: instance configuration dict. Reads ``name`` (required),
                ``proc_names``, ``host``, ``port``, ``socket``, ``user`` and
                ``pass``.

        Raises:
            Exception: if ``name`` is missing or blank, ``proc_names`` is set
                but is not a list, supervisord cannot be reached, the
                XML-RPC transport reports a protocol error, or reading a
                named process fails with a fault other than "bad name".
        """
        server_name = instance.get('name')

        if not server_name or not server_name.strip():
            raise Exception("Supervisord server name not specified in yaml configuration.")

        server_tags = ['%s:%s' % (SERVER_TAG, server_name)]
        supe = self._connect(instance)
        count_by_status = defaultdict(int)

        # Grab process information
        try:
            proc_names = instance.get('proc_names')
            if proc_names:
                # A truthy value is necessarily non-empty, so only the type
                # needs validating here.
                if not isinstance(proc_names, list):
                    raise Exception("Empty or invalid proc_names.")
                processes = []
                for proc_name in proc_names:
                    try:
                        processes.append(supe.getProcessInfo(proc_name))
                    except xmlrpclib.Fault as e:
                        if e.faultCode == 10:  # bad process name
                            # Skip unknown processes but keep checking the rest.
                            self.warning('Process not found: %s' % proc_name)
                        else:
                            raise Exception('An error occurred while reading '
                                            'process %s information: %s %s'
                                            % (proc_name, e.faultCode, e.faultString))
            else:
                processes = supe.getAllProcessInfo()
        except socket.error as e:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            sock = instance.get('socket')
            if sock is None:
                msg = 'Cannot connect to http://%s:%s. ' \
                      'Make sure supervisor is running and XML-RPC ' \
                      'inet interface is enabled.' % (host, port)
            else:
                msg = 'Cannot connect to %s. Make sure supervisor ' \
                      'is running and socket is enabled and socket file' \
                      ' has the right permissions.' % sock

            # Permission denied / no such file are treated as local
            # configuration problems and do not mark the server as down.
            if e.errno not in (errno.EACCES, errno.ENOENT):
                self.service_check('supervisord.server.check', AgentCheck.CRITICAL,
                                   tags=server_tags,
                                   message='Supervisord server %s is down.' % server_name)

            raise Exception(msg)
        except xmlrpclib.ProtocolError as e:
            if e.errcode == 401:  # authorization error
                raise Exception('Username or password to %s are incorrect.' %
                                server_name)
            else:
                raise Exception('An error occurred while connecting to %s: '
                                '%s %s ' % (server_name, e.errcode, e.errmsg))

        # Report service checks and uptime for each process
        for proc in processes:
            proc_name = proc['name']
            tags = server_tags + ['%s:%s' % (PROCESS_TAG, proc_name)]

            # Report Service Check; an unrecognised state name is reported as
            # UNKNOWN instead of aborting the whole check with a KeyError.
            status = DD_STATUS.get(proc['statename'], AgentCheck.UNKNOWN)
            msg = self._build_message(proc)
            count_by_status[status] += 1
            self.service_check('supervisord.process.check',
                               status, tags=tags, message=msg)
            # Report Uptime
            uptime = self._extract_uptime(proc)
            self.gauge('supervisord.process.uptime', uptime, tags=tags)

        # Report counts by status; statuses with no process report 0 because
        # count_by_status is a defaultdict(int).
        for status in PROCESS_STATUS:
            self.gauge('supervisord.process.count', count_by_status[status],
                       tags=server_tags + ['status:%s' % PROCESS_STATUS[status]])

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy for an instance and return its ``supervisor`` namespace.

        When ``socket`` is configured, a ``SupervisorTransport`` bound to that
        socket is used and ``host`` defaults to ``DEFAULT_SOCKET_IP``.
        Otherwise an HTTP URL is built from ``host``/``port``, with
        ``user:pass@`` credentials included only when both are set.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return ``now - start`` for BACKOFF/RUNNING/STOPPING processes, else 0."""
        start, now = int(proc['start']), int(proc['now'])
        status = proc['statename']
        active_state = status in ('BACKOFF', 'RUNNING', 'STOPPING')
        return now - start if active_state else 0

    @staticmethod
    def _build_message(proc):
        """Format a process-info dict as the multi-line service-check message.

        Works on a copy so the caller's ``proc`` dict is not modified. A
        ``stop`` value of 0 is rendered as an empty stop time.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])
        info = dict(proc)
        info['now_str'] = FORMAT_TIME(now)
        info['start_str'] = FORMAT_TIME(start)
        info['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)

        return """Current time: %(now_str)s
Process name: %(name)s
Process group: %(group)s
Description: %(description)s
Error log file: %(stderr_logfile)s
Stdout log file: %(stdout_logfile)s
Log file: %(logfile)s
State: %(statename)s
Start time: %(start_str)s
Stop time: %(stop_str)s
Exit Status: %(exitstatus)s""" % info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# | ||
# There are two ways to get started with the supervisord check. | ||
# | ||
# You can configure inet_http_server in /etc/supervisord.conf. Below is an | ||
# example inet_http_server configuration: | ||
# | ||
# [inet_http_server] | ||
# port:localhost:9001 | ||
# username:user # optional | ||
# password:pass # optional | ||
# | ||
# OR, you can use supervisorctl socket to communicate with supervisor. | ||
# If supervisor is running as root, make sure chmod property is set | ||
# to a permission accessible to non-root users. See the example below: | ||
# | ||
# [supervisorctl] | ||
# serverurl=unix:///var/run/supervisor.sock | ||
# | ||
# [unix_http_server] | ||
# file=/var/run/supervisor.sock | ||
# chmod=777 | ||
# | ||
# Reload supervisor, specify the inet or unix socket server information | ||
# in this yaml file along with an optional list of the processes you want | ||
# to monitor per instance, and you're good to go! | ||
# | ||
# See http://supervisord.org/configuration.html for more information on | ||
# configuring supervisord sockets and inet http servers. | ||
# | ||
|
||
init_config: | ||
|
||
instances: | ||
# - name: server0 # Required. An arbitrary name to identify the supervisord server | ||
# host: localhost # Optional. Defaults to localhost. The host where supervisord server is running | ||
# port: 9001 # Optional. Defaults to 9001. The port number. | ||
# user: user # Optional. Required only if a username is configured. | ||
# pass: pass # Optional. Required only if a password is configured. | ||
#   proc_names:              # Optional. The processes to monitor within this supervisord instance. | ||
# - apache2 # If not specified, the check will monitor all processes. | ||
# - webapp | ||
# - java | ||
# server_check: false # Optional. Defaults to true. Service check for connection to supervisord server. | ||
# - name: server1 | ||
# host: localhost | ||
# port: 9002 | ||
# - name: server2 | ||
# socket: unix:///var/run//supervisor.sock | ||
# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,3 +19,4 @@ httplib2 | |
kafka-python==0.9.0-9bed11db98387c0d9e456528130b330631dc50af | ||
requests | ||
paramiko | ||
supervisor==3.1.3 |
Oops, something went wrong.