Skip to content

Commit

Permalink
Supervisord check and tests
Browse files Browse the repository at this point in the history
This adds a new agent check for supervisord. The check reports whether
supervisord is running or not and whether one or more processes are
running or not. It also reports each process's uptime. The check works
over the inet HTTP server as well as over unix sockets. See
supervisord.yaml.example for more details.
  • Loading branch information
isaacdd committed Mar 5, 2015
1 parent 3729b72 commit 86f35c2
Show file tree
Hide file tree
Showing 4 changed files with 630 additions and 0 deletions.
158 changes: 158 additions & 0 deletions checks.d/supervisord.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from collections import defaultdict
import errno
import socket
import time
import xmlrpclib

from checks import AgentCheck

import supervisor.xmlrpc

# Fallbacks used when an instance does not configure `host` / `port` for the
# inet HTTP server. The port is kept as a string; it is only ever interpolated
# into a URL or an error message.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '9001'
# URL handed to the XML-RPC proxy when the instance talks to supervisord
# through a `socket` and does not configure `host`.
DEFAULT_SOCKET_IP = 'http://127.0.0.1'

# Maps a supervisord process state name (the `statename` field of a process
# info dict) to the service-check status reported for that process.
DD_STATUS = {
'STOPPED': AgentCheck.CRITICAL,
'STARTING': AgentCheck.UNKNOWN,
'RUNNING': AgentCheck.OK,
'BACKOFF': AgentCheck.CRITICAL,
'STOPPING': AgentCheck.CRITICAL,
'EXITED': AgentCheck.CRITICAL,
'FATAL': AgentCheck.CRITICAL,
'UNKNOWN': AgentCheck.UNKNOWN
}

# Maps a service-check status to the value of the `status:` tag attached to
# the `supervisord.process.count` gauge.
PROCESS_STATUS = {
AgentCheck.CRITICAL: 'down',
AgentCheck.OK: 'up',
AgentCheck.UNKNOWN: 'unknown'
}

# Tag key identifying the supervisord server an event or metric belongs to.
SERVER_TAG = 'supervisord_server'

# Tag key identifying the individual supervised process.
PROCESS_TAG = 'supervisord_process'

def FORMAT_TIME(x):
    """Render the epoch timestamp ``x`` in local time as 'YYYY-MM-DD HH:MM:SS'."""
    local_struct = time.localtime(x)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)


class SupervisordCheck(AgentCheck):
    """Agent check reporting supervisord server and process health.

    For each configured instance the check queries supervisord over XML-RPC
    (inet HTTP server, or a socket through ``SupervisorTransport``) and emits:

    * ``supervisord.server.check`` -- CRITICAL service check when the
      connection fails with a socket error other than EACCES/ENOENT.
      Skipped when the instance sets ``server_check: false``.
    * ``supervisord.process.check`` -- one service check per process, with
      the status taken from ``DD_STATUS``.
    * ``supervisord.process.uptime`` -- gauge per process (``now - start`` as
      reported by supervisord, 0 for processes that are not active).
    * ``supervisord.process.count`` -- gauge per ``PROCESS_STATUS`` value.
    """

    def check(self, instance):
        """Run the check for one instance of the yaml configuration.

        :param instance: dict of instance settings. ``name`` is required;
            ``host``, ``port``, ``user``, ``pass``, ``socket``,
            ``proc_names`` and ``server_check`` are optional.
        :raises Exception: if ``name`` is missing or blank, ``proc_names`` is
            not a list, the server cannot be reached, the credentials are
            rejected, or supervisord answers with an unexpected fault.
        """
        server_name = instance.get('name')

        if not server_name or not server_name.strip():
            raise Exception("Supervisord server name not specified in yaml configuration.")

        supe = self._connect(instance)
        count_by_status = defaultdict(int)

        # Grab process information
        try:
            processes = self._collect_processes(supe, instance.get('proc_names'))
        except socket.error as e:
            sock = instance.get('socket')
            if sock is None:
                host = instance.get('host', DEFAULT_HOST)
                port = instance.get('port', DEFAULT_PORT)
                msg = ('Cannot connect to http://%s:%s. '
                       'Make sure supervisor is running and XML-RPC '
                       'inet interface is enabled.' % (host, port))
            else:
                msg = ('Cannot connect to %s. Make sure supervisor '
                       'is running and socket is enabled and socket file'
                       ' has the right permissions.' % sock)

            # A permission-denied or missing-socket-file error is not reported
            # as "server down"; every other socket error is, unless the
            # instance opted out with `server_check: false`.
            if (instance.get('server_check', True) and
                    e.errno not in (errno.EACCES, errno.ENOENT)):
                self.service_check('supervisord.server.check', AgentCheck.CRITICAL,
                                   tags=['%s:%s' % (SERVER_TAG, server_name)],
                                   message='Supervisord server %s is down.' % server_name)

            raise Exception(msg)
        except xmlrpclib.ProtocolError as e:
            if e.errcode == 401:  # authorization error
                raise Exception('Username or password to %s are incorrect.' %
                                server_name)
            raise Exception('An error occurred while connecting to %s: '
                            '%s %s ' % (server_name, e.errcode, e.errmsg))

        # Report service checks and uptime for each process
        for proc in processes:
            proc_name = proc['name']
            tags = ['%s:%s' % (SERVER_TAG, server_name),
                    '%s:%s' % (PROCESS_TAG, proc_name)]

            # Report Service Check
            status = DD_STATUS[proc['statename']]
            msg = self._build_message(proc)
            count_by_status[status] += 1
            self.service_check('supervisord.process.check',
                               status, tags=tags, message=msg)
            # Report Uptime
            uptime = self._extract_uptime(proc)
            self.gauge('supervisord.process.uptime', uptime, tags=tags)

        # Report counts by status; statuses that were never seen report 0
        # because count_by_status is a defaultdict(int).
        tags = ['%s:%s' % (SERVER_TAG, server_name)]
        for status in PROCESS_STATUS:
            self.gauge('supervisord.process.count', count_by_status[status],
                       tags=tags + ['status:%s' % PROCESS_STATUS[status]])

    def _collect_processes(self, supe, proc_names):
        """Return the process info dicts the check should report on.

        :param supe: the ``supervisor`` XML-RPC namespace from ``_connect``.
        :param proc_names: value of the instance's ``proc_names`` option. A
            falsy value means "all processes".
        :returns: list of process info dicts. Names supervisord does not know
            are skipped with a warning.
        :raises Exception: if ``proc_names`` is set but is not a list, or if
            supervisord returns a fault other than "bad name".
        """
        if not proc_names:
            return supe.getAllProcessInfo()

        if not isinstance(proc_names, list):
            raise Exception("Empty or invalid proc_names.")

        processes = []
        for proc_name in proc_names:
            try:
                processes.append(supe.getProcessInfo(proc_name))
            except xmlrpclib.Fault as e:
                if e.faultCode == 10:  # bad process name
                    self.warning('Process not found: %s' % proc_name)
                else:
                    raise Exception('An error occurred while reading '
                                    'process %s information: %s %s'
                                    % (proc_name, e.faultCode, e.faultString))
        return processes

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy for an instance and return its ``supervisor`` namespace.

        When ``socket`` is configured the proxy goes through
        ``supervisor.xmlrpc.SupervisorTransport``; otherwise it targets
        ``http://[user:pass@]host:port/RPC2``. Credentials are only added
        when both ``user`` and ``pass`` are set.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return ``now - start`` for an active process, otherwise 0.

        A process counts as active when its ``statename`` is BACKOFF,
        RUNNING or STOPPING.
        """
        start, now = int(proc['start']), int(proc['now'])
        status = proc['statename']
        active_state = status in ['BACKOFF', 'RUNNING', 'STOPPING']
        return now - start if active_state else 0

    @staticmethod
    def _build_message(proc):
        """Return the multi-line service-check message describing ``proc``.

        The process info dict is copied before the formatted timestamps are
        added, so the caller's dict is left untouched. A ``stop`` value of 0
        is rendered as an empty stop time.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])
        info = dict(proc)
        info['now_str'] = FORMAT_TIME(now)
        info['start_str'] = FORMAT_TIME(start)
        info['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)

        return """Current time: %(now_str)s
Process name: %(name)s
Process group: %(group)s
Description: %(description)s
Error log file: %(stderr_logfile)s
Stdout log file: %(stdout_logfile)s
Log file: %(logfile)s
State: %(statename)s
Start time: %(start_str)s
Stop time: %(stop_str)s
Exit Status: %(exitstatus)s""" % info
49 changes: 49 additions & 0 deletions conf.d/supervisord.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# There are two ways to get started with the supervisord check.
#
# You can configure inet_http_server in /etc/supervisord.conf. Below is an
# example inet_http_server configuration:
#
# [inet_http_server]
# port:localhost:9001
# username:user # optional
# password:pass # optional
#
# OR, you can use supervisorctl socket to communicate with supervisor.
# If supervisor is running as root, make sure chmod property is set
# to a permission accessible to non-root users. See the example below:
#
# [supervisorctl]
# serverurl=unix:///var/run/supervisor.sock
#
# [unix_http_server]
# file=/var/run/supervisor.sock
# chmod=777
#
# Reload supervisor, specify the inet or unix socket server information
# in this yaml file along with an optional list of the processes you want
# to monitor per instance, and you're good to go!
#
# See http://supervisord.org/configuration.html for more information on
# configuring supervisord sockets and inet http servers.
#

init_config:

instances:
# - name: server0 # Required. An arbitrary name to identify the supervisord server
# host: localhost # Optional. Defaults to localhost. The host where supervisord server is running
# port: 9001 # Optional. Defaults to 9001. The port number.
# user: user # Optional. Required only if a username is configured.
# pass: pass # Optional. Required only if a password is configured.
# proc_names: # Optional. The processes to monitor within this supervisord instance.
# - apache2 # If not specified, the check will monitor all processes.
# - webapp
# - java
# server_check: false # Optional. Defaults to true. Service check for connection to supervisord server.
# - name: server1
# host: localhost
# port: 9002
# - name: server2
# socket: unix:///var/run/supervisor.sock
# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ httplib2
kafka-python==0.9.0-9bed11db98387c0d9e456528130b330631dc50af
requests
paramiko
supervisor==3.1.3
Loading

0 comments on commit 86f35c2

Please sign in to comment.