Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to select processes to monitor by regex pattern match #1375

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions checks.d/supervisord.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from collections import defaultdict
import errno
import socket
import time
import xmlrpclib
import re
import itertools

from checks import AgentCheck

import supervisor.xmlrpc

# Connection defaults used when an instance does not override them.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '9001'
# Base URL handed to the XML-RPC proxy when connecting through a unix socket.
DEFAULT_SOCKET_IP = 'http://127.0.0.1'

# Maps a supervisord process state name to the service check status reported
# for that process.
DD_STATUS = {
    'STOPPED': AgentCheck.CRITICAL,
    'STARTING': AgentCheck.UNKNOWN,
    'RUNNING': AgentCheck.OK,
    'BACKOFF': AgentCheck.CRITICAL,
    'STOPPING': AgentCheck.CRITICAL,
    'EXITED': AgentCheck.CRITICAL,
    'FATAL': AgentCheck.CRITICAL,
    'UNKNOWN': AgentCheck.UNKNOWN
}

# Maps a service check status to the value of the `status:` tag attached to
# the per-status process count gauge.
PROCESS_STATUS = {
    AgentCheck.CRITICAL: 'down',
    AgentCheck.OK: 'up',
    AgentCheck.UNKNOWN: 'unknown'
}

# Tag keys identifying the supervisord server and the individual process.
SERVER_TAG = 'supervisord_server'
PROCESS_TAG = 'supervisord_process'


def FORMAT_TIME(x):
    """Render the epoch timestamp `x` as a local 'YYYY-MM-DD HH:MM:SS' string."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x))


class SupervisordCheck(AgentCheck):
    """Agent check reporting on processes managed by a supervisord server.

    For each configured instance the check submits:
      * a `supervisord.process.check` service check per monitored process,
      * a `supervisord.process.uptime` gauge per monitored process,
      * a `supervisord.process.count` gauge per status in PROCESS_STATUS,
      * a CRITICAL `supervisord.server.check` service check when the server
        cannot be reached (unless disabled with `server_check: false`).
    """

    def check(self, instance):
        """Run the check against one configured supervisord instance.

        Instance keys read here: `name` (required), `host`, `port`, `socket`,
        `proc_regex`, `proc_names` and `server_check`.

        Raises:
            Exception: when `name` is missing or blank, when the server
                cannot be queried, or when `proc_regex` / `proc_names` is
                not a list.
        """
        server_name = instance.get('name')

        if not server_name or not server_name.strip():
            raise Exception("Supervisord server name not specified in yaml configuration.")

        server_tags = ['%s:%s' % (SERVER_TAG, server_name)]

        # Named `proxy` so the imported `supervisor` package is not shadowed.
        proxy = self._connect(instance)
        count_by_status = defaultdict(int)

        # gather all process information
        try:
            processes = proxy.getAllProcessInfo()
        except xmlrpclib.Fault as error:
            raise Exception(
                'An error occurred while reading process information: %s %s'
                % (error.faultCode, error.faultString)
            )
        except socket.error as e:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            sock = instance.get('socket')
            if sock is None:
                msg = 'Cannot connect to http://%s:%s. ' \
                      'Make sure supervisor is running and XML-RPC ' \
                      'inet interface is enabled.' % (host, port)
            else:
                msg = 'Cannot connect to %s. Make sure supervisor ' \
                      'is running and socket is enabled and socket file' \
                      ' has the right permissions.' % sock

            # Permission-denied / no-such-file errors are not reported as the
            # server being down. `server_check: false` disables the server
            # service check entirely.
            server_check = instance.get('server_check', True)
            if server_check and e.errno not in (errno.EACCES, errno.ENOENT):
                self.service_check('supervisord.server.check', AgentCheck.CRITICAL,
                                   tags=server_tags,
                                   message='Supervisord server %s is down.' % server_name)

            raise Exception(msg)
        except xmlrpclib.ProtocolError as e:
            if e.errcode == 401:  # authorization error
                raise Exception('Username or password to %s are incorrect.' %
                                server_name)
            else:
                raise Exception('An error occurred while connecting to %s: '
                                '%s %s ' % (server_name, e.errcode, e.errmsg))

        # filter monitored processes on configuration directives
        proc_regex = instance.get('proc_regex', [])
        if not isinstance(proc_regex, list):
            raise Exception("Empty or invalid proc_regex.")

        proc_names = instance.get('proc_names', [])
        if not isinstance(proc_names, list):
            raise Exception("Empty or invalid proc_names.")

        monitored_processes = self._select_processes(processes, proc_regex, proc_names)

        # Report service checks and uptime for each process
        for proc in monitored_processes:
            tags = server_tags + ['%s:%s' % (PROCESS_TAG, proc['name'])]

            # A state name missing from DD_STATUS is reported as UNKNOWN
            # instead of aborting the whole check with a KeyError.
            status = DD_STATUS.get(proc['statename'], AgentCheck.UNKNOWN)
            count_by_status[status] += 1
            self.service_check('supervisord.process.check',
                               status, tags=tags, message=self._build_message(proc))

            self.gauge('supervisord.process.uptime',
                       self._extract_uptime(proc), tags=tags)

        # Report counts by status; statuses with no process are reported as 0.
        for status in PROCESS_STATUS:
            self.gauge('supervisord.process.count', count_by_status[status],
                       tags=server_tags + ['status:%s' % PROCESS_STATUS[status]])

    @staticmethod
    def _select_processes(processes, proc_regex, proc_names):
        """Return the entries of `processes` selected by the configured filters.

        A process is selected when its name is listed in `proc_names` or when
        any pattern in `proc_regex` matches the start of its name (`re.match`
        semantics). With both filters empty every process is returned. The
        result keeps the order of `processes`.
        """
        if not proc_regex and not proc_names:
            return processes

        # Compile once instead of re-resolving each pattern for every process.
        patterns = [re.compile(pattern) for pattern in proc_regex]
        names = set(proc_names)

        return [
            process for process in processes
            if process['name'] in names
            or any(pattern.match(process['name']) for pattern in patterns)
        ]

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy for `instance` and return its `supervisor` namespace.

        When `socket` is configured the connection goes through a
        `SupervisorTransport` bound to that socket; otherwise an HTTP URL is
        built from `host`, `port` and, when both are set, `user` and `pass`.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            # Credentials are only embedded when both user and password are set.
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return `now - start` for a BACKOFF, RUNNING or STOPPING process, else 0."""
        start, now = int(proc['start']), int(proc['now'])
        active_state = proc['statename'] in ('BACKOFF', 'RUNNING', 'STOPPING')
        return now - start if active_state else 0

    @staticmethod
    def _build_message(proc):
        """Return the multi-line service check message describing `proc`.

        The formatted timestamps are added to a copy, so the caller's process
        dict is left unmodified. A `stop` value of 0 renders as an empty
        stop time.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])

        details = dict(proc)
        details['now_str'] = FORMAT_TIME(now)
        details['start_str'] = FORMAT_TIME(start)
        details['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)

        return """Current time: %(now_str)s
Process name: %(name)s
Process group: %(group)s
Description: %(description)s
Error log file: %(stderr_logfile)s
Stdout log file: %(stdout_logfile)s
Log file: %(logfile)s
State: %(statename)s
Start time: %(start_str)s
Stop time: %(stop_str)s
Exit Status: %(exitstatus)s""" % details
51 changes: 51 additions & 0 deletions conf.d/supervisord.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#
# There are two ways to get started with the supervisord check.
#
# You can configure inet_http_server in /etc/supervisord.conf. Below is an
# example inet_http_server configuration:
#
# [inet_http_server]
# port:localhost:9001
# username:user # optional
# password:pass # optional
#
# OR, you can use supervisorctl socket to communicate with supervisor.
# If supervisor is running as root, make sure chmod property is set
# to a permission accessible to non-root users. See the example below:
#
# [supervisorctl]
# serverurl=unix:///var/run/supervisor.sock
#
# [unix_http_server]
# file=/var/run/supervisor.sock
# chmod=777
#
# Reload supervisor, specify the inet or unix socket server information
# in this yaml file along with an optional list of the processes you want
# to monitor per instance, and you're good to go!
#
# See http://supervisord.org/configuration.html for more information on
# configuring supervisord sockets and inet http servers.
#

init_config:

instances:
# - name: server0 # Required. An arbitrary name to identify the supervisord server
# host: localhost # Optional. Defaults to localhost. The host where supervisord server is running
# port: 9001 # Optional. Defaults to 9001. The port number.
# user: user # Optional. Required only if a username is configured.
# pass: pass # Optional. Required only if a password is configured.
# proc_regex: # Optional. Regex pattern[s] matching the names of processes to monitor
# - 'myprocess-\d\d$'
# proc_names: # Optional. Monitor processes with specific names.
# - apache2 # Combines with proc_regex matches.
# - custom_app # If neither proc_names nor proc_regex is specified, the check will monitor all processes.
# - etc
# server_check: false # Optional. Defaults to true. Service check for connection to supervisord server.
# - name: server1
# host: localhost
# port: 9002
# - name: server2
# socket: unix:///var/run/supervisor.sock
# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ httplib2
kafka-python==0.9.0-9bed11db98387c0d9e456528130b330631dc50af
requests
paramiko
supervisor==3.1.3
Loading