Add check for ZFS #1349

Closed
wants to merge 57 commits into from

Commits
2991d4e
initial commit of zfs.py
mboylevt Jan 27, 2015
d6d433e
Add check bits, gauge calls
mboylevt Jan 27, 2015
a10eaed
initial blank conf
mboylevt Jan 27, 2015
9f0a688
Remove main entrypoint for check, correct yaml format
mboylevt Jan 27, 2015
e9d7915
Add proper subprocess handling to zfs
mboylevt Jan 27, 2015
d5a1a79
Add debug info
mboylevt Jan 27, 2015
a28ddbd
check for bad data
mboylevt Jan 27, 2015
f77c793
Add service check
mboylevt Jan 28, 2015
0c3b6e8
Fix spacing
Jan 29, 2015
37117ab
Refactor zfs reporting to make it more flexible
mboylevt Jan 30, 2015
aa9f446
Change name to zfs_name to avoid confusion
mboylevt Jan 30, 2015
bd47f47
Added retrieval for zpool checks and metrics
mboylevt Jan 30, 2015
eab5a7c
Add zpool processing
mboylevt Jan 30, 2015
d8e22ea
int -> float. duh
mboylevt Jan 30, 2015
ef9c8d2
String casting
mboylevt Jan 30, 2015
471b068
Final changes
mboylevt Jan 30, 2015
b1d562c
Dependency injection for subprocess so that Unittests can mock it
mboylevt Jan 30, 2015
b7c0a77
Add unittests for convert_human_to_bytes, update function to address …
mboylevt Jan 30, 2015
a5efc3d
Add more unit tests
mboylevt Jan 31, 2015
748f610
Whitespace for triggering travis
mboylevt Feb 2, 2015
ff0d53b
Add unit test for process_zfs_usage, update function itself to return…
mboylevt Feb 2, 2015
e40220a
Added _process_zpool tests
Feb 2, 2015
2e1acad
Remove unecessary comment -- issue addressed
mboylevt Feb 2, 2015
b0f23a5
update comment
mboylevt Feb 5, 2015
24a3f38
initial commit of zfs.py
mboylevt Jan 27, 2015
e56e43a
Add check bits, gauge calls
mboylevt Jan 27, 2015
39a4f3a
initial blank conf
mboylevt Jan 27, 2015
797bcb9
Remove main entrypoint for check, correct yaml format
mboylevt Jan 27, 2015
74ab77e
Add proper subprocess handling to zfs
mboylevt Jan 27, 2015
0e5464f
Add debug info
mboylevt Jan 27, 2015
1f67011
check for bad data
mboylevt Jan 27, 2015
037debd
Add service check
mboylevt Jan 28, 2015
4dcf57c
Fix spacing
Jan 29, 2015
3b0cbce
Refactor zfs reporting to make it more flexible
mboylevt Jan 30, 2015
9b9fcae
Change name to zfs_name to avoid confusion
mboylevt Jan 30, 2015
0105eee
Added retrieval for zpool checks and metrics
mboylevt Jan 30, 2015
dfdb212
Add zpool processing
mboylevt Jan 30, 2015
1c60138
int -> float. duh
mboylevt Jan 30, 2015
206d2b9
String casting
mboylevt Jan 30, 2015
3816226
Final changes
mboylevt Jan 30, 2015
d38e832
Dependency injection for subprocess so that Unittests can mock it
mboylevt Jan 30, 2015
595da37
Add unittests for convert_human_to_bytes, update function to address …
mboylevt Jan 30, 2015
5344c54
Add more unit tests
mboylevt Jan 31, 2015
418d616
Whitespace for triggering travis
mboylevt Feb 2, 2015
4299fdb
Add unit test for process_zfs_usage, update function itself to return…
mboylevt Feb 2, 2015
e016afe
Added _process_zpool tests
Feb 2, 2015
9be0ea5
Remove unecessary comment -- issue addressed
mboylevt Feb 2, 2015
710d875
update comment
mboylevt Feb 5, 2015
11e22a9
Merge branch 'master' of http://github.com/mboylevt/dd-agent
mboylevt Feb 5, 2015
3506b72
Thanks for fixing the check
Feb 26, 2015
07f2e6c
Merge branch 'master' of http://github.com/mboylevt/dd-agent
Feb 26, 2015
52794f3
Pull request feedback -- clean up imports
mboylevt Mar 3, 2015
60da27b
Move future import back to the top, where it belongs
Mar 10, 2015
7c2a8fb
Add vdev checks
mboylevt Apr 15, 2015
f42fd44
Patch a few things up
Apr 15, 2015
2915f6d
Add percent_used
mboylevt Apr 15, 2015
56908d6
Oops, silly goose
mboylevt Apr 15, 2015
298 changes: 298 additions & 0 deletions checks.d/zfs.py
@@ -0,0 +1,298 @@
'''
ZFS check
'''
from __future__ import division

# stdlib
import subprocess
import re
Contributor

Could you organize your imports the same way it's done in other checks, please?
e.g. https://github.com/DataDog/dd-agent/blob/master/checks.d/redisdb.py#L4-L13

Author

Sure can! Done.
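
For reference, a minimal sketch of the grouping being requested (the same stdlib / project split the file settles on below, which presumably mirrors the redisdb.py convention):

# stdlib
import re
import subprocess

# project
from checks import AgentCheck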


# project
from checks import AgentCheck

class Zfs(AgentCheck):
# Inject dependency so that we can make mocks work in UnitTests
subprocess = subprocess

ZFS_NAMESPACE = 'system.zfs.'
ZFS_AVAILABLE = 'available'
ZFS_USED = 'used'
ZFS_COMPRESSRATIO = 'compressratio'

ZPOOL_NAMESPACE = 'zpool.'
ZPOOL_VDEV_NAMESPACE = 'zpool.vdev.'
ZPOOL_CAPACITY = 'capacity'
ZPOOL_SIZE = 'size'
ZPOOL_DEDUPRATIO = 'dedupratio'
ZPOOL_FREE = 'free'
ZPOOL_ALLOCATED = 'allocated'
ZPOOL_HEALTH = 'health'
ZPOOL_TOTAL = 'total'

zfs_metrics = [
ZFS_AVAILABLE,
ZFS_USED,
ZFS_COMPRESSRATIO
]

zpool_metrics = [
ZPOOL_CAPACITY,
ZPOOL_SIZE,
ZPOOL_DEDUPRATIO,
ZPOOL_FREE,
ZPOOL_ALLOCATED
]

zpool_service_checks = [
ZPOOL_HEALTH
]

def check(self, instance):
# Retrieve the list of ZFS filesystems
self.log.debug('Getting list of zfs filesystems')
zfs_filesystems = self._get_zfs_filesystems()

# Retrieve the list of Zpools
self.log.debug('Getting list of zpools')
zpools = self._get_zpools()

# For each zfs filesystem, retrieve statistics and send them to datadog
for zfs_fs in zfs_filesystems:
self.log.debug('Reporting on ZFS filesystem {}'.format(zfs_fs))
stats = self._get_zfs_stats(zfs_fs)
self._process_zfs_usage(zfs_name=zfs_fs, zfs_stats=stats)

# For each zpool, retrieve statistics and send them to datadog
for zpool in zpools:
self.log.debug('Reporting on zpool {}'.format(zpool))
stats = self._get_zpool_stats(zpool)
checks = self._get_zpool_checks(zpool)
vdev_stats = self._get_zpool_iostat(zpool)
self._process_zpool(zpool, stats, checks, vdev_stats)

def _process_zpool(self, zpool, zpool_metrics, zpool_checks, zpool_vdev_stats):
"""
Process zpool usage

        :param zpool: Name of the zpool
        :param zpool_metrics: Associated statistics
        :param zpool_checks: Associated service checks
        :param zpool_vdev_stats: Per-vdev statistics from 'zpool iostat -v'
:return: None
"""
tags = [
'zpool_name:{}'.format(zpool)
]

for metric in zpool_metrics.keys():
self.gauge(Zfs.ZPOOL_NAMESPACE + metric, zpool_metrics[metric], tags=tags)

for check in zpool_checks.keys():
if check == Zfs.ZPOOL_HEALTH:
check_status = None
health_status = zpool_checks[check]
if health_status == 'ONLINE':
check_status = AgentCheck.OK
                elif health_status == 'DEGRADED':
check_status = AgentCheck.WARNING
else:
check_status = AgentCheck.CRITICAL
self.service_check(Zfs.ZPOOL_NAMESPACE + check, check_status, tags=tags, message=health_status)

for vdev in zpool_vdev_stats:
tags = [
'zpool_name:{}'.format(zpool),
'vdev_name:{}'.format(vdev)
]
self.gauge(Zfs.ZPOOL_VDEV_NAMESPACE + 'total', zpool_vdev_stats[vdev]['total'], tags=tags)
self.gauge(Zfs.ZPOOL_VDEV_NAMESPACE + 'free', zpool_vdev_stats[vdev]['free'], tags=tags)
self.gauge(Zfs.ZPOOL_VDEV_NAMESPACE + 'percent_used', zpool_vdev_stats[vdev]['percent_used'], tags=tags)

def _get_zpools(self):
"""
Get list of zpools
:return: List of zpools
"""
p = self.subprocess.Popen(
'sudo zpool list -H -o name'.split(),
stdout=self.subprocess.PIPE
)
zpools, err = p.communicate()
return filter(None, zpools.split('\n'))

def _get_zpool_stats(self, zpool):
"""
        Retrieve numerical statistics about a zpool, stripping non-numeric characters from the values.

        :param zpool: Name of the zpool
        :return: Dict of property name to numeric value
"""
p = self.subprocess.Popen(
'sudo zpool get {props} {name}'.format(
props=','.join(Zfs.zpool_metrics),
name=zpool
).split(),
stdout=self.subprocess.PIPE
)
zpool_output, err = p.communicate()
stats = {}
for line in filter(None, zpool_output.split('\n')):
properties = line.split()
result = properties[2]
# Stupid zpool command doesn't let you skip headers. Toss this record
if result == 'VALUE':
continue
            if re.match(r'^\d+(\.\d+)?[KMGT]', result):
                result = self._convert_human_to_bytes(result)
            stats[properties[1]] = re.sub(r'[^0-9.]', '', str(result))
return stats

def _get_zpool_iostat(self, zpool):
"""
        Retrieve vdev-specific stats using 'zpool iostat -v', stripping non-numeric characters.

        :param zpool: Name of the zpool
        :return: Dict of vdev label to dicts of total, free and percent_used
"""
        p = self.subprocess.Popen(
            'sudo zpool iostat -v {name}'.format(
                name=zpool
            ).split(),
            stdout=self.subprocess.PIPE
        )
zpool_iostat_output, err = p.communicate()
stats = {}
vdev_count = 0
vdev_name = "VDEV_"
zpool_iostat_output = filter(None, zpool_iostat_output.split('\n'))[4:-1]

# For each line from zpool iostat -v, find the vdevs and get their total and free space
for line in zpool_iostat_output:
properties = line.split()

# We only care about parsing vdevs here for total and free space. Lines from iostat
# which are disk-only don't have total capacity, just '-', so we don't want to send
# any information
if properties[1][0] == '-':
continue
current_vdev = vdev_name + str(vdev_count)
stats[current_vdev] = {}
total = properties[1]
free = properties[2]

            if re.match(r'^\d+(\.\d+)?[KMGT]', free):
                free = self._convert_human_to_bytes(free)
            if re.match(r'^\d+(\.\d+)?[KMGT]', total):
                total = self._convert_human_to_bytes(total)

used = int(total) - int(free)
percent_used = int((used / int(total)) * 100)
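            # int() truncation would report 0% for a nearly empty vdev, so floor at 1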
if percent_used < 1:
percent_used = 1

stats[current_vdev]['total'] = total
stats[current_vdev]['free'] = free
stats[current_vdev]['percent_used'] = percent_used
vdev_count += 1
return stats

def _get_zpool_checks(self, zpool):
"""
        Retrieve service check values for a zpool. Values are returned as-is, with no parsing.

        :param zpool: Name of the zpool
        :return: Dict of property name to raw value
"""
p = self.subprocess.Popen(
'sudo zpool get {props} {name}'.format(
props=','.join(Zfs.zpool_service_checks),
name=zpool
).split(),
stdout=self.subprocess.PIPE
)
zpool_output, err = p.communicate()
checks = {}
for line in filter(None, zpool_output.split('\n')):
properties = line.split()
result = properties[2]
# Stupid zpool command doesn't let you skip headers. Toss this record
if result == 'VALUE':
continue
checks[properties[1]] = result
return checks

@staticmethod
def _convert_human_to_bytes(number):
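        """
        Convert a human-readable size such as '1.5G' into an integer number of bytes.
        Values with no K/M/G/T suffix are cast to float directly; anything that cannot
        be parsed raises NotImplementedError.
        """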
unit = number[-1:].upper()
value = float(number[:-1])

if unit == 'K':
value *= 1024
elif unit == 'M':
value *= 1048576
elif unit == 'G':
value *= 1073741824
elif unit == 'T':
value *= 1099511627776
        else:
try:
value = float(number)
except ValueError:
raise NotImplementedError
return int(value)

def _get_zfs_filesystems(self):
"""
Get all zfs filesystems present on the host
:return: List of zfs filesystems
"""
p = self.subprocess.Popen(
'sudo zfs list -o name -H'.split(),
stdout=self.subprocess.PIPE
)
zfs_filesystems, err = p.communicate()
return filter(None, zfs_filesystems.split('\n'))

def _get_zfs_stats(self, zfs_name):
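        """
        Retrieve the requested zfs properties for a filesystem via 'zfs get',
        stripping non-numeric characters from each value.

        :param zfs_name: Name of zfs filesystem
        :return: Dict of property name to value
        """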
p = self.subprocess.Popen(
'sudo zfs get -o property,value -p {props} -H {name}'.format(
props=','.join(Zfs.zfs_metrics),
name=zfs_name).split(),
stdout=self.subprocess.PIPE
)
zfs_output, err = p.communicate()
stats = {}
for line in filter(None, zfs_output.split('\n')):
properties = line.split()
            stats[properties[0]] = re.sub(r'[^0-9.]', '', properties[1])
return stats

def _process_zfs_usage(self, zfs_name, zfs_stats):
"""
Process zfs usage

:param zfs_name: Name of zfs filesystem
:param zfs_stats: Associated statistics
:return: None
"""
tags = [
'zfs_name:{}'.format(zfs_name)
]

try:
total = int(zfs_stats[Zfs.ZFS_USED]) + int(zfs_stats[Zfs.ZFS_AVAILABLE])
percent_used = int((int(zfs_stats[Zfs.ZFS_USED]) / total) * 100)
if percent_used < 1:
percent_used = 1
self.gauge(Zfs.ZFS_NAMESPACE + 'total', str(total), tags=tags)
self.gauge(Zfs.ZFS_NAMESPACE + 'percent_used', str(percent_used), tags=tags)

except ValueError:
self.log.debug("Could not determine total and percentage for zfs {name}, used {used}, avail {avail}".format(
name=zfs_name,
used=zfs_stats[Zfs.ZFS_USED],
avail=zfs_stats[Zfs.ZFS_AVAILABLE]
))

for metric in zfs_stats.keys():
self.gauge(Zfs.ZFS_NAMESPACE + metric, zfs_stats[metric], tags=tags)

4 changes: 4 additions & 0 deletions conf.d/zfs.yaml.example
@@ -0,0 +1,4 @@
init_config:

instances:
- name: test