Add a Riak CI flavor, optional instance tags for the riak check, and tests.

Review notes on this revision of the patch:
  * Riak.keys: a comma is added after "node_get_fsm_out_rate"; without it the
    next string literal was implicitly concatenated to it.
  * Riak.check: return after reporting a non-200 response as CRITICAL.
  * ci/riak.rb: riak_version reads FLAVOR_VERSION instead of COUCHDB_VERSION.
  * The index hashes below are carried over from the original patch.

diff --git a/.travis.yml b/.travis.yml
index 255f90bfa1..31904b9b36 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -65,6 +65,8 @@ env:
     - TRAVIS_FLAVOR=redis FLAVOR_VERSION=2.4.18
     - TRAVIS_FLAVOR=redis FLAVOR_VERSION=2.6.17
     - TRAVIS_FLAVOR=redis FLAVOR_VERSION=2.8.19
+    # Compilation takes too much time on Travis
+    # - TRAVIS_FLAVOR=riak
     - TRAVIS_FLAVOR=snmpd
     - TRAVIS_FLAVOR=ssh
     - TRAVIS_FLAVOR=supervisord
diff --git a/Rakefile b/Rakefile
index d0c294d300..f4c6b05b34 100755
--- a/Rakefile
+++ b/Rakefile
@@ -24,6 +24,7 @@ require './ci/phpfpm'
 require './ci/postgres'
 require './ci/rabbitmq'
 require './ci/redis'
+require './ci/riak'
 require './ci/snmpd'
 require './ci/ssh'
 require './ci/supervisord'
diff --git a/checks.d/riak.py b/checks.d/riak.py
index a6bd9601b8..020cd6160f 100644
--- a/checks.d/riak.py
+++ b/checks.d/riak.py
@@ -1,6 +1,4 @@
 # stdlib
-import time
-from hashlib import md5
 import socket
 
 # project
@@ -10,44 +8,46 @@ import simplejson as json
 from httplib2 import Http, HttpLib2Error
 
+
 class Riak(AgentCheck):
+    """Agent check that reports gauges read from a Riak node's HTTP stats URL."""
 
     SERVICE_CHECK_NAME = 'riak.can_connect'
 
     keys = [
-        "vnode_gets",
-        "vnode_puts",
-        "vnode_index_reads",
-        "vnode_index_writes",
-        "vnode_index_deletes",
-        "node_gets",
-        "node_puts",
-        "pbc_active",
-        "pbc_connects",
-        "memory_total",
-        "memory_processes",
-        "memory_processes_used",
         "memory_atom",
         "memory_atom_used",
         "memory_binary",
         "memory_code",
         "memory_ets",
-        "read_repairs",
-        "node_put_fsm_rejected_60s",
-        "node_put_fsm_active_60s",
-        "node_put_fsm_in_rate",
-        "node_put_fsm_out_rate",
-        "node_get_fsm_rejected_60s",
+        "memory_processes",
+        "memory_processes_used",
+        "memory_total",
         "node_get_fsm_active_60s",
         "node_get_fsm_in_rate",
-        "node_get_fsm_out_rate"
+        "node_get_fsm_out_rate",
+        "node_get_fsm_rejected_60s",
+        "node_gets",
+        "node_put_fsm_active_60s",
+        "node_put_fsm_in_rate",
+        "node_put_fsm_out_rate",
+        "node_put_fsm_rejected_60s",
+        "node_puts",
+        "pbc_active",
+        "pbc_connects",
+        "read_repairs",
+        "vnode_gets",
+        "vnode_index_deletes",
+        "vnode_index_reads",
+        "vnode_index_writes",
+        "vnode_puts",
     ]
 
     stat_keys = [
-        "node_get_fsm_siblings",
         "node_get_fsm_objsize",
+        "node_get_fsm_siblings",
         "node_get_fsm_time",
         "node_put_fsm_time"
-        ]
+    ]
 
     def __init__(self, name, init_config, agentConfig, instances=None):
         AgentCheck.__init__(self, name, init_config, agentConfig, instances)
@@ -57,30 +57,42 @@ def __init__(self, name, init_config, agentConfig, instances=None):
         self.prev_coord_redirs_total = -1
 
     def check(self, instance):
-        url             = instance['url']
+        """Query the instance's stats URL and submit gauges and a service check.
+
+        Reads `url`, `timeout` and `tags` from the instance; connection
+        errors are reported as CRITICAL and re-raised.
+        """
+        url = instance['url']
         default_timeout = self.init_config.get('default_timeout', 5)
-        timeout         = float(instance.get('timeout', default_timeout))
-        service_check_tags = ['url:%s' % url]
+        timeout = float(instance.get('timeout', default_timeout))
+        tags = instance.get('tags', [])
+        service_check_tags = tags + ['url:%s' % url]
 
         try:
             h = Http(timeout=timeout)
             resp, content = h.request(url, "GET")
         except (socket.timeout, socket.error, HttpLib2Error) as e:
             self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
-                message="Unable to fetch Riak stats: %s" % str(e),
-                tags=service_check_tags)
+                               message="Unable to fetch Riak stats: %s" % str(e),
+                               tags=service_check_tags)
+            raise
 
         if resp.status != 200:
             self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
-                tags=service_check_tags,
-                message="Unexpected status of %s when fetching Riak stats, " \
-                "response: %s" % (resp.status, content))
+                               tags=service_check_tags,
+                               message="Unexpected status of %s when fetching Riak stats, "
+                               "response: %s" % (resp.status, content))
+            # Stop here: otherwise an OK service check would follow the
+            # CRITICAL one and the error body would be parsed as stats.
+            return
 
         stats = json.loads(content)
         self.service_check(
             self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
 
-        [self.gauge("riak." + k, stats[k]) for k in self.keys if k in stats]
+        for k in self.keys:
+            if k in stats:
+                self.gauge("riak." + k, stats[k], tags=tags)
 
         coord_redirs_total = stats["coord_redirs_total"]
         if self.prev_coord_redirs_total > -1:
diff --git a/ci/common.rb b/ci/common.rb
index 88ca3eb9fb..c7a65582a6 100644
--- a/ci/common.rb
+++ b/ci/common.rb
@@ -67,7 +67,7 @@ def section(name)
 
     task :before_cache do |t|
       section('BEFORE_CACHE')
-      sh %(find $INTEGRATIONS_DIR/ -type f -name '*.log' -delete)
+      sh %(find $INTEGRATIONS_DIR/ -type f -name '*.log*' -delete)
       t.reenable
     end
 
diff --git a/ci/riak.rb b/ci/riak.rb
new file mode 100644
index 0000000000..3696af0056
--- /dev/null
+++ b/ci/riak.rb
@@ -0,0 +1,100 @@
+require './ci/common'
+
+# Riak version to build; FLAVOR_VERSION overrides the default.
+def riak_version
+  ENV['FLAVOR_VERSION'] || '2.0.5'
+end
+
+# Directory the built dev nodes are moved to for this Riak version.
+def riak_rootdir
+  "#{ENV['INTEGRATIONS_DIR']}/riak_#{riak_version}"
+end
+
+namespace :ci do
+  namespace :riak do |flavor|
+    task :before_install => ['ci:common:before_install']
+
+    task :install => ['ci:common:install'] do
+      unless Dir.exist? File.expand_path(riak_rootdir)
+        sh %(curl -o $VOLATILE_DIR/kerl https://raw.githubusercontent.com/spawngrid/kerl/master/kerl)
+        sh %(chmod a+x $VOLATILE_DIR/kerl)
+        sh %($VOLATILE_DIR/kerl build git git://github.com/basho/otp.git OTP_R16B02 R16B02)
+        sh %($VOLATILE_DIR/kerl install R16B02 $VOLATILE_DIR/erlang/R16B02)
+        sh %(curl -o $VOLATILE_DIR/riak.tar.gz http://s3.amazonaws.com/downloads.basho.com/riak/#{riak_version[0..2]}/#{riak_version}/riak-#{riak_version}.tar.gz)
+        sh %(mkdir -p $VOLATILE_DIR/riak)
+        sh %(tar zxvf $VOLATILE_DIR/riak.tar.gz -C $VOLATILE_DIR/riak --strip-components=1)
+        sh %(cd $VOLATILE_DIR/riak\
+             && PATH=$PATH:$VOLATILE_DIR/erlang/R16B02/bin make all\
+             && PATH=$PATH:$VOLATILE_DIR/erlang/R16B02/bin make devrel DEVNODES=2)
+        sh %(mv $VOLATILE_DIR/riak/dev #{riak_rootdir})
+      end
+    end
+
+    task :before_script => ['ci:common:before_script'] do
+      %w(dev1 dev2).each do |dev|
+        sh %(#{riak_rootdir}/#{dev}/bin/riak start)
+      end
+      # When cached, dev2 is already a member of the cluster
+      sh %(#{riak_rootdir}/dev2/bin/riak-admin cluster join dev1@127.0.0.1 || true)
+      sh %(#{riak_rootdir}/dev2/bin/riak-admin cluster plan)
+      sh %(#{riak_rootdir}/dev2/bin/riak-admin cluster commit)
+      10.times do
+        sh %(curl -s -XPUT http://localhost:10018/buckets/welcome/keys/german\
+             -H 'Content-Type: text/plain'\
+             -d 'herzlich willkommen')
+        sh %(curl -s http://localhost:10018/buckets/welcome/keys/german)
+      end
+    end
+
+    task :script => ['ci:common:script'] do
+      this_provides = [
+        'riak'
+      ]
+      Rake::Task['ci:common:run_tests'].invoke(this_provides)
+    end
+
+    task :before_cache => ['ci:common:before_cache'] do
+      Rake::Task['ci:riak:cleanup'].invoke
+    end
+
+    task :cache => ['ci:common:cache']
+
+    task :cleanup => ['ci:common:cleanup'] do
+      %w(dev1 dev2).each do |dev|
+        sh %(#{riak_rootdir}/#{dev}/bin/riak stop)
+        sh %(rm -rf #{riak_rootdir}/#{dev}/data)
+      end
+    end
+
+    task :execute do
+      exception = nil
+      # Compilation takes too long on Travis, so it's using the cache
+      # External contributors don't have access to the cache
+      # So they can't run these tests
+      if ENV['TRAVIS'] && ENV['AWS_SECRET_ACCESS_KEY'].nil?
+        puts "Riak tests won't run, compilation takes too long on Travis"
+      else
+        begin
+          %w(before_install install before_script script).each do |t|
+            Rake::Task["#{flavor.scope.path}:#{t}"].invoke
+          end
+        rescue => e
+          exception = e
+          puts "Failed task: #{e.class} #{e.message}".red
+        end
+        if ENV['SKIP_CLEANUP']
+          puts 'Skipping cleanup, disposable environments are great'.yellow
+        else
+          puts 'Cleaning up'
+          Rake::Task["#{flavor.scope.path}:cleanup"].invoke
+        end
+        if ENV['TRAVIS']
+          %w(before_cache cache).each do |t|
+            Rake::Task["#{flavor.scope.path}:#{t}"].invoke
+          end
+        end
+        fail exception if exception
+      end
+    end
+  end
+end
diff --git a/conf.d/riak.yaml.example b/conf.d/riak.yaml.example
index bb8e5d7aef..1d38aea0a9 100644
--- a/conf.d/riak.yaml.example
+++ b/conf.d/riak.yaml.example
@@ -2,3 +2,6 @@ init_config:
 
 instances:
   - url: http://127.0.0.1:8098/stats
+    # tags:
+    #   - optional_tag1
+    #   - optional_tag2
diff --git a/tests/test_riak.py b/tests/test_riak.py
new file mode 100644
index 0000000000..ae0f7a3156
--- /dev/null
+++ b/tests/test_riak.py
@@ -0,0 +1,107 @@
+from nose.plugins.attrib import attr
+import socket
+from tests.common import AgentCheckTest
+
+
+@attr(requires='riak')
+class RiakTestCase(AgentCheckTest):
+    """Tests for the riak check, run against a node listening on localhost."""
+
+    CHECK_NAME = 'riak'
+
+    CHECK_GAUGES = [
+        'riak.memory_atom',
+        'riak.memory_atom_used',
+        'riak.memory_binary',
+        'riak.memory_code',
+        'riak.memory_ets',
+        'riak.memory_processes',
+        'riak.memory_processes_used',
+        'riak.memory_total',
+        'riak.node_get_fsm_active_60s',
+        'riak.node_get_fsm_in_rate',
+        'riak.node_gets',
+        'riak.node_put_fsm_active_60s',
+        'riak.node_put_fsm_in_rate',
+        'riak.node_put_fsm_out_rate',
+        'riak.node_put_fsm_rejected_60s',
+        'riak.node_puts',
+        'riak.pbc_active',
+        'riak.pbc_connects',
+        'riak.read_repairs',
+        'riak.vnode_gets',
+        'riak.vnode_index_deletes',
+        'riak.vnode_index_reads',
+        'riak.vnode_index_writes',
+        'riak.vnode_puts',
+    ]
+
+    CHECK_GAUGES_STATS = [
+        'riak.node_get_fsm_objsize_100',
+        'riak.node_get_fsm_objsize_95',
+        'riak.node_get_fsm_objsize_99',
+        'riak.node_get_fsm_objsize_mean',
+        'riak.node_get_fsm_objsize_median',
+        'riak.node_get_fsm_siblings_100',
+        'riak.node_get_fsm_siblings_95',
+        'riak.node_get_fsm_siblings_99',
+        'riak.node_get_fsm_siblings_mean',
+        'riak.node_get_fsm_siblings_median',
+        'riak.node_get_fsm_time_100',
+        'riak.node_get_fsm_time_95',
+        'riak.node_get_fsm_time_99',
+        'riak.node_get_fsm_time_mean',
+        'riak.node_get_fsm_time_median',
+        'riak.node_put_fsm_time_100',
+        'riak.node_put_fsm_time_95',
+        'riak.node_put_fsm_time_99',
+        'riak.node_put_fsm_time_mean',
+        'riak.node_put_fsm_time_median',
+    ]
+
+    # FIXME
+    # Does not appear when null and we can't really fake it
+    # These metrics do not appear in the docs
+    # http://docs.basho.com/riak/latest/ops/running/stats-and-monitoring/
+    CHECK_NOT_TESTED = [
+        'riak.node_get_fsm_out_rate',
+        'riak.node_get_fsm_rejected_60s',
+    ]
+
+    SERVICE_CHECK_NAME = 'riak.can_connect'
+
+    def __init__(self, *args, **kwargs):
+        AgentCheckTest.__init__(self, *args, **kwargs)
+
+    def test_riak(self):
+        """Every expected gauge and an OK service check carry the instance tags."""
+        config_dev1 = {
+            "instances": [{
+                "url": "http://localhost:10018/stats",
+                "tags": ["my_tag"]
+            }]
+        }
+        self.run_check(config_dev1)
+        tags = ['my_tag']
+        sc_tags = tags + ['url:' + config_dev1['instances'][0]['url']]
+
+        for gauge in self.CHECK_GAUGES + self.CHECK_GAUGES_STATS:
+            self.assertMetric(gauge, count=1, tags=tags)
+
+        self.assertServiceCheckOK(self.SERVICE_CHECK_NAME,
+                                  tags=sc_tags,
+                                  count=1)
+        self.coverage_report()
+
+    def test_bad_config(self):
+        """An unreachable URL raises socket.error and reports CRITICAL."""
+        self.assertRaises(
+            socket.error,
+            lambda: self.run_check({"instances": [{"url": "http://localhost:5985"}]})
+        )
+        sc_tags = ['url:http://localhost:5985']
+
+        self.assertServiceCheckCritical(self.SERVICE_CHECK_NAME,
+                                        tags=sc_tags,
+                                        count=1)
+        self.coverage_report()