diff --git a/checks.d/mesos_master.py b/checks.d/mesos_master.py
new file mode 100644
index 0000000000..d552817b4d
--- /dev/null
+++ b/checks.d/mesos_master.py
@@ -0,0 +1,269 @@
+"""Mesos Master check
+
+Collects metrics from mesos master node, only the leader is sending metrics.
+"""
+# stdlib
+from hashlib import md5
+import time
+
+# 3rd party
+import requests
+
+# project
+from checks import AgentCheck, CheckException
+
+
+class MesosMaster(AgentCheck):
+    """Agent check submitting the metrics of a Mesos master.
+
+    System metrics are submitted by every master; framework, role and
+    cluster-wide metrics are only submitted by the elected leader.
+    """
+    GAUGE = AgentCheck.gauge
+    MONOTONIC_COUNT = AgentCheck.monotonic_count
+    SERVICE_CHECK_NAME = "mesos_master.can_connect"
+    # True until the connectivity service check was sent for the current run
+    service_check_needed = True
+
+    FRAMEWORK_METRICS = {
+        'cpus': ('mesos.framework.cpu', GAUGE),
+        'mem': ('mesos.framework.mem', GAUGE),
+        'disk': ('mesos.framework.disk', GAUGE),
+    }
+
+    ROLE_RESOURCES_METRICS = {
+        'cpus': ('mesos.role.cpu', GAUGE),
+        'mem': ('mesos.role.mem', GAUGE),
+        'disk': ('mesos.role.disk', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    CLUSTER_TASKS_METRICS = {
+        'master/tasks_error': ('mesos.cluster.tasks_error', GAUGE),
+        'master/tasks_failed': ('mesos.cluster.tasks_failed', MONOTONIC_COUNT),
+        'master/tasks_finished': ('mesos.cluster.tasks_finished', MONOTONIC_COUNT),
+        'master/tasks_killed': ('mesos.cluster.tasks_killed', MONOTONIC_COUNT),
+        'master/tasks_lost': ('mesos.cluster.tasks_lost', MONOTONIC_COUNT),
+        'master/tasks_running': ('mesos.cluster.tasks_running', GAUGE),
+        'master/tasks_staging': ('mesos.cluster.tasks_staging', GAUGE),
+        'master/tasks_starting': ('mesos.cluster.tasks_starting', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    CLUSTER_SLAVES_METRICS = {
+        'master/slave_registrations': ('mesos.cluster.slave_registrations', GAUGE),
+        'master/slave_removals': ('mesos.cluster.slave_removals', GAUGE),
+        'master/slave_reregistrations': ('mesos.cluster.slave_reregistrations', GAUGE),
+        'master/slave_shutdowns_canceled': ('mesos.cluster.slave_shutdowns_canceled', GAUGE),
+        'master/slave_shutdowns_scheduled': ('mesos.cluster.slave_shutdowns_scheduled', GAUGE),
+        'master/slaves_active': ('mesos.cluster.slaves_active', GAUGE),
+        'master/slaves_connected': ('mesos.cluster.slaves_connected', GAUGE),
+        'master/slaves_disconnected': ('mesos.cluster.slaves_disconnected', GAUGE),
+        'master/slaves_inactive': ('mesos.cluster.slaves_inactive', GAUGE),
+        'master/recovery_slave_removals': ('mesos.cluster.recovery_slave_removals', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    CLUSTER_RESOURCES_METRICS = {
+        'master/cpus_percent': ('mesos.cluster.cpus_percent', GAUGE),
+        'master/cpus_total': ('mesos.cluster.cpus_total', GAUGE),
+        'master/cpus_used': ('mesos.cluster.cpus_used', GAUGE),
+        'master/disk_percent': ('mesos.cluster.disk_percent', GAUGE),
+        'master/disk_total': ('mesos.cluster.disk_total', GAUGE),
+        'master/disk_used': ('mesos.cluster.disk_used', GAUGE),
+        'master/mem_percent': ('mesos.cluster.mem_percent', GAUGE),
+        'master/mem_total': ('mesos.cluster.mem_total', GAUGE),
+        'master/mem_used': ('mesos.cluster.mem_used', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    CLUSTER_REGISTRAR_METRICS = {
+        'registrar/queued_operations': ('mesos.registrar.queued_operations', GAUGE),
+        'registrar/registry_size_bytes': ('mesos.registrar.registry_size_bytes', GAUGE),
+        'registrar/state_fetch_ms': ('mesos.registrar.state_fetch_ms', GAUGE),
+        'registrar/state_store_ms': ('mesos.registrar.state_store_ms', GAUGE),
+        'registrar/state_store_ms/count': ('mesos.registrar.state_store_ms.count', GAUGE),
+        'registrar/state_store_ms/max': ('mesos.registrar.state_store_ms.max', GAUGE),
+        'registrar/state_store_ms/min': ('mesos.registrar.state_store_ms.min', GAUGE),
+        'registrar/state_store_ms/p50': ('mesos.registrar.state_store_ms.p50', GAUGE),
+        'registrar/state_store_ms/p90': ('mesos.registrar.state_store_ms.p90', GAUGE),
+        'registrar/state_store_ms/p95': ('mesos.registrar.state_store_ms.p95', GAUGE),
+        'registrar/state_store_ms/p99': ('mesos.registrar.state_store_ms.p99', GAUGE),
+        'registrar/state_store_ms/p999': ('mesos.registrar.state_store_ms.p999', GAUGE),
+        'registrar/state_store_ms/p9999': ('mesos.registrar.state_store_ms.p9999', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    CLUSTER_FRAMEWORK_METRICS = {
+        'master/frameworks_active': ('mesos.cluster.frameworks_active', GAUGE),
+        'master/frameworks_connected': ('mesos.cluster.frameworks_connected', GAUGE),
+        'master/frameworks_disconnected': ('mesos.cluster.frameworks_disconnected', GAUGE),
+        'master/frameworks_inactive': ('mesos.cluster.frameworks_inactive', GAUGE),
+    }
+
+    # These metrics are aggregated on all nodes in the cluster
+    SYSTEM_METRICS = {
+        'system/cpus_total': ('mesos.stats.system.cpus_total', GAUGE),
+        'system/load_15min': ('mesos.stats.system.load_15min', GAUGE),
+        'system/load_1min': ('mesos.stats.system.load_1min', GAUGE),
+        'system/load_5min': ('mesos.stats.system.load_5min', GAUGE),
+        'system/mem_free_bytes': ('mesos.stats.system.mem_free_bytes', GAUGE),
+        'system/mem_total_bytes': ('mesos.stats.system.mem_total_bytes', GAUGE),
+        'master/elected': ('mesos.stats.elected', GAUGE),
+        'master/uptime_secs': ('mesos.stats.uptime_secs', GAUGE),
+    }
+
+    # These metrics are aggregated only on the elected master
+    STATS_METRICS = {
+        'master/dropped_messages': ('mesos.cluster.dropped_messages', GAUGE),
+        'master/outstanding_offers': ('mesos.cluster.outstanding_offers', GAUGE),
+        'master/event_queue_dispatches': ('mesos.cluster.event_queue_dispatches', GAUGE),
+        'master/event_queue_http_requests': ('mesos.cluster.event_queue_http_requests', GAUGE),
+        'master/event_queue_messages': ('mesos.cluster.event_queue_messages', GAUGE),
+        'master/invalid_framework_to_executor_messages': ('mesos.cluster.invalid_framework_to_executor_messages', GAUGE),
+        'master/invalid_status_update_acknowledgements': ('mesos.cluster.invalid_status_update_acknowledgements', GAUGE),
+        'master/invalid_status_updates': ('mesos.cluster.invalid_status_updates', GAUGE),
+        'master/valid_framework_to_executor_messages': ('mesos.cluster.valid_framework_to_executor_messages', GAUGE),
+        'master/valid_status_update_acknowledgements': ('mesos.cluster.valid_status_update_acknowledgements', GAUGE),
+        'master/valid_status_updates': ('mesos.cluster.valid_status_updates', GAUGE),
+    }
+
+    def __init__(self, name, init_config, agentConfig, instances=None):
+        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
+        # Both are refreshed from state.json at the start of every run
+        self.version = None
+        self.leader = False
+
+    def _get_json(self, url, timeout):
+        """Fetch `url` and return its decoded JSON body.
+
+        The outcome is reported through the `mesos_master.can_connect` service
+        check: once per run on success, and every time a request fails.
+
+        Raises CheckException when the request fails, times out, answers with
+        a non-200 status or with a body that is not valid JSON.
+        """
+        tags = ["url:%s" % url]
+        payload = None
+        try:
+            r = requests.get(url, timeout=timeout)
+            if r.status_code != 200:
+                status = AgentCheck.CRITICAL
+                msg = "Got %s when hitting %s" % (r.status_code, url)
+            else:
+                # Decode here so that a malformed body is reported as a failure
+                payload = r.json()
+                status = AgentCheck.OK
+                msg = "Mesos master instance detected at %s " % url
+        except requests.exceptions.Timeout:
+            msg = "%s seconds timeout when hitting %s" % (timeout, url)
+            status = AgentCheck.CRITICAL
+        except Exception as e:
+            msg = str(e)
+            status = AgentCheck.CRITICAL
+
+        # A failure is always reported, even when an OK was already sent for
+        # another endpoint during the same run.
+        if self.service_check_needed or status == AgentCheck.CRITICAL:
+            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags,
+                               message=msg)
+            self.service_check_needed = False
+        if status == AgentCheck.CRITICAL:
+            raise CheckException("Cannot connect to mesos, please check your configuration.")
+
+        return payload
+
+    def _get_master_state(self, url, timeout):
+        """Return the decoded content of the master's /state.json."""
+        return self._get_json(url + '/state.json', timeout)
+
+    def _get_master_stats(self, url, timeout):
+        """Return the decoded stats of the master."""
+        # /metrics/snapshot is queried for versions 0.22.0 and above
+        if self.version is not None and self.version >= [0, 22, 0]:
+            endpoint = '/metrics/snapshot'
+        else:
+            endpoint = '/stats.json'
+        return self._get_json(url + endpoint, timeout)
+
+    def _get_master_roles(self, url, timeout):
+        """Return the decoded content of the master's /roles.json."""
+        return self._get_json(url + '/roles.json', timeout)
+
+    def _check_leadership(self, url, timeout):
+        """Fetch the master state and refresh `self.version` and `self.leader`.
+
+        Returns the decoded state so that the caller can reuse it.
+        """
+        state_metrics = self._get_master_state(url, timeout)
+
+        if state_metrics is not None:
+            # Kept as a list of ints so that versions compare element by element
+            self.version = [int(part) for part in state_metrics['version'].split('.')]
+            # This master leads when its own pid is the advertised leader
+            self.leader = state_metrics['leader'] == state_metrics['pid']
+        return state_metrics
+
+    def check(self, instance):
+        """Collect and submit the metrics of one configured master.
+
+        Raises an Exception when the instance has no "url", and a
+        CheckException when one of the master endpoints cannot be fetched.
+        """
+        if 'url' not in instance:
+            raise Exception('Mesos instance missing "url" value.')
+
+        url = instance['url']
+        instance_tags = instance.get('tags', [])
+        default_timeout = self.init_config.get('default_timeout', 5)
+        timeout = float(instance.get('timeout', default_timeout))
+
+        try:
+            self._collect_metrics(url, timeout, instance_tags)
+        finally:
+            # Reset even after a failure, otherwise the next successful run
+            # would not send the OK service check.
+            self.service_check_needed = True
+
+    def _collect_metrics(self, url, timeout, instance_tags):
+        """Fetch the master endpoints and submit the metrics they expose."""
+        state_metrics = self._check_leadership(url, timeout)
+        # Only the instance tags are available when no state could be read
+        tags = list(instance_tags)
+        if state_metrics:
+            tags = [
+                'mesos_cluster:{0}'.format(state_metrics['cluster']),
+                'mesos_pid:{0}'.format(state_metrics['pid']),
+                'mesos_node:master'
+            ]
+            tags += instance_tags
+
+            if self.leader:
+                self.GAUGE('mesos.cluster.total_frameworks', len(state_metrics['frameworks']), tags=tags)
+
+                for framework in state_metrics['frameworks']:
+                    framework_tags = ['framework_name:' + framework['name']] + tags
+                    self.GAUGE('mesos.framework.total_tasks', len(framework['tasks']), tags=framework_tags)
+                    resources = framework['used_resources']
+                    for key_name, (metric_name, metric_func) in self.FRAMEWORK_METRICS.iteritems():
+                        metric_func(self, metric_name, resources[key_name], tags=framework_tags)
+
+                role_metrics = self._get_master_roles(url, timeout)
+                if role_metrics is not None:
+                    for role in role_metrics['roles']:
+                        role_tags = ['mesos_role:' + role['name']] + tags
+                        self.GAUGE('mesos.role.frameworks.count', len(role['frameworks']), tags=role_tags)
+                        self.GAUGE('mesos.role.weight', role['weight'], tags=role_tags)
+                        for key_name, (metric_name, metric_func) in self.ROLE_RESOURCES_METRICS.iteritems():
+                            metric_func(self, metric_name, role['resources'][key_name], tags=role_tags)
+
+        stats_metrics = self._get_master_stats(url, timeout)
+        if stats_metrics is not None:
+            metrics = [self.SYSTEM_METRICS]
+            if self.leader:
+                metrics += [self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS,
+                            self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS,
+                            self.CLUSTER_FRAMEWORK_METRICS, self.STATS_METRICS]
+            for m in metrics:
+                for key_name, (metric_name, metric_func) in m.iteritems():
+                    metric_func(self, metric_name, stats_metrics[key_name], tags=tags)
diff --git a/checks.d/mesos_slave.py b/checks.d/mesos_slave.py
new file mode 100644
index 0000000000..78b25507fd
--- /dev/null
+++ b/checks.d/mesos_slave.py
@@ -0,0 +1,230 @@
+"""Mesos Slave check
+
+Collects metrics from mesos slave node.
+"""
+# stdlib
+from hashlib import md5
+import time
+
+# 3rd party
+import requests
+
+# project
+from checks import AgentCheck, CheckException
+
+
+class MesosSlave(AgentCheck):
+    """Agent check submitting the metrics of a Mesos slave.
+
+    It also reports the state and the resources of the tasks whose name
+    contains one of the strings listed in the instance `tasks` option.
+    """
+    GAUGE = AgentCheck.gauge
+    MONOTONIC_COUNT = AgentCheck.monotonic_count
+    SERVICE_CHECK_NAME = "mesos_slave.can_connect"
+    # Port the master is queried on when the instance sets no `master_port`
+    DEFAULT_MASTER_PORT = 5050
+    # True until the connectivity service check was sent for the current run
+    service_check_needed = True
+
+    TASK_STATUS = {
+        'TASK_STARTING': AgentCheck.OK,
+        'TASK_RUNNING': AgentCheck.OK,
+        'TASK_FINISHED': AgentCheck.OK,
+        'TASK_FAILED': AgentCheck.CRITICAL,
+        'TASK_KILLED': AgentCheck.WARNING,
+        'TASK_LOST': AgentCheck.CRITICAL,
+        'TASK_STAGING': AgentCheck.OK,
+        'TASK_ERROR': AgentCheck.CRITICAL,
+    }
+
+    TASK_METRICS = {
+        'cpus': ('mesos.state.task.cpu', GAUGE),
+        'mem': ('mesos.state.task.mem', GAUGE),
+        'disk': ('mesos.state.task.disk', GAUGE),
+    }
+
+    SLAVE_TASKS_METRICS = {
+        'slave/tasks_failed': ('mesos.slave.tasks_failed', MONOTONIC_COUNT),
+        'slave/tasks_finished': ('mesos.slave.tasks_finished', MONOTONIC_COUNT),
+        'slave/tasks_killed': ('mesos.slave.tasks_killed', MONOTONIC_COUNT),
+        'slave/tasks_lost': ('mesos.slave.tasks_lost', MONOTONIC_COUNT),
+        'slave/tasks_running': ('mesos.slave.tasks_running', GAUGE),
+        'slave/tasks_staging': ('mesos.slave.tasks_staging', GAUGE),
+        'slave/tasks_starting': ('mesos.slave.tasks_starting', GAUGE),
+    }
+
+    SYSTEM_METRICS = {
+        'system/cpus_total': ('mesos.stats.system.cpus_total', GAUGE),
+        'system/load_15min': ('mesos.stats.system.load_15min', GAUGE),
+        'system/load_1min': ('mesos.stats.system.load_1min', GAUGE),
+        'system/load_5min': ('mesos.stats.system.load_5min', GAUGE),
+        'system/mem_free_bytes': ('mesos.stats.system.mem_free_bytes', GAUGE),
+        'system/mem_total_bytes': ('mesos.stats.system.mem_total_bytes', GAUGE),
+        'slave/registered': ('mesos.stats.registered', GAUGE),
+        'slave/uptime_secs': ('mesos.stats.uptime_secs', GAUGE),
+    }
+
+    SLAVE_RESOURCE_METRICS = {
+        'slave/cpus_percent': ('mesos.slave.cpus_percent', GAUGE),
+        'slave/cpus_total': ('mesos.slave.cpus_total', GAUGE),
+        'slave/cpus_used': ('mesos.slave.cpus_used', GAUGE),
+        'slave/disk_percent': ('mesos.slave.disk_percent', GAUGE),
+        'slave/disk_total': ('mesos.slave.disk_total', GAUGE),
+        'slave/disk_used': ('mesos.slave.disk_used', GAUGE),
+        'slave/mem_percent': ('mesos.slave.mem_percent', GAUGE),
+        'slave/mem_total': ('mesos.slave.mem_total', GAUGE),
+        'slave/mem_used': ('mesos.slave.mem_used', GAUGE),
+    }
+
+    SLAVE_EXECUTORS_METRICS = {
+        'slave/executors_registering': ('mesos.slave.executors_registering', GAUGE),
+        'slave/executors_running': ('mesos.slave.executors_running', GAUGE),
+        'slave/executors_terminated': ('mesos.slave.executors_terminated', GAUGE),
+        'slave/executors_terminating': ('mesos.slave.executors_terminating', GAUGE),
+    }
+
+    STATS_METRICS = {
+        'slave/frameworks_active': ('mesos.slave.frameworks_active', GAUGE),
+        'slave/invalid_framework_messages': ('mesos.slave.invalid_framework_messages', GAUGE),
+        'slave/invalid_status_updates': ('mesos.slave.invalid_status_updates', GAUGE),
+        'slave/recovery_errors': ('mesos.slave.recovery_errors', GAUGE),
+        'slave/valid_framework_messages': ('mesos.slave.valid_framework_messages', GAUGE),
+        'slave/valid_status_updates': ('mesos.slave.valid_status_updates', GAUGE),
+    }
+
+    def __init__(self, name, init_config, agentConfig, instances=None):
+        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
+        # Name of the cluster, read from the master and then cached
+        self.cluster_name = None
+        # Version of the slave as a list of ints, unknown until the state is read
+        self.version = None
+
+    def _get_json(self, url, timeout):
+        """Fetch `url` and return its decoded JSON body.
+
+        The outcome is reported through the `mesos_slave.can_connect` service
+        check: once per run on success, and every time a request fails.
+
+        Raises CheckException when the request fails, times out, answers with
+        a non-200 status or with a body that is not valid JSON.
+        """
+        tags = ["url:%s" % url]
+        payload = None
+        try:
+            r = requests.get(url, timeout=timeout)
+            if r.status_code != 200:
+                status = AgentCheck.CRITICAL
+                msg = "Got %s when hitting %s" % (r.status_code, url)
+            else:
+                # Decode here so that a malformed body is reported as a failure
+                payload = r.json()
+                status = AgentCheck.OK
+                msg = "Mesos slave instance detected at %s " % url
+        except requests.exceptions.Timeout:
+            msg = "%s seconds timeout when hitting %s" % (timeout, url)
+            status = AgentCheck.CRITICAL
+        except Exception as e:
+            msg = str(e)
+            status = AgentCheck.CRITICAL
+
+        # A failure is always reported, even when an OK was already sent for
+        # another endpoint during the same run.
+        if self.service_check_needed or status == AgentCheck.CRITICAL:
+            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
+            self.service_check_needed = False
+        if status == AgentCheck.CRITICAL:
+            raise CheckException("Cannot connect to mesos, please check your configuration.")
+
+        return payload
+
+    def _get_state(self, url, timeout):
+        """Return the decoded content of /state.json served at `url`."""
+        return self._get_json(url + '/state.json', timeout)
+
+    def _get_stats(self, url, timeout):
+        """Return the decoded stats of the slave."""
+        # /metrics/snapshot is queried for versions 0.22.0 and above
+        if self.version is not None and self.version >= [0, 22, 0]:
+            endpoint = '/metrics/snapshot'
+        else:
+            endpoint = '/stats.json'
+        return self._get_json(url + endpoint, timeout)
+
+    def _get_constant_attributes(self, url, timeout, master_port=DEFAULT_MASTER_PORT):
+        """Read the slave version and the cluster name until both are known.
+
+        The cluster name is read from the state of the master the slave is
+        registered to, queried on `master_port`. Nothing is fetched once the
+        cluster name is cached.
+
+        Returns the slave state when it had to be fetched, None otherwise.
+        """
+        state_metrics = None
+        if self.cluster_name is None:
+            state_metrics = self._get_state(url, timeout)
+            if state_metrics is not None:
+                self.version = [int(part) for part in state_metrics['version'].split('.')]
+                master_url = 'http://{0}:{1}'.format(state_metrics['master_hostname'], master_port)
+                master_state = self._get_state(master_url, timeout)
+                if master_state is not None:
+                    self.cluster_name = master_state['cluster']
+
+        return state_metrics
+
+    def check(self, instance):
+        """Collect and submit the metrics of one configured slave.
+
+        Raises an Exception when the instance has no "url", and a
+        CheckException when one of the endpoints cannot be fetched.
+        """
+        if 'url' not in instance:
+            raise Exception('Mesos instance missing "url" value.')
+
+        url = instance['url']
+        instance_tags = instance.get('tags', [])
+        tasks = instance.get('tasks', [])
+        master_port = instance.get('master_port', self.DEFAULT_MASTER_PORT)
+        default_timeout = self.init_config.get('default_timeout', 5)
+        timeout = float(instance.get('timeout', default_timeout))
+
+        try:
+            self._collect_metrics(url, timeout, instance_tags, tasks, master_port)
+        finally:
+            # Reset even after a failure, otherwise the next successful run
+            # would not send the OK service check.
+            self.service_check_needed = True
+
+    def _collect_metrics(self, url, timeout, instance_tags, tasks, master_port):
+        """Fetch the slave endpoints and submit the metrics they expose."""
+        state_metrics = self._get_constant_attributes(url, timeout, master_port)
+        tags = None
+
+        if state_metrics is None:
+            state_metrics = self._get_state(url, timeout)
+        if state_metrics:
+            tags = [
+                'mesos_cluster:{0}'.format(self.cluster_name),
+                'mesos_pid:{0}'.format(state_metrics['pid']),
+                'mesos_node:slave'
+            ]
+            tags += instance_tags
+
+            for task in tasks:
+                for framework in state_metrics['frameworks']:
+                    for executor in framework['executors']:
+                        for t in executor['tasks']:
+                            if task.lower() in t['name'].lower() and t['slave_id'] == state_metrics['id']:
+                                task_tags = ['task_name:' + t['name']] + tags
+                                self.service_check(t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags)
+                                for key_name, (metric_name, metric_func) in self.TASK_METRICS.iteritems():
+                                    metric_func(self, metric_name, t['resources'][key_name], tags=task_tags)
+
+        stats_metrics = self._get_stats(url, timeout)
+        if stats_metrics:
+            tags = tags if tags else instance_tags
+            metrics = [self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS,
+                       self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS]
+            for m in metrics:
+                for key_name, (metric_name, metric_func) in m.iteritems():
+                    metric_func(self, metric_name, stats_metrics[key_name], tags=tags)
diff --git a/conf.d/mesos_master.yaml.example b/conf.d/mesos_master.yaml.example
new file mode 100644
index 0000000000..46b68d94cc
--- /dev/null
+++ b/conf.d/mesos_master.yaml.example
@@ -0,0 +1,5 @@
+init_config:
+  default_timeout: 10
+
+instances:
+  - url: "http://localhost:5050"
diff --git a/conf.d/mesos_slave.yaml.example b/conf.d/mesos_slave.yaml.example
new file mode 100644
index 0000000000..2d154c4997
--- /dev/null
+++ b/conf.d/mesos_slave.yaml.example
@@ -0,0 +1,8 @@
+init_config:
+  default_timeout: 10
+
+instances:
+  - url: "http://localhost:5051"
+    # master_port: 5050
+    # tasks:
+    #   - "hello"
diff --git
a/requirements.txt b/requirements.txt index 88afeff46c..af5775f485 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ########################################################### # These modules are the deps needed by the -# agent core, meaning every module that is +# agent core, meaning every module that is # not a check # They're installed in the CI and when doing # a source install diff --git a/tests/checks/fixtures/mesos_master/roles.json b/tests/checks/fixtures/mesos_master/roles.json new file mode 100644 index 0000000000..e54ea83873 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/roles.json @@ -0,0 +1,17 @@ +{ + "roles": [ + { + "weight": 1, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "name": "*", + "frameworks": [ + "20150403-140128-251789322-5050-6047-0000" + ] + } + ] +} diff --git a/tests/checks/fixtures/mesos_master/state.json b/tests/checks/fixtures/mesos_master/state.json new file mode 100644 index 0000000000..fbb58294f6 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/state.json @@ -0,0 +1,132 @@ +{ + "version": "0.22.0", + "unregistered_frameworks": [], + "started_tasks": 0, + "start_time": 1428951954.34111, + "staged_tasks": 0, + "slaves": [ + { + "resources": { + "ports": "[31000-32000]", + "mem": 244, + "disk": 35164, + "cpus": 1 + }, + "reregistered_time": 1428951983.53731, + "registered_time": 1428951983.53725, + "pid": "slave(1)@127.0.0.1:5051", + "id": "20150410-134224-16777343-5050-1778-S0", + "hostname": "localhost", + "attributes": {}, + "active": true + } + ], + "pid": "master@127.0.0.1:5050", + "orphan_tasks": [], + "lost_tasks": 0, + "log_dir": "/var/log/mesos", + "leader": "master@127.0.0.1:5050", + "killed_tasks": 0, + "elected_time": 1428951954.3774, + "deactivated_slaves": 0, + "completed_frameworks": [], + "cluster": "datadog-test", + "build_user": "root", + "build_time": 1427376927, + "build_date": "2015-03-26 13:35:27", + "activated_slaves": 1, +
"failed_tasks": 0, + "finished_tasks": 0, + "flags": { + "zk_session_timeout": "10secs", + "zk": "zk://localhost:2181/mesos", + "work_dir": "/var/lib/mesos", + "webui_dir": "/usr/share/mesos/webui", + "version": "false", + "user_sorter": "drf", + "slave_reregister_timeout": "10mins", + "root_submissions": "true", + "registry_strict": "false", + "registry_store_timeout": "5secs", + "registry_fetch_timeout": "1mins", + "registry": "replicated_log", + "initialize_driver_logging": "true", + "help": "false", + "framework_sorter": "drf", + "cluster": "datadog-test", + "authenticators": "crammd5", + "authenticate_slaves": "false", + "authenticate": "false", + "allocation_interval": "1secs", + "log_auto_initialize": "true", + "log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "port": "5050", + "quiet": "false", + "quorum": "1", + "recovery_slave_removal_limit": "100%" + }, + "frameworks": [ + { + "webui_url": "http://192.168.33.20:8080", + "user": "root", + "offered_resources": { + "mem": 0, + "disk": 0, + "cpus": 0 + }, + "name": "marathon", + "id": "20150403-140128-251789322-5050-6047-0000", + "hostname": "vagrant-ubuntu-trusty-64", + "failover_timeout": 604800, + "completed_tasks": [], + "checkpoint": true, + "active": true, + "offers": [], + "registered_time": 1428951955.38871, + "reregistered_time": 1428951955.38872, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "role": "*", + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "unregistered_time": 0, + "used_resources": { + "ports": 
"[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + } + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150413-190554-16777343-5050-16324" +} diff --git a/tests/checks/fixtures/mesos_master/stats.json b/tests/checks/fixtures/mesos_master/stats.json new file mode 100644 index 0000000000..7e449bcc72 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/stats.json @@ -0,0 +1,108 @@ +{ + "valid_status_updates": 0, + "uptime": 706.524240128, + "total_schedulers": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 13815808, + "system/load_5min": 0.02, + "system/load_1min": 0, + "system/load_15min": 0.07, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 0, + "registrar/state_store_ms/p9999": 9.90120192, + "registrar/state_store_ms/p999": 9.8956032, + "registrar/state_store_ms/p99": 9.839616, + "registrar/state_store_ms/p95": 9.590784, + "registrar/state_store_ms/p90": 9.279744, + "registrar/state_store_ms/p50": 6.791424, + "registrar/state_store_ms/min": 3.681024, + "registrar/state_store_ms/max": 9.901824, + "registrar/state_store_ms/count": 2, + "registrar/state_store_ms": 9.901824, + "registrar/state_fetch_ms": 3.717888, + "registrar/registry_size_bytes": 246, + "registrar/queued_operations": 0, + "outstanding_offers": 0, + "mem_used": 100, + "mem_total": 244, + "mem_percent": 0.409836065573771, + "master/valid_status_updates": 0, + "master/valid_status_update_acknowledgements": 0, + "master/valid_framework_to_executor_messages": 0, + "master/uptime_secs": 706.52485632, + "master/tasks_starting": 0, + "master/tasks_staging": 0, + "master/tasks_running": 1, + "master/tasks_lost": 0, + "master/tasks_killed": 0, + "master/tasks_finished": 0, + "master/tasks_failed": 0, + "master/tasks_error": 0, + "master/slaves_inactive": 0, + "master/slaves_disconnected": 0, + "master/invalid_framework_to_executor_messages": 0, + "master/frameworks_inactive": 0, + 
"master/frameworks_disconnected": 0, + "master/frameworks_connected": 1, + "master/frameworks_active": 1, + "master/event_queue_messages": 0, + "master/event_queue_http_requests": 0, + "master/event_queue_dispatches": 17, + "master/elected": 1, + "master/dropped_messages": 1, + "master/disk_used": 0, + "master/disk_total": 35164, + "master/disk_percent": 0, + "master/cpus_used": 1, + "master/cpus_total": 1, + "master/cpus_percent": 1, + "disk_percent": 0, + "deactivated_slaves": 0, + "cpus_used": 1, + "cpus_total": 1, + "cpus_percent": 1, + "active_tasks_gauge": 1, + "active_schedulers": 1, + "activated_slaves": 1, + "disk_total": 35164, + "disk_used": 0, + "elected": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "lost_tasks": 0, + "master/invalid_status_update_acknowledgements": 0, + "master/invalid_status_updates": 0, + "master/mem_percent": 0.409836065573771, + "master/mem_total": 244, + "master/mem_used": 100, + "master/messages_authenticate": 0, + "master/messages_deactivate_framework": 0, + "master/messages_decline_offers": 123, + "master/messages_exited_executor": 0, + "master/messages_framework_to_executor": 0, + "master/messages_kill_task": 0, + "master/messages_launch_tasks": 0, + "master/messages_reconcile_tasks": 6, + "master/messages_register_framework": 0, + "master/messages_register_slave": 0, + "master/messages_reregister_framework": 1, + "master/messages_reregister_slave": 2, + "master/messages_resource_request": 0, + "master/messages_revive_offers": 0, + "master/messages_status_update": 0, + "master/messages_status_update_acknowledgement": 0, + "master/messages_unregister_framework": 0, + "master/messages_unregister_slave": 0, + "master/outstanding_offers": 0, + "master/recovery_slave_removals": 0, + "master/slave_registrations": 0, + "master/slave_removals": 0, + "master/slave_reregistrations": 1, + "master/slave_shutdowns_canceled": 0, + "master/slave_shutdowns_scheduled": 0, + 
"master/slaves_active": 1, + "master/slaves_connected": 1 +} diff --git a/tests/checks/fixtures/mesos_slave/state.json b/tests/checks/fixtures/mesos_slave/state.json new file mode 100644 index 0000000000..4ea97fe8b4 --- /dev/null +++ b/tests/checks/fixtures/mesos_slave/state.json @@ -0,0 +1,123 @@ +{ + "version": "0.22.0", + "started_tasks": 0, + "start_time": 1428673344.06054, + "staged_tasks": 1, + "cluster": "test", + "resources": { + "ports": "[31000-32000]", + "mem": 244, + "disk": 35164, + "cpus": 1 + }, + "pid": "slave(1)@127.0.0.1:5051", + "master_hostname": "localhost", + "flags": { + "work_dir": "/tmp/mesos", + "version": "false", + "switch_user": "true", + "strict": "true", + "resource_monitoring_interval": "1secs", + "registration_backoff_factor": "1secs", + "recovery_timeout": "15mins", + "recover": "reconnect", + "executor_shutdown_grace_period": "5secs", + "executor_registration_timeout": "1mins", + "enforce_container_disk_quota": "false", + "docker_stop_timeout": "0ns", + "docker_sandbox_directory": "/mnt/mesos/sandbox", + "docker_remove_delay": "6hrs", + "docker": "docker", + "disk_watch_interval": "1mins", + "authenticatee": "crammd5", + "cgroups_enable_cfs": "false", + "cgroups_hierarchy": "/sys/fs/cgroup", + "cgroups_limit_swap": "false", + "cgroups_root": "mesos", + "container_disk_watch_interval": "15secs", + "containerizers": "mesos", + "default_role": "*", + "frameworks_home": "", + "gc_delay": "1weeks", + "gc_disk_headroom": "0.1", + "hadoop_home": "", + "help": "false", + "initialize_driver_logging": "true", + "isolation": "posix/cpu,posix/mem", + "launcher_dir": "/usr/libexec/mesos", + "log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "master": "zk://localhost:2181/mesos", + "perf_duration": "10secs", + "perf_interval": "1mins", + "port": "5051", + "quiet": "false" + }, + "finished_tasks": 0, + "failed_tasks": 0, + "completed_frameworks": [], + "build_user": "root", + "build_time": 1427376927, + "build_date": 
"2015-03-26 13:35:27", + "attributes": {}, + "frameworks": [ + { + "user": "root", + "checkpoint": true, + "completed_executors": [], + "executors": [ + { + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "completed_tasks": [], + "container": "f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "directory": "/tmp/mesos/slaves/20150410-134224-16777343-5050-1778-S0/frameworks/20150403-140128-251789322-5050-6047-0000/executors/hello.dc130e23-df88-11e4-b9ec-080027fc1312/runs/f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "name": "Command Executor (Task: hello.dc130e23-df88-11e4-b9ec-080027fc1312) (Command: sh -c 'cd hello && ...')", + "queued_tasks": [], + "resources": { + "ports": "[31915-31915]", + "mem": 132, + "disk": 0, + "cpus": 1.1 + }, + "source": "hello.dc130e23-df88-11e4-b9ec-080027fc1312" + } + ], + "failover_timeout": 604800, + "hostname": "vagrant-ubuntu-trusty-64", + "id": "20150403-140128-251789322-5050-6047-0000", + "name": "marathon", + "role": "*" + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150410-134224-16777343-5050-1778-S0", + "killed_tasks": 0, + "log_dir": "/var/log/mesos", + "lost_tasks": 0 +} diff --git a/tests/checks/fixtures/mesos_slave/stats.json b/tests/checks/fixtures/mesos_slave/stats.json new file mode 100644 index 0000000000..62fa0c5564 --- /dev/null +++ b/tests/checks/fixtures/mesos_slave/stats.json @@ -0,0 +1,50 @@ +{ + "valid_status_updates": 1, + "uptime": 280965.77977984, + "total_frameworks": 1, +
"system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 34271232, + "system/load_5min": 0.08, + "system/load_1min": 0.1, + "system/load_15min": 0.06, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 1, + "slave/valid_status_updates": 1, + "slave/valid_framework_messages": 0, + "slave/uptime_secs": 280965.78028288, + "slave/tasks_starting": 0, + "slave/tasks_staging": 0, + "slave/executors_registering": 0, + "slave/disk_used": 0, + "slave/disk_total": 35164, + "slave/disk_percent": 0, + "slave/cpus_used": 1.1, + "slave/cpus_total": 1, + "slave/cpus_percent": 1.1, + "registered": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "launched_tasks_gauge": 1, + "lost_tasks": 0, + "queued_tasks_gauge": 0, + "recovery_errors": 0, + "slave/executors_running": 1, + "slave/executors_terminated": 0, + "slave/executors_terminating": 0, + "slave/frameworks_active": 1, + "slave/invalid_framework_messages": 0, + "slave/invalid_status_updates": 0, + "slave/mem_percent": 0.540983606557377, + "slave/mem_total": 244, + "slave/mem_used": 132, + "slave/recovery_errors": 0, + "slave/registered": 1, + "slave/tasks_failed": 0, + "slave/tasks_finished": 0, + "slave/tasks_killed": 0, + "slave/tasks_lost": 0, + "slave/tasks_running": 1 +}
diff --git a/tests/checks/mock/test_mesos_master.py b/tests/checks/mock/test_mesos_master.py
new file mode 100644
index 0000000000..aaaeb8a4aa
--- /dev/null
+++ b/tests/checks/mock/test_mesos_master.py
@@ -0,0 +1,60 @@
+from tests.checks.common import AgentCheckTest, get_check_class, Fixtures
+
+from nose.plugins.attrib import attr
+from mock import patch
+from checks import AgentCheck
+import json
+import time
+
+
+def _mocked_get_master_state(*args, **kwargs):
+    """Replace MesosMaster._get_master_state with the state.json fixture."""
+    return json.loads(Fixtures.read_file('state.json'))
+
+
+def _mocked_get_master_stats(*args, **kwargs):
+    """Replace MesosMaster._get_master_stats with the stats.json fixture."""
+    return json.loads(Fixtures.read_file('stats.json'))
+
+
+def _mocked_get_master_roles(*args, **kwargs):
+    """Replace MesosMaster._get_master_roles with the roles.json fixture."""
+    return json.loads(Fixtures.read_file('roles.json'))
+
+
+@attr(requires='mesos_master')
+class TestMesosMaster(AgentCheckTest):
+    """Run the mesos_master check against the JSON fixtures."""
+    CHECK_NAME = 'mesos_master'
+
+    def test_checks(self):
+        """Every metric declared by the check is submitted when the master leads."""
+        config = {
+            'init_config': {},
+            'instances': [
+                {
+                    'url': 'http://localhost:5050'
+                }
+            ]
+        }
+
+        klass = get_check_class('mesos_master')
+        with patch.object(klass, '_get_master_state', _mocked_get_master_state):
+            with patch.object(klass, '_get_master_stats', _mocked_get_master_stats):
+                with patch.object(klass, '_get_master_roles', _mocked_get_master_roles):
+                    check = klass('mesos_master', {}, {})
+                    self.run_check_twice(config)
+                    metric_tables = (
+                        check.CLUSTER_TASKS_METRICS, check.CLUSTER_SLAVES_METRICS,
+                        check.CLUSTER_RESOURCES_METRICS, check.CLUSTER_REGISTRAR_METRICS,
+                        check.CLUSTER_FRAMEWORK_METRICS, check.SYSTEM_METRICS,
+                        check.STATS_METRICS, check.FRAMEWORK_METRICS,
+                        check.ROLE_RESOURCES_METRICS,
+                    )
+                    for table in metric_tables:
+                        for metric_name, _ in table.itervalues():
+                            self.assertMetric(metric_name)
+                    self.assertMetric('mesos.cluster.total_frameworks')
+                    self.assertMetric('mesos.framework.total_tasks')
+                    self.assertMetric('mesos.role.frameworks.count')
+                    self.assertMetric('mesos.role.weight')
diff --git a/tests/checks/mock/test_mesos_slave.py b/tests/checks/mock/test_mesos_slave.py
new file mode 100644
index 0000000000..2fbfe19683
--- /dev/null
+++ b/tests/checks/mock/test_mesos_slave.py
@@ -0,0 +1,50 @@
+from tests.checks.common import AgentCheckTest, get_check_class, Fixtures
+
+from nose.plugins.attrib import attr
+from mock import patch
+from checks import AgentCheck
+import json
+import time
+
+
+def _mocked_get_state(*args, **kwargs):
+    """Replace MesosSlave._get_state with the state.json fixture."""
+    return json.loads(Fixtures.read_file('state.json'))
+
+
+def _mocked_get_stats(*args, **kwargs):
+    """Replace MesosSlave._get_stats with the stats.json fixture."""
+    return json.loads(Fixtures.read_file('stats.json'))
+
+
+@attr(requires='mesos_slave')
+class TestMesosSlave(AgentCheckTest):
+    """Run the mesos_slave check against the JSON fixtures."""
+    CHECK_NAME = 'mesos_slave'
+
+    def test_checks(self):
+        """Every declared metric and the task service check are submitted."""
+        config = {
+            'init_config': {},
+            'instances': [
+                {
+                    'url': 'http://localhost:5050',
+                    'tasks': ['hello']
+                }
+            ]
+        }
+
+        klass = get_check_class('mesos_slave')
+        with patch.object(klass, '_get_state', _mocked_get_state):
+            with patch.object(klass, '_get_stats', _mocked_get_stats):
+                check = klass('mesos_slave', {}, {})
+                self.run_check_twice(config)
+                metric_tables = (
+                    check.SLAVE_TASKS_METRICS, check.SYSTEM_METRICS,
+                    check.SLAVE_RESOURCE_METRICS, check.SLAVE_EXECUTORS_METRICS,
+                    check.STATS_METRICS, check.TASK_METRICS,
+                )
+                for table in metric_tables:
+                    for metric_name, _ in table.itervalues():
+                        self.assertMetric(metric_name)
+                self.assertServiceCheck('hello.ok', count=1, status=AgentCheck.OK)