Skip to content

Commit

Permalink
[mesos] Change some metrics type and update test suite
Browse files Browse the repository at this point in the history
  • Loading branch information
DorianZaccaria committed May 29, 2015
1 parent e033b23 commit d036028
Show file tree
Hide file tree
Showing 10 changed files with 426 additions and 427 deletions.
24 changes: 12 additions & 12 deletions checks.d/mesos_master.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@

class MesosMaster(AgentCheck):
GAUGE = AgentCheck.gauge
RATE = AgentCheck.rate
MONOTONIC_COUNT = AgentCheck.monotonic_count
SERVICE_CHECK_NAME = "mesos_master.can_connect"
SERVICE_CHECK_NEEDED = True
service_check_needed = True


FRAMEWORK_METRICS = {
Expand All @@ -35,10 +35,10 @@ class MesosMaster(AgentCheck):
# These metrics are aggregated only on the elected master
CLUSTER_TASKS_METRICS = {
'master/tasks_error' : ('mesos.cluster.tasks_error', GAUGE),
'master/tasks_failed' : ('mesos.cluster.tasks_failed', GAUGE),
'master/tasks_finished' : ('mesos.cluster.tasks_finished', GAUGE),
'master/tasks_killed' : ('mesos.cluster.tasks_killed', GAUGE),
'master/tasks_lost' : ('mesos.cluster.tasks_lost', GAUGE),
'master/tasks_failed' : ('mesos.cluster.tasks_failed', MONOTONIC_COUNT),
'master/tasks_finished' : ('mesos.cluster.tasks_finished', MONOTONIC_COUNT),
'master/tasks_killed' : ('mesos.cluster.tasks_killed', MONOTONIC_COUNT),
'master/tasks_lost' : ('mesos.cluster.tasks_lost', MONOTONIC_COUNT),
'master/tasks_running' : ('mesos.cluster.tasks_running', GAUGE),
'master/tasks_staging' : ('mesos.cluster.tasks_staging', GAUGE),
'master/tasks_starting' : ('mesos.cluster.tasks_starting', GAUGE),
Expand Down Expand Up @@ -99,9 +99,9 @@ class MesosMaster(AgentCheck):
# These metrics are aggregated on all nodes in the cluster
SYSTEM_METRICS = {
'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE),
'system/load_15min' : ('mesos.stats.system.load_15min', RATE),
'system/load_1min' : ('mesos.stats.system.load_1min', RATE),
'system/load_5min' : ('mesos.stats.system.load_5min', RATE),
'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE),
'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE),
'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE),
'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE),
'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE),
'master/elected' : ('mesos.stats.elected', GAUGE),
Expand Down Expand Up @@ -145,10 +145,10 @@ def _get_json(self, url, timeout):
msg = str(e)
status = AgentCheck.CRITICAL
finally:
if self.SERVICE_CHECK_NEEDED:
if self.service_check_needed:
self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags,
message=msg)
self.SERVICE_CHECK_NEEDED = False
self.service_check_needed = False
if status is AgentCheck.CRITICAL:
self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags,
message=msg)
Expand Down Expand Up @@ -229,4 +229,4 @@ def check(self, instance):
metric_func(self, metric_name, stats_metrics[key_name], tags=tags)


self.SERVICE_CHECK_NEEDED = True
self.service_check_needed = True
24 changes: 12 additions & 12 deletions checks.d/mesos_slave.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@

class MesosSlave(AgentCheck):
GAUGE = AgentCheck.gauge
RATE = AgentCheck.rate
MONOTONIC_COUNT = AgentCheck.monotonic_count
SERVICE_CHECK_NAME = "mesos_slave.can_connect"
SERVICE_CHECK_NEEDED = True
service_check_needed = True

TASK_STATUS = {
'TASK_STARTING' : AgentCheck.OK,
Expand All @@ -37,20 +37,20 @@ class MesosSlave(AgentCheck):
}

SLAVE_TASKS_METRICS = {
'slave/tasks_failed' : ('mesos.slave.tasks_failed', GAUGE),
'slave/tasks_finished' : ('mesos.slave.tasks_finished', GAUGE),
'slave/tasks_killed' : ('mesos.slave.tasks_killed', GAUGE),
'slave/tasks_lost' : ('mesos.slave.tasks_lost', GAUGE),
'slave/tasks_failed' : ('mesos.slave.tasks_failed', MONOTONIC_COUNT),
'slave/tasks_finished' : ('mesos.slave.tasks_finished', MONOTONIC_COUNT),
'slave/tasks_killed' : ('mesos.slave.tasks_killed', MONOTONIC_COUNT),
'slave/tasks_lost' : ('mesos.slave.tasks_lost', MONOTONIC_COUNT),
'slave/tasks_running' : ('mesos.slave.tasks_running', GAUGE),
'slave/tasks_staging' : ('mesos.slave.tasks_staging', GAUGE),
'slave/tasks_starting' : ('mesos.slave.tasks_starting', GAUGE),
}

SYSTEM_METRICS = {
'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE),
'system/load_15min' : ('mesos.stats.system.load_15min', RATE),
'system/load_1min' : ('mesos.stats.system.load_1min', RATE),
'system/load_5min' : ('mesos.stats.system.load_5min', RATE),
'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE),
'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE),
'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE),
'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE),
'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE),
'slave/registered' : ('mesos.stats.registered', GAUGE),
Expand Down Expand Up @@ -111,9 +111,9 @@ def _get_json(self, url, timeout):
msg = str(e)
status = AgentCheck.CRITICAL
finally:
if self.SERVICE_CHECK_NEEDED:
if self.service_check_needed:
self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
self.SERVICE_CHECK_NEEDED = False
self.service_check_needed = False
if status is AgentCheck.CRITICAL:
raise CheckException("Cannot connect to mesos, please check your configuration.")

Expand Down Expand Up @@ -183,4 +183,4 @@ def check(self, instance):
for key_name, (metric_name, metric_func) in m.iteritems():
metric_func(self, metric_name, stats_metrics[key_name], tags=tags)

self.SERVICE_CHECK_NEEDED = True
self.service_check_needed = True
17 changes: 17 additions & 0 deletions tests/checks/fixtures/mesos_master/roles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"roles": [
{
"weight": 1,
"resources": {
"ports": "[31915-31915]",
"mem": 100,
"disk": 0,
"cpus": 1
},
"name": "*",
"frameworks": [
"20150403-140128-251789322-5050-6047-0000"
]
}
]
}
132 changes: 132 additions & 0 deletions tests/checks/fixtures/mesos_master/state.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"version": "0.22.0",
"unregistered_frameworks": [],
"started_tasks": 0,
"start_time": 1428951954.34111,
"staged_tasks": 0,
"slaves": [
{
"resources": {
"ports": "[31000-32000]",
"mem": 244,
"disk": 35164,
"cpus": 1
},
"reregistered_time": 1428951983.53731,
"registered_time": 1428951983.53725,
"pid": "slave(1)@127.0.0.1:5051",
"id": "20150410-134224-16777343-5050-1778-S0",
"hostname": "localhost",
"attributes": {},
"active": true
}
],
"pid": "master@127.0.0.1:5050",
"orphan_tasks": [],
"lost_tasks": 0,
"log_dir": "/var/log/mesos",
"leader": "master@127.0.0.1:5050",
"killed_tasks": 0,
"elected_time": 1428951954.3774,
"deactivated_slaves": 0,
"completed_frameworks": [],
"cluster": "datadog-test",
"build_user": "root",
"build_time": 1427376927,
"build_date": "2015-03-26 13:35:27",
"activated_slaves": 1,
"failed_tasks": 0,
"finished_tasks": 0,
"flags": {
"zk_session_timeout": "10secs",
"zk": "zk://localhost:2181/mesos",
"work_dir": "/var/lib/mesos",
"webui_dir": "/usr/share/mesos/webui",
"version": "false",
"user_sorter": "drf",
"slave_reregister_timeout": "10mins",
"root_submissions": "true",
"registry_strict": "false",
"registry_store_timeout": "5secs",
"registry_fetch_timeout": "1mins",
"registry": "replicated_log",
"initialize_driver_logging": "true",
"help": "false",
"framework_sorter": "drf",
"cluster": "datadog-test",
"authenticators": "crammd5",
"authenticate_slaves": "false",
"authenticate": "false",
"allocation_interval": "1secs",
"log_auto_initialize": "true",
"log_dir": "/var/log/mesos",
"logbufsecs": "0",
"logging_level": "INFO",
"port": "5050",
"quiet": "false",
"quorum": "1",
"recovery_slave_removal_limit": "100%"
},
"frameworks": [
{
"webui_url": "http://192.168.33.20:8080",
"user": "root",
"offered_resources": {
"mem": 0,
"disk": 0,
"cpus": 0
},
"name": "marathon",
"id": "20150403-140128-251789322-5050-6047-0000",
"hostname": "vagrant-ubuntu-trusty-64",
"failover_timeout": 604800,
"completed_tasks": [],
"checkpoint": true,
"active": true,
"offers": [],
"registered_time": 1428951955.38871,
"reregistered_time": 1428951955.38872,
"resources": {
"ports": "[31915-31915]",
"mem": 100,
"disk": 0,
"cpus": 1
},
"role": "*",
"tasks": [
{
"statuses": [
{
"timestamp": 1428673971.61592,
"state": "TASK_RUNNING"
}
],
"executor_id": "",
"framework_id": "20150403-140128-251789322-5050-6047-0000",
"id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312",
"labels": [],
"name": "hello",
"resources": {
"ports": "[31915-31915]",
"mem": 100,
"disk": 0,
"cpus": 1
},
"slave_id": "20150410-134224-16777343-5050-1778-S0",
"state": "TASK_RUNNING"
}
],
"unregistered_time": 0,
"used_resources": {
"ports": "[31915-31915]",
"mem": 100,
"disk": 0,
"cpus": 1
}
}
],
"git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb",
"git_tag": "0.22.0",
"hostname": "localhost",
"id": "20150413-190554-16777343-5050-16324"
}
108 changes: 108 additions & 0 deletions tests/checks/fixtures/mesos_master/stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"valid_status_updates": 0,
"uptime": 706.524240128,
"total_schedulers": 1,
"system/mem_total_bytes": 513798144,
"system/mem_free_bytes": 13815808,
"system/load_5min": 0.02,
"system/load_1min": 0,
"system/load_15min": 0.07,
"system/cpus_total": 1,
"started_tasks": 0,
"staged_tasks": 0,
"registrar/state_store_ms/p9999": 9.90120192,
"registrar/state_store_ms/p999": 9.8956032,
"registrar/state_store_ms/p99": 9.839616,
"registrar/state_store_ms/p95": 9.590784,
"registrar/state_store_ms/p90": 9.279744,
"registrar/state_store_ms/p50": 6.791424,
"registrar/state_store_ms/min": 3.681024,
"registrar/state_store_ms/max": 9.901824,
"registrar/state_store_ms/count": 2,
"registrar/state_store_ms": 9.901824,
"registrar/state_fetch_ms": 3.717888,
"registrar/registry_size_bytes": 246,
"registrar/queued_operations": 0,
"outstanding_offers": 0,
"mem_used": 100,
"mem_total": 244,
"mem_percent": 0.409836065573771,
"master/valid_status_updates": 0,
"master/valid_status_update_acknowledgements": 0,
"master/valid_framework_to_executor_messages": 0,
"master/uptime_secs": 706.52485632,
"master/tasks_starting": 0,
"master/tasks_staging": 0,
"master/tasks_running": 1,
"master/tasks_lost": 0,
"master/tasks_killed": 0,
"master/tasks_finished": 0,
"master/tasks_failed": 0,
"master/tasks_error": 0,
"master/slaves_inactive": 0,
"master/slaves_disconnected": 0,
"master/invalid_framework_to_executor_messages": 0,
"master/frameworks_inactive": 0,
"master/frameworks_disconnected": 0,
"master/frameworks_connected": 1,
"master/frameworks_active": 1,
"master/event_queue_messages": 0,
"master/event_queue_http_requests": 0,
"master/event_queue_dispatches": 17,
"master/elected": 1,
"master/dropped_messages": 1,
"master/disk_used": 0,
"master/disk_total": 35164,
"master/disk_percent": 0,
"master/cpus_used": 1,
"master/cpus_total": 1,
"master/cpus_percent": 1,
"disk_percent": 0,
"deactivated_slaves": 0,
"cpus_used": 1,
"cpus_total": 1,
"cpus_percent": 1,
"active_tasks_gauge": 1,
"active_schedulers": 1,
"activated_slaves": 1,
"disk_total": 35164,
"disk_used": 0,
"elected": 1,
"failed_tasks": 0,
"finished_tasks": 0,
"invalid_status_updates": 0,
"killed_tasks": 0,
"lost_tasks": 0,
"master/invalid_status_update_acknowledgements": 0,
"master/invalid_status_updates": 0,
"master/mem_percent": 0.409836065573771,
"master/mem_total": 244,
"master/mem_used": 100,
"master/messages_authenticate": 0,
"master/messages_deactivate_framework": 0,
"master/messages_decline_offers": 123,
"master/messages_exited_executor": 0,
"master/messages_framework_to_executor": 0,
"master/messages_kill_task": 0,
"master/messages_launch_tasks": 0,
"master/messages_reconcile_tasks": 6,
"master/messages_register_framework": 0,
"master/messages_register_slave": 0,
"master/messages_reregister_framework": 1,
"master/messages_reregister_slave": 2,
"master/messages_resource_request": 0,
"master/messages_revive_offers": 0,
"master/messages_status_update": 0,
"master/messages_status_update_acknowledgement": 0,
"master/messages_unregister_framework": 0,
"master/messages_unregister_slave": 0,
"master/outstanding_offers": 0,
"master/recovery_slave_removals": 0,
"master/slave_registrations": 0,
"master/slave_removals": 0,
"master/slave_reregistrations": 1,
"master/slave_shutdowns_canceled": 0,
"master/slave_shutdowns_scheduled": 0,
"master/slaves_active": 1,
"master/slaves_connected": 1
}
Loading

0 comments on commit d036028

Please sign in to comment.