From c2996c8d87ae8bfb76419c30166922c4891d18f8 Mon Sep 17 00:00:00 2001 From: Igor Polishchuk Date: Thu, 8 Jan 2015 16:29:47 -0800 Subject: [PATCH 1/3] Add custom metrics and database_size metric --- checks.d/postgres.py | 40 ++++++++++++++++++++++++++++-------- conf.d/postgres.yaml.example | 19 +++++++++++++++++ 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/checks.d/postgres.py b/checks.d/postgres.py index 8acdd28274..70c9838178 100644 --- a/checks.d/postgres.py +++ b/checks.d/postgres.py @@ -1,6 +1,6 @@ """PostgreSQL check -Collects database-wide metrics and optionally per-relation metrics. +Collects database-wide metrics and optionally per-relation metrics, custom metrics. """ # project from checks import AgentCheck, CheckException @@ -13,7 +13,7 @@ class ShouldRestartException(Exception): pass class PostgreSql(AgentCheck): - """Collects per-database, and optionally per-relation metrics + """Collects per-database, and optionally per-relation metrics, custom metrics """ SOURCE_TYPE_NAME = 'postgresql' RATE = AgentCheck.rate @@ -48,6 +48,7 @@ class PostgreSql(AgentCheck): 'tup_inserted' : ('postgresql.rows_inserted', RATE), 'tup_updated' : ('postgresql.rows_updated', RATE), 'tup_deleted' : ('postgresql.rows_deleted', RATE), + 'pg_database_size(datname) as pg_database_size' : ('postgresql.database_size', GAUGE), } NEWER_92_METRICS = { @@ -148,9 +149,9 @@ class PostgreSql(AgentCheck): ('relname', 'table'), ], 'metrics': { - 'pg_table_size(C.oid)' : ('postgresql.table_size', GAUGE), - 'pg_indexes_size(C.oid)' : ('postgresql.index_size', GAUGE), - 'pg_total_relation_size(C.oid)': ('postgresql.total_size', GAUGE), + 'pg_table_size(C.oid) as table_size' : ('postgresql.table_size', GAUGE), + 'pg_indexes_size(C.oid) as index_size' : ('postgresql.index_size', GAUGE), + 'pg_total_relation_size(C.oid) as total_size' : ('postgresql.total_size', GAUGE), }, 'relation': True, 'query': """ @@ -250,10 +251,11 @@ def _get_bgw_metrics(self, key, db): metrics = 
self.bgw_metrics.get(key) return metrics - def _collect_stats(self, key, db, instance_tags, relations): + def _collect_stats(self, key, db, instance_tags, relations, custom_metrics): """Query pg_stat_* for various metrics If relations is not an empty list, gather per-relation metrics on top of that. + If custom_metrics is not an empty list, gather custom metrics defined in postgres.yaml """ self.DB_METRICS['metrics'] = self._get_instance_metrics(key, db) @@ -268,10 +270,11 @@ def _collect_stats(self, key, db, instance_tags, relations): self.LOCK_METRICS, self.REPLICATION_METRICS, self.REL_METRICS, self.IDX_METRICS, self.SIZE_METRICS) + full_metric_scope=list(metric_scope)+custom_metrics try: cursor = db.cursor() - for scope in metric_scope: + for scope in full_metric_scope: if scope == self.REPLICATION_METRICS or not self._is_above(key, db, [9,0,0]): log_func = self.log.debug warning_func = self.log.debug @@ -385,6 +388,14 @@ def get_connection(self, key, host, port, user, password, dbname, use_cached=Tru self.dbs[key] = connection return connection + def _process_customer_metrics(self,custom_metrics): + for m in custom_metrics: + self.log.debug("Metric: %s" % str(m)) + for v in m['metrics'].values(): + if v[1].upper() not in ['RATE','GAUGE','MONOTONIC']: + raise CheckException("Collector method %s is not known. 
Known methods are RATE,GAUGE,MONOTONIC" % (v[1].upper())) + v[1] = PostgreSql.__dict__[v[1].upper()] + self.log.debug("Method: %s" % (str(v[1]))) def check(self, instance): host = instance.get('host', '') @@ -394,6 +405,7 @@ def check(self, instance): tags = instance.get('tags', []) dbname = instance.get('dbname', None) relations = instance.get('relations', []) + custom_metrics = instance.get('custom_metrics', []) if relations and not dbname: self.warning('"dbname" parameter must be set when using the "relations" parameter.') @@ -413,6 +425,16 @@ def check(self, instance): # preset tags to the database name tags.extend(["db:%s" % dbname]) + # Clean up custom_metrics in case there was a None entry in the instance + # e.g. if the yaml contains custom_metrics: but no actual custom_metrics + if custom_metrics is None: + custom_metrics = [] + elif custom_metrics != []: + self._process_customer_metrics(custom_metrics) + + self.log.debug("Custom metrics: %s" % custom_metrics) + + # preset tags to the database name db = None # Collect metrics @@ -421,11 +443,11 @@ def check(self, instance): db = self.get_connection(key, host, port, user, password, dbname) version = self._get_version(key, db) self.log.debug("Running check against version %s" % version) - self._collect_stats(key, db, tags, relations) + self._collect_stats(key, db, tags, relations, custom_metrics) except ShouldRestartException: self.log.info("Resetting the connection") db = self.get_connection(key, host, port, user, password, dbname, use_cached=False) - self._collect_stats(key, db, tags, relations) + self._collect_stats(key, db, tags, relations, custom_metrics) if db is not None: service_check_tags = self._get_service_check_tags(host, port, dbname) diff --git a/conf.d/postgres.yaml.example b/conf.d/postgres.yaml.example index fdb441e362..b1f99661fa 100644 --- a/conf.d/postgres.yaml.example +++ b/conf.d/postgres.yaml.example @@ -17,3 +17,22 @@ instances: # relations: # - my_table # - my_other_table + + +# 
Custom metrics # Below are some examples of commonly used metrics, which are implemented as custom metrics. # Uncomment them if you want to use them as is, or use as an example for creating your own custom metrics. # The format for describing custom metrics is identical with the one used for common metrics in postgres.py # Be extra careful with ensuring proper custom metrics description format. If your custom metric does not work # after an agent restart, look for errors in the output of "/etc/init.d/datadog-agent info" command, as well as # /var/log/datadog/collector.log file. # - # Londiste 3 replication lag # descriptors: # - [consumer_name, consumer_name] # metrics: # GREATEST(0, EXTRACT(EPOCH FROM lag)) as lag: [postgresql.londiste_lag, GAUGE] # GREATEST(0, EXTRACT(EPOCH FROM lag)) as last_seen: [postgresql.londiste_last_seen, GAUGE] # pending_events: [postgresql.londiste_pending_events, GAUGE] # query: SELECT as consumer_name, %s from pgq.get_consumer_info() where consumer_name !~ 'watermark$'; # relation: false From ae56ef18f955cf80e8d92fd5addac3a6d531d387 Mon Sep 17 00:00:00 2001 From: Igor Polishchuk Date: Tue, 13 Jan 2015 15:41:15 -0800 Subject: [PATCH 2/3] Add custom_metrics header to the comments part --- conf.d/postgres.yaml.example | 47 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/conf.d/postgres.yaml.example b/conf.d/postgres.yaml.example index b1f99661fa..143104fc88 100644 --- a/conf.d/postgres.yaml.example +++ b/conf.d/postgres.yaml.example @@ -3,20 +3,20 @@ init_config: instances: - host: localhost port: 5432 - # username: my_username - # password: my_password - # dbname: db_name - # tags: - # - optional_tag1 - # - optional_tag2 +# username: my_username +# password: my_password +# dbname: db_name +# tags: +# - optional_tag1 +# - optional_tag2 - # Track per-relation (table) metrics - # The list of relations/tables must be specified here. 
- # Each relation generates many metrics (10 + 10 per index) - # - # relations: - # - my_table - # - my_other_table +# Track per-relation (table) metrics +# The list of relations/tables must be specified here. +# Each relation generates many metrics (10 + 10 per index) +# +# relations: +# - my_table +# - my_other_table # Custom metrics @@ -26,13 +26,14 @@ # Be extra careful with ensuring proper custom metrics description format. If your custom metric does not work # after an agent restart, look for errors in the output of "/etc/init.d/datadog-agent info" command, as well as # /var/log/datadog/collector.log file. - -# - # Londiste 3 replication lag -# descriptors: -# - [consumer_name, consumer_name] -# metrics: -# GREATEST(0, EXTRACT(EPOCH FROM lag)) as lag: [postgresql.londiste_lag, GAUGE] -# GREATEST(0, EXTRACT(EPOCH FROM lag)) as last_seen: [postgresql.londiste_last_seen, GAUGE] -# pending_events: [postgresql.londiste_pending_events, GAUGE] -# query: SELECT as consumer_name, %s from pgq.get_consumer_info() where consumer_name !~ 'watermark$'; -# relation: false +# +# custom_metrics: +# - # Londiste 3 replication lag +# descriptors: +# - [consumer_name, consumer_name] +# metrics: +# GREATEST(0, EXTRACT(EPOCH FROM lag)) as lag: [postgresql.londiste_lag, GAUGE] +# GREATEST(0, EXTRACT(EPOCH FROM lag)) as last_seen: [postgresql.londiste_last_seen, GAUGE] +# pending_events: [postgresql.londiste_pending_events, GAUGE] +# query: SELECT as consumer_name, %s from pgq.get_consumer_info() where consumer_name !~ 'watermark$'; +# relation: false From 7457ed258ecbb2243a342577416aefb65ef4787c Mon Sep 17 00:00:00 2001 From: Remi Hakim Date: Wed, 25 Feb 2015 16:22:55 -0500 Subject: [PATCH 3/3] Some improvements to #1302 - Fix tests (and add one more) - Add a limit to the number of results custom queries can return - small copy changes to improve readability --- checks.d/postgres.py | 47 ++++++++++++++++++------------- conf.d/postgres.yaml.example | 2 +-
tests/test_postgresql.py | 25 ++++++++++++++++--- 3 files changed, 53 insertions(+), 21 deletions(-) diff --git a/checks.d/postgres.py b/checks.d/postgres.py index 4abb79f606..68ff1b947a 100644 --- a/checks.d/postgres.py +++ b/checks.d/postgres.py @@ -10,6 +10,9 @@ from pg8000 import InterfaceError, ProgrammingError import socket + +MAX_CUSTOM_RESULTS = 100 + class ShouldRestartException(Exception): pass class PostgreSql(AgentCheck): @@ -287,7 +290,7 @@ def _collect_stats(self, key, db, instance_tags, relations, custom_metrics): if self._is_9_1_or_above(key,db): metric_scope.append(self.REPLICATION_METRICS) - full_metric_scope=list(metric_scope)+custom_metrics + full_metric_scope = list(metric_scope) + custom_metrics try: cursor = db.cursor() @@ -322,8 +325,15 @@ def _collect_stats(self, key, db, instance_tags, relations, custom_metrics): if not results: continue + if scope in custom_metrics and len(results) > MAX_CUSTOM_RESULTS: + self.warning( + "Query: {0} returned more than {1} results ({2}). Truncating".format( + query, MAX_CUSTOM_RESULTS, len(results))) + results = results[:MAX_CUSTOM_RESULTS] + if scope == self.DB_METRICS: - self.gauge("postgresql.db.count", len(results), tags=[t for t in instance_tags if not t.startswith("db:")]) + self.gauge("postgresql.db.count", len(results), + tags=[t for t in instance_tags if not t.startswith("db:")]) # parse & submit results # A row should look like this @@ -406,13 +416,24 @@ def get_connection(self, key, host, port, user, password, dbname, use_cached=Tru return connection def _process_customer_metrics(self,custom_metrics): + required_parameters = ("descriptors", "metrics", "query", "relation") + for m in custom_metrics: - self.log.debug("Metric: %s" % str(m)) - for v in m['metrics'].values(): - if v[1].upper() not in ['RATE','GAUGE','MONOTONIC']: - raise CheckException("Collector method %s is not known. 
Known methods are RATE,GAUGE,MONOTONIC" % (v[1].upper())) - v[1] = PostgreSql.__dict__[v[1].upper()] - self.log.debug("Method: %s" % (str(v[1]))) + for param in required_parameters: + if param not in m: + raise CheckException("Missing {0} parameter in custom metric"\ .format(param)) + + self.log.debug("Metric: {0}".format(m)) + + for k, v in m['metrics'].items(): + if v[1].upper() not in ['RATE','GAUGE','MONOTONIC']: + raise CheckException("Collector method {0} is not known. "\ "Known methods are RATE,GAUGE,MONOTONIC".format( v[1].upper())) + + m['metrics'][k][1] = getattr(PostgreSql, v[1].upper()) + self.log.debug("Method: %s" % (str(v[1]))) def check(self, instance): host = instance.get('host', '') @@ -422,7 +443,8 @@ def check(self, instance): tags = instance.get('tags', []) dbname = instance.get('dbname', None) relations = instance.get('relations', []) - custom_metrics = instance.get('custom_metrics', []) + custom_metrics = instance.get('custom_metrics') or [] + self._process_customer_metrics(custom_metrics) if relations and not dbname: self.warning('"dbname" parameter must be set when using the "relations" parameter.') @@ -442,13 +464,6 @@ def check(self, instance): # preset tags to the database name tags.extend(["db:%s" % dbname]) - # Clean up custom_metrics in case there was a None entry in the instance - # e.g. 
if the yaml contains custom_metrics: but no actual custom_metrics - if custom_metrics is None: - custom_metrics = [] - elif custom_metrics != []: - self._process_customer_metrics(custom_metrics) - self.log.debug("Custom metrics: %s" % custom_metrics) # preset tags to the database name diff --git a/conf.d/postgres.yaml.example b/conf.d/postgres.yaml.example index 143104fc88..4cc616bf53 100644 --- a/conf.d/postgres.yaml.example +++ b/conf.d/postgres.yaml.example @@ -35,5 +35,5 @@ instances: # GREATEST(0, EXTRACT(EPOCH FROM lag)) as lag: [postgresql.londiste_lag, GAUGE] # GREATEST(0, EXTRACT(EPOCH FROM lag)) as last_seen: [postgresql.londiste_last_seen, GAUGE] # pending_events: [postgresql.londiste_pending_events, GAUGE] -# query: SELECT as consumer_name, %s from pgq.get_consumer_info() where consumer_name !~ 'watermark$'; +# query: SELECT consumer_name, %s from pgq.get_consumer_info() where consumer_name !~ 'watermark$'; # relation: false diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index 9104d92da1..aa49b28b8c 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -1,5 +1,5 @@ import unittest -from tests.common import load_check +from tests.common import load_check, AgentCheckTest from nose.plugins.attrib import attr @@ -7,7 +7,9 @@ from pprint import pprint @attr(requires='postgres') -class TestPostgres(unittest.TestCase): +class TestPostgres(AgentCheckTest): + + CHECK_NAME = "postgres" def test_checks(self): host = 'localhost' @@ -23,6 +25,18 @@ def test_checks(self): 'password': 'datadog', 'dbname': dbname, 'relations': ['persons'], + 'custom_metrics': [ + { + "descriptors": [ + ("datname", "customdb") + ], + "metrics": { + "numbackends": ["custom.numbackends", "Gauge"], + }, + "query": "SELECT datname, %s FROM pg_stat_database WHERE datname = 'datadog_test' LIMIT(1)", + "relation": False, + } + ] } ] } @@ -62,8 +76,8 @@ def test_checks(self): self.check.run() metrics = self.check.get_metrics() - exp_metrics = 37 - 
exp_db_tagged_metrics = 24 + exp_metrics = 39 + exp_db_tagged_metrics = 26 if self.check._is_9_2_or_above(key, db): self.assertTrue(len([m for m in metrics if m[0] == u'postgresql.bgwriter.sync_time']) >= 1, pprint(metrics)) @@ -97,5 +111,8 @@ def test_checks(self): self.assertEquals(len([m for m in metrics if 'db:datadog_test' in str(m[3].get('tags', []))]), exp_db_tagged_metrics, metrics) self.assertEquals(len([m for m in metrics if 'table:persons' in str(m[3].get('tags', [])) ]), 11, metrics) + self.metrics = metrics + self.assertMetric("custom.numbackends") + if __name__ == '__main__': unittest.main()