From 388d977568a80df72d39621794271ba9895ae64c Mon Sep 17 00:00:00 2001
From: "florian.veaux"
Date: Tue, 7 Jan 2020 15:38:45 +0100
Subject: [PATCH] [review] Address minor comments

---
 vsphere/datadog_checks/vsphere/constants.py |  2 ++
 vsphere/datadog_checks/vsphere/metrics.py   | 10 +++---
 vsphere/datadog_checks/vsphere/utils.py     | 10 +++---
 vsphere/datadog_checks/vsphere/vsphere.py   | 39 +++++++++++++--------
 4 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/vsphere/datadog_checks/vsphere/constants.py b/vsphere/datadog_checks/vsphere/constants.py
index c280ed4f2a87d..37056cb4b979b 100644
--- a/vsphere/datadog_checks/vsphere/constants.py
+++ b/vsphere/datadog_checks/vsphere/constants.py
@@ -35,6 +35,8 @@
 HISTORICAL_RESOURCES = [vim.Datacenter, vim.Datastore, vim.ClusterComputeResource]
 ALL_RESOURCES_WITH_METRICS = REALTIME_RESOURCES + HISTORICAL_RESOURCES
 
+REALTIME_METRICS_INTERVAL_ID = 20
+
 DEFAULT_BATCH_COLLECTOR_SIZE = 500
 DEFAULT_METRICS_PER_QUERY = 500
 UNLIMITED_HIST_METRICS_PER_QUERY = float('inf')
diff --git a/vsphere/datadog_checks/vsphere/metrics.py b/vsphere/datadog_checks/vsphere/metrics.py
index 9beef4dce9fc7..fd328fc4193d7 100644
--- a/vsphere/datadog_checks/vsphere/metrics.py
+++ b/vsphere/datadog_checks/vsphere/metrics.py
@@ -256,7 +256,7 @@
     'virtualDisk.writeIOSize.latest': (4, 4, True),
     'virtualDisk.writeLatencyUS.latest': (4, 4, True),
     'virtualDisk.writeLoadMetric.latest': (2, 2, True),
-    'virtualDisk.writeOIO.latest': (2, 2, 'instance'),
+    'virtualDisk.writeOIO.latest': (2, 2, True),
 }
 
 # All metrics that can be collected from ESXi Hosts.
@@ -530,7 +530,7 @@
     'sys.resourceMemZero.latest': (3, 3, True),
     'sys.uptime.latest': (1, 3),
     'virtualDisk.busResets.sum': (2, 4, True),
-    'virtualDisk.commandsAborted.sum': (2, 4, 'instance'),
+    'virtualDisk.commandsAborted.sum': (2, 4, True),
 }
 
 # All metrics that can be collected from Datastores.
@@ -552,7 +552,7 @@
     'disk.numberWriteAveraged.avg': (1, 3),
     'disk.provisioned.latest': (1, 1, True),
     'disk.unshared.latest': (1, 1, True),
-    'disk.used.latest': (1, 1, 'instance'),
+    'disk.used.latest': (1, 1, True),
 }
 
 # All metrics that can be collected from Datacenters.
@@ -578,7 +578,7 @@
     'vmop.numSuspend.latest': (1, 3),
     'vmop.numUnregister.latest': (1, 3),
     'vmop.numVMotion.latest': (1, 3),
-    'vmop.numXVMotion.latest': (1, 3, 'aggregate'),
+    'vmop.numXVMotion.latest': (1, 3),
 }
 
 # All metrics that can be collected from Clusters.
@@ -619,7 +619,7 @@
     'vmop.numSuspend.latest': (1, 3),
     'vmop.numUnregister.latest': (1, 3),
     'vmop.numVMotion.latest': (1, 3),
-    'vmop.numXVMotion.latest': (1, 3, 'aggregate'),
+    'vmop.numXVMotion.latest': (1, 3),
 }
 
 ALLOWED_METRICS_FOR_MOR = {
diff --git a/vsphere/datadog_checks/vsphere/utils.py b/vsphere/datadog_checks/vsphere/utils.py
index c9c7032b52855..db66274a9bdd1 100644
--- a/vsphere/datadog_checks/vsphere/utils.py
+++ b/vsphere/datadog_checks/vsphere/utils.py
@@ -17,7 +17,7 @@ def format_metric_name(counter):
 
 def match_any_regex(string, regexes):
     for regex in regexes:
-        match = regex.match(string)  # FIXME: Should we use re.IGNORECASE like legacy?
+        match = regex.match(string)
         if match:
             return True
     return False
@@ -26,8 +26,10 @@ def match_any_regex(string, regexes):
 def is_resource_excluded_by_filters(mor, infrastructure_data, resource_filters):
     resource_type = MOR_TYPE_AS_STRING[type(mor)]
 
-    if not [f for f in resource_filters if f[0] == resource_type]:
-        # No filter for this resource, collect everything
+    for f in resource_filters:
+        if f[0] == resource_type:
+            break
+    else:
         return False
 
     name_filter = resource_filters.get((resource_type, 'name'))
@@ -91,9 +93,9 @@ def get_parent_tags_recursively(mor, infrastructure_data):
     """
     mor_props = infrastructure_data.get(mor)
     parent = mor_props.get('parent')
-    parent_props = infrastructure_data.get(parent, {})
     if parent:
         tags = []
+        parent_props = infrastructure_data.get(parent, {})
         parent_name = ensure_unicode(parent_props.get('name', 'unknown'))
         if isinstance(parent, vim.HostSystem):
             tags.append(u'vsphere_host:{}'.format(parent_name))
diff --git a/vsphere/datadog_checks/vsphere/vsphere.py b/vsphere/datadog_checks/vsphere/vsphere.py
index a41674948f47b..0d2f3e0b39440 100644
--- a/vsphere/datadog_checks/vsphere/vsphere.py
+++ b/vsphere/datadog_checks/vsphere/vsphere.py
@@ -1,6 +1,8 @@
 # (C) Datadog, Inc. 2019
 # All rights reserved
 # Licensed under Simplified BSD License (see LICENSE)
+from __future__ import division
+
 import re
 import time
 from collections import defaultdict
@@ -23,6 +25,7 @@
     EXTRA_FILTER_PROPERTIES_FOR_VMS,
     HISTORICAL_RESOURCES,
     MAX_QUERY_METRICS_OPTION,
+    REALTIME_METRICS_INTERVAL_ID,
     REALTIME_RESOURCES,
 )
 from datadog_checks.vsphere.legacy.event import VSphereEvent
@@ -54,7 +57,7 @@ def __new__(cls, name, init_config, instances):
         return super(VSphereCheck, cls).__new__(cls)
 
     def __init__(self, name, init_config, instances):
-        AgentCheck.__init__(self, name, init_config, instances)
+        super(VSphereCheck, self).__init__(name, init_config, instances)
         # Configuration fields
         self.base_tags = self.instance.get("tags", [])
         self.collection_level = self.instance.get("collection_level", 1)
@@ -135,7 +138,7 @@ def validate_and_format_config(self):
                 if f['property'] not in allowed_prop_names:
                     self.log.warning(
                         u"Ignoring filter %r because property '%s' is not valid "
-                        "for resource type %s. Should be one of %r.",
+                        u"for resource type %s. Should be one of %r.",
                         f,
                         f['property'],
                         f['resource'],
@@ -166,7 +169,7 @@ def initiate_connection(self):
             self.api = VSphereAPI(self.instance)
             self.log.info("Connected")
         except APIConnectionError:
-            self.log.exception("Cannot authenticate to vCenter API. The check will not run.")
+            self.log.error("Cannot authenticate to vCenter API. The check will not run.")
             self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self.base_tags, hostname=None)
             raise
 
@@ -301,7 +304,7 @@ def submit_metrics_callback(self, task):
                     value = valid_values[-1]
                     if metric_name in PERCENT_METRICS:
                         # Convert the percentage to a float.
-                        value = float(value) / 100
+                        value /= 100
 
                     tags = []
                     if should_collect_per_instance_values(metric_name, resource_type):
@@ -334,10 +337,8 @@ def query_metrics_wrapper(self, query_specs):
         self.histogram('datadog.vsphere.query_metrics.time', t0.total(), tags=self.base_tags, raw=True)
         return metrics_values
 
-    def collect_metrics(self):
+    def collect_metrics(self, thread_pool):
         """Creates a pool of threads and run the query_metrics calls in parallel."""
-        pool_executor = ThreadPoolExecutor(max_workers=self.threads_count)
-        self.log.info("Starting metric collection in %d threads." % self.threads_count)
         tasks = []
         for resource_type in self.collected_resource_types:
             mors = self.infrastructure_cache.get_mors(resource_type)
@@ -356,7 +357,7 @@ def collect_metrics(self):
                 query_spec.entity = mor
                 query_spec.metricId = metrics
                 if resource_type in REALTIME_RESOURCES:
-                    query_spec.intervalId = 20  # FIXME: Make constant
+                    query_spec.intervalId = REALTIME_METRICS_INTERVAL_ID
                     query_spec.maxSample = 1  # Request a single datapoint
                 else:
                     # We cannot use `maxSample` for historical metrics, let's specify a timewindow that will
@@ -364,7 +365,7 @@ def collect_metrics(self):
                     query_spec.startTime = datetime.now() - timedelta(hours=2)
                 query_specs.append(query_spec)
             if query_specs:
-                tasks.append(pool_executor.submit(lambda q: self.query_metrics_wrapper(q), query_specs))
+                tasks.append(thread_pool.submit(lambda q: self.query_metrics_wrapper(q), query_specs))
 
         self.log.info("Queued all %d tasks, waiting for completion.", len(tasks))
         while tasks:
@@ -373,12 +374,15 @@ def collect_metrics(self):
                 time.sleep(0.1)
                 continue
             for task in finished_tasks:
-                self.submit_metrics_callback(task)
+                try:
+                    self.submit_metrics_callback(task)
+                except Exception:
+                    self.log.exception(
+                        "Exception raised during the submit_metrics_callback. "
+                        "Ignoring the error and continuing execution."
+                    )
                 tasks.remove(task)
 
-        self.log.info("All tasks completed, shutting down the thread pool.")
-        pool_executor.shutdown()
-
     def make_batch(self, mors, metric_ids, resource_type):
         """Iterates over mor and generate batches with a fixed number of metrics to query.
         Querying multiple resource types in the same call is error prone if we query a cluster metric. Indeed,
@@ -513,4 +517,11 @@ def check(self, _):
             vm_count = len(self.infrastructure_cache.get_mors(vim.VirtualMachine))
             self.gauge('vm.count', vm_count, tags=self.base_tags, hostname=None)
 
-        self.collect_metrics()
+        # Creating a thread pool and starting metric collection
+        pool_executor = ThreadPoolExecutor(max_workers=self.threads_count)
+        self.log.info("Starting metric collection in %d threads." % self.threads_count)
+        try:
+            self.collect_metrics(pool_executor)
+            self.log.info("All tasks completed, shutting down the thread pool.")
+        finally:
+            pool_executor.shutdown()
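
Note on the utils.py hunk above: the rewrite of is_resource_excluded_by_filters uses
Python's for/else, where the else block runs only when the loop finishes without
hitting break. Unlike the old list comprehension, it also stops at the first matching
filter instead of building a throwaway list. A minimal standalone sketch of the idiom
(the filter data below is hypothetical, for illustration only):

    # for/else: the `else` block executes only if the loop was never
    # interrupted by `break`.
    resource_type = 'vm'
    resource_filters = [('host', 'name'), ('datastore', 'name')]  # hypothetical

    for f in resource_filters:
        if f[0] == resource_type:
            break  # some filter targets this resource type, evaluate it further
    else:
        # No break happened: no filter applies to this resource type.
        print('no filter for this resource, collect everything')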
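
The value /= 100 simplification in submit_metrics_callback is only equivalent to the
old float(value) / 100 because of the new "from __future__ import division" at the top
of vsphere.py; without it, Python 2 would floor-divide integer samples down to 0. A
quick illustration:

    from __future__ import division  # true division on Python 2, no-op on Python 3

    value = 75   # e.g. an integer percentage sample
    value /= 100
    print(value)  # 0.75 with true division; 0 under Python 2 floor division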
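
The vsphere.py hunks also move ownership of the ThreadPoolExecutor from collect_metrics
up into check(): the caller creates the pool, guarantees shutdown() in a finally block
even when collection raises, and each submit_metrics_callback call is wrapped so one
failing task no longer aborts the whole cycle. Below is a self-contained sketch of that
lifecycle, with a stand-in workload instead of the real query tasks, and as_completed
in place of the check's manual polling loop:

    import logging
    from concurrent.futures import ThreadPoolExecutor, as_completed

    log = logging.getLogger(__name__)

    def collect(pool):
        # Stand-in for collect_metrics: submit work, then handle each result,
        # isolating failures so one bad task does not abort the others.
        futures = [pool.submit(pow, 2, n) for n in range(5)]
        for future in as_completed(futures):
            try:
                log.info("result: %s", future.result())
            except Exception:
                log.exception("Task failed. Ignoring the error and continuing.")

    pool = ThreadPoolExecutor(max_workers=4)  # the caller owns the pool, as in check()
    try:
        collect(pool)
    finally:
        pool.shutdown()  # always runs, even if collect() raises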