vSphere new implementation (#5251)

* Initial commit (with following squashed) Clean up config options Add dependency Re-implement excluded_host_tags work Fix tests Fetch remaining metrics Clean-up Fix dependencies fix style [review] max_query_metrics exception [review] Document self._conn.content type [review] Address minor comments * Update copyright headers * fix style * Submit number of each resource * Address review * Move config into a separate class * Add collection_type: both * Add empty_default_hostname: true * Update metadata.csv * Sort metadata.csv * Add debug metrics to metadata * Use PROTOCOL_TLS_CLIENT if possible * Add reference metric * Use native type everywhere * Fully separate parallel logic from the rest * Add comment about per-instance collection level * Make config methods private * Reduce log verbosity * Add an extra test * address * Update vsphere/datadog_checks/vsphere/api.py * Keep commit history * Revert legacy changes
DataDog · Feb 3, 2020 · aff37d6 · aff37d6
1 parent 9afa9ed
commit aff37d6
Showing 45 changed files with 57,318 additions and 305 deletions.
diff --git a/datadog_checks_base/datadog_checks/base/data/agent_requirements.in b/datadog_checks_base/datadog_checks/base/data/agent_requirements.in
@@ -15,6 +15,7 @@ ddtrace==0.32.2
 dnspython==1.16.0
 flup==1.0.3.dev-20110405; python_version < '3.0'
 flup-py3==1.0.3; python_version > '3.0'
+futures==3.3.0; python_version < '3.0'
 gearman==2.0.2; sys_platform != 'win32' and python_version < '3.0'
 httplib2==0.10.3
 in-toto==0.4.2

diff --git a/vsphere/datadog_checks/vsphere/__init__.py b/vsphere/datadog_checks/vsphere/__init__.py
@@ -1,4 +1,7 @@
+# (C) Datadog, Inc. 2018-present
+# All rights reserved
+# Licensed under Simplified BSD License (see LICENSE)
 from .__about__ import __version__
-from .vsphere import VSphereCheck
+from .vsphere_new import VSphereCheck
 
 __all__ = ['__version__', 'VSphereCheck']
diff --git a/vsphere/datadog_checks/vsphere/api.py b/vsphere/datadog_checks/vsphere/api.py
@@ -0,0 +1,176 @@
+# (C) Datadog, Inc. 2019-present
+# All rights reserved
+# Licensed under Simplified BSD License (see LICENSE)
+import functools
+import ssl
+
+from pyVim import connect
+from pyVmomi import vim, vmodl
+
+from datadog_checks.vsphere.constants import ALL_RESOURCES, MAX_QUERY_METRICS_OPTION, UNLIMITED_HIST_METRICS_PER_QUERY
+
+# `ssl.PROTOCOL_TLS_CLIENT` only exists on Python 3 (3.6+); fall back to `ssl.PROTOCOL_TLS`
+# when the running interpreter does not provide it.
+PROTOCOL_TLS_CLIENT = getattr(ssl, 'PROTOCOL_TLS_CLIENT', ssl.PROTOCOL_TLS)
+
+
+def smart_retry(f):
+    """A function decorated with this `@smart_retry` will trigger a new authentication if it fails. The function
+    will then be retried.
+    This is useful when the integration keeps a semi-healthy connection to the vSphere API"""
+
+    @functools.wraps(f)
+    def wrapper(api_instance, *args, **kwargs):
+        try:
+            return f(api_instance, *args, **kwargs)
+        except Exception as e:
+            api_instance.log.debug(
+                "An exception occurred when executing %s: %s. Refreshing the connection to vCenter and retrying",
+                f.__name__,
+                e,
+            )
+            api_instance.smart_connect()
+            return f(api_instance, *args, **kwargs)
+
+    return wrapper
+
+
+class APIConnectionError(Exception):
+    pass
+
+
+class VSphereAPI(object):
+    """Abstraction class over the vSphere SOAP api using the pyvmomi library"""
+
+    def __init__(self, config, log):
+        self.config = config
+        self.log = log
+
+        self._conn = None
+        self.smart_connect()
+
+    def smart_connect(self):
+        """Creates the connection object to the vSphere API using parameters supplied from the configuration.
+        """
+        context = None
+        if not self.config.ssl_verify:
+            context = ssl.SSLContext(ssl.PROTOCOL_TLS)
+            context.verify_mode = ssl.CERT_NONE
+        elif self.config.ssl_capath:
+            context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+            context.verify_mode = ssl.CERT_REQUIRED
+            context.load_verify_locations(capath=self.config.ssl_capath)
+
+        try:
+            # Object returned by SmartConnect is a ServerInstance
+            # https://www.vmware.com/support/developer/vc-sdk/visdk2xpubs/ReferenceGuide/vim.ServiceInstance.html
+            conn = connect.SmartConnect(
+                host=self.config.hostname, user=self.config.username, pwd=self.config.password, sslContext=context
+            )
+            # Next line tries a simple API call to check the health of the connection.
+            conn.CurrentTime()
+        except Exception as e:
+            err_msg = "Connection to {} failed: {}".format(self.config.hostname, e)
+            raise APIConnectionError(err_msg)
+
+        self._conn = conn
+
+    @smart_retry
+    def check_health(self):
+        self._conn.CurrentTime()
+
+    @smart_retry
+    def get_perf_counter_by_level(self, collection_level):
+        """Requests and returns the list of counter available for a given collection_level."""
+        return self._conn.content.perfManager.QueryPerfCounterByLevel(collection_level)
+
+    @smart_retry
+    def get_infrastructure(self):
+        """Traverse the whole vSphere infrastructure and outputs a dict mapping the mors to their properties.
+
+        :return: {
+            'vim.VirtualMachine-VM0': {
+              'name': 'VM-0',
+              ...
+            }
+            ...
+        }
+        """
+        content = self._conn.content  # vim.ServiceInstanceContent reference from the connection
+
+        property_specs = []
+        # Specify which attributes we want to retrieve per object
+        for resource in ALL_RESOURCES:
+            property_spec = vmodl.query.PropertyCollector.PropertySpec()
+            property_spec.type = resource
+            property_spec.pathSet = ["name", "parent", "customValue"]
+            if resource == vim.VirtualMachine:
+                property_spec.pathSet.append("runtime.powerState")
+                property_spec.pathSet.append("runtime.host")
+                property_spec.pathSet.append("guest.hostName")
+            property_specs.append(property_spec)
+
+        # Specify the attribute of the root object to traverse to obtain all the attributes
+        traversal_spec = vmodl.query.PropertyCollector.TraversalSpec()
+        traversal_spec.path = "view"
+        traversal_spec.skip = False
+        traversal_spec.type = vim.view.ContainerView
+
+        retr_opts = vmodl.query.PropertyCollector.RetrieveOptions()
+        # To limit the number of objects retrieved per call.
+        # If batch_collector_size is 0, collect maximum number of objects.
+        retr_opts.maxObjects = self.config.batch_collector_size
+
+        # Specify the root object from where we collect the rest of the objects
+        obj_spec = vmodl.query.PropertyCollector.ObjectSpec()
+        obj_spec.skip = True
+        obj_spec.selectSet = [traversal_spec]
+
+        # Create our filter spec from the above specs
+        filter_spec = vmodl.query.PropertyCollector.FilterSpec()
+        filter_spec.propSet = property_specs
+
+        view_ref = content.viewManager.CreateContainerView(content.rootFolder, ALL_RESOURCES, True)
+        try:
+            obj_spec.obj = view_ref
+            filter_spec.objectSet = [obj_spec]
+
+            # Collect the objects and their properties
+            res = content.propertyCollector.RetrievePropertiesEx([filter_spec], retr_opts)
+            mors = res.objects
+            # Results can be paginated
+            while res.token is not None:
+                res = content.propertyCollector.ContinueRetrievePropertiesEx(res.token)
+                mors.extend(res.objects)
+        finally:
+            view_ref.Destroy()
+
+        infrastructure_data = {mor.obj: {prop.name: prop.val for prop in mor.propSet} for mor in mors if mor.propSet}
+
+        root_folder = self._conn.content.rootFolder
+        infrastructure_data[root_folder] = {"name": root_folder.name, "parent": None}
+        return infrastructure_data
+
+    @smart_retry
+    def query_metrics(self, query_specs):
+        perf_manager = self._conn.content.perfManager
+        values = perf_manager.QueryPerf(query_specs)
+        return values
+
+    @smart_retry
+    def get_new_events(self, start_time):
+        event_manager = self._conn.content.eventManager
+        query_filter = vim.event.EventFilterSpec()
+        time_filter = vim.event.EventFilterSpec.ByTime(beginTime=start_time)
+        query_filter.time = time_filter
+        return event_manager.QueryEvents(query_filter)
+
+    @smart_retry
+    def get_latest_event_timestamp(self):
+        event_manager = self._conn.content.eventManager
+        return event_manager.latestEvent.createdTime
+
+    @smart_retry
+    def get_max_query_metrics(self):
+        vcenter_settings = self._conn.content.setting.QueryOptions(MAX_QUERY_METRICS_OPTION)
+        max_historical_metrics = int(vcenter_settings[0].value)
+        return max_historical_metrics if max_historical_metrics > 0 else UNLIMITED_HIST_METRICS_PER_QUERY
diff --git a/vsphere/datadog_checks/vsphere/cache.py b/vsphere/datadog_checks/vsphere/cache.py
@@ -0,0 +1,90 @@
+# (C) Datadog, Inc. 2019-present
+# All rights reserved
+# Licensed under Simplified BSD License (see LICENSE)
+import time
+from contextlib import contextmanager
+
+
+class VSphereCache(object):
+    """
+    Wraps configuration and status for the Morlist and Metadata caches.
+    VSphereCache is *not* threadsafe.
+    """
+
+    def __init__(self, interval_sec):
+        self._last_ts = 0
+        self._interval = interval_sec
+        self._content = {}
+
+    @contextmanager
+    def update(self):
+        """A context manager to allow modification of the cache. It will restore the previous value
+        on any error.
+        Usage:
+        ```
+            with cache.update():
+                cache.set_XXX(SOME_DATA)
+        ```
+        """
+        old_content = self._content
+        self._content = {}  # 1. clear the content
+        try:
+            yield  # 2. Actually update the cache
+            self._last_ts = time.time()  # 3. Cache was updated successfully
+        except Exception:
+            # Restore old data
+            self._content = old_content
+            raise
+
+    def is_expired(self):
+        """The cache has a global time to live, all elements expire at the same time.
+        :return True if the cache is expired."""
+        elapsed = time.time() - self._last_ts
+        return elapsed > self._interval
+
+
+class MetricsMetadataCache(VSphereCache):
+    """A VSphere cache dedicated to store the metrics metadata from a user environment.
+    Data is stored like this:
+
+    _content = {
+        vim.HostSystem: {
+            <COUNTER_KEY>: <DD_METRIC_NAME>,
+            ...
+        },
+        vim.VirtualMachine: {...},
+        ...
+    }
+    """
+
+    def get_metadata(self, resource_type):
+        return self._content.get(resource_type)
+
+    def set_metadata(self, resource_type, metadata):
+        self._content[resource_type] = metadata
+
+
+class InfrastructureCache(VSphereCache):
+    """A VSphere cache dedicated to store the infrastructure data from a user environment.
+    Data is stored like this:
+
+    _content = {
+        vim.VirtualMachine: {
+            <MOR_REFERENCE>: <MOR_PROPS_DICT>
+        },
+        ...
+    }
+    """
+
+    def get_mor_props(self, mor, default=None):
+        mor_type = type(mor)
+        return self._content.get(mor_type, {}).get(mor, default)
+
+    def get_mors(self, resource_type):
+        return self._content.get(resource_type, {}).keys()
+
+    def set_mor_data(self, mor, mor_data):
+        mor_type = type(mor)
+        if mor_type not in self._content:
+            self._content[mor_type] = {}
+        self._content[mor_type][mor] = mor_data