-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Trigger alert for vol and brick failures #287
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import json | ||
import os | ||
import re | ||
import socket | ||
import subprocess | ||
|
||
from tendrl.gluster_integration.sds_sync import brick_utilization | ||
|
@@ -12,6 +13,8 @@ | |
|
||
from tendrl.commons import sds_sync | ||
from tendrl.gluster_integration import ini2json | ||
from tendrl.commons.utils.time_utils import now as tendrl_now | ||
|
||
|
||
|
||
class GlusterIntegrationSdsSyncStateThread(sds_sync.SdsSyncThread): | ||
|
@@ -20,6 +23,37 @@ def __init__(self): | |
super(GlusterIntegrationSdsSyncStateThread, self).__init__() | ||
self._complete = gevent.event.Event() | ||
|
||
def _emit_event(self, resource, curr_value, msg): | ||
alert = {} | ||
alert['source'] = 'tendrl-gluster-integration' | ||
alert['pid'] = os.getpid() | ||
alert['timestamp'] = tendrl_now().isoformat() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be |
||
alert['alert_type'] = 'status' | ||
severity = "info" | ||
if curr_value == "Stopped": | ||
severity = "critical" | ||
alert['severity'] = severity | ||
alert['resource'] = resource | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Resource name should be something like You can refer to Tendrl/notifier#78 for more details. If required, a separate handler is to be written for brick status changes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The resource should be the name of the object as defined in Tendrl definitions, i.e. Volume, and the other part of the resource should be the attribute on which the alert is triggered, e.g. Volume.status There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @r0h4n The alerting module expects it as volume_status or volume_utilization, so basically it's the combination of <entity_type>_<alert_type> where alert_type is either status or utilization... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The alerting module does need to know about the object definition and attributes. Tendrl will be generating alerts on specific attributes of a Tendrl object. But if you are expecting an underscore between the entity/object and alert/attribute we can continue with below scheme _ works? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, volume_status works with alerting as it stands today... |
||
alert['current_value'] = curr_value | ||
alert['tags'] = dict( | ||
message=msg, | ||
cluster_id=NS.tendrl_context.cluster_id, | ||
cluster_name=NS.tendrl_context.cluster_name, | ||
sds_name=NS.tendrl_context.sds_name, | ||
fqdn=socket.getfqdn() | ||
) | ||
alert['node_id'] = NS.node_context.node_id | ||
if not NS.node_context.node_id: | ||
return | ||
Event( | ||
Message( | ||
"notice", | ||
"alerting", | ||
{'message': json.dumps(alert)}, | ||
node_id=NS.node_context.node_id | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This (node_id) is not required; it will be picked up automatically by the Message class. |
||
) | ||
) | ||
|
||
def _run(self): | ||
Event( | ||
Message( | ||
|
@@ -82,8 +116,35 @@ def _run(self): | |
if "Volumes" in raw_data: | ||
index = 1 | ||
volumes = raw_data['Volumes'] | ||
node_context = NS.node_context.load() | ||
tag_list = list(node_context.tags) | ||
while True: | ||
try: | ||
# Raise alerts for volume state change. | ||
cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id | ||
if cluster_provisioner in tag_list: | ||
try: | ||
stored_volume_status = NS._int.client.read( | ||
"clusters/%s/Volumes/%s/status" % ( | ||
NS.tendrl_context.integration_id, | ||
volumes['volume%s.id' % index] | ||
) | ||
).value | ||
current_status = volumes['volume%s.status' % index] | ||
if current_status != stored_volume_status: | ||
msg = "Status of volume: %s changed from %s to %s" % ( | ||
volumes['volume%s.name' % index], | ||
stored_volume_status, | ||
current_status | ||
) | ||
self._emit_event( | ||
volumes['volume%s.name' % index], | ||
current_status, | ||
msg) | ||
|
||
except etcd.EtcdKeyNotFound: | ||
pass | ||
|
||
volume = NS.gluster.objects.Volume( | ||
vol_id=volumes[ | ||
'volume%s.id' % index | ||
|
@@ -187,6 +248,7 @@ def _run(self): | |
hostname not in network_ip): | ||
b_index += 1 | ||
continue | ||
|
||
sub_vol_size = (int(volumes['volume%s.brickcount' % index])) / int( | ||
volumes[ | ||
'volume%s.subvol_count' % index | ||
|
@@ -198,6 +260,40 @@ def _run(self): | |
) | ||
].split(":")[-1].replace("/","_") | ||
|
||
# Raise alerts if the brick status changes | ||
try: | ||
stored_brick_status = NS._int.client.read( | ||
"clusters/%s/Bricks/all/%s/status" % ( | ||
NS.tendrl_context.integration_id, | ||
brick_name | ||
) | ||
).value | ||
current_status = volumes.get( | ||
'volume%s.brick%s.status' % ( | ||
index, | ||
b_index | ||
) | ||
) | ||
if current_status != stored_brick_status: | ||
msg = "Status of brick: %s under volume %s changed from %s to %s" % ( | ||
volumes[ | ||
'volume%s.brick%s.path' % ( | ||
index, b_index | ||
) | ||
], | ||
volumes['volume%s.name' % index], | ||
stored_brick_status, | ||
current_status | ||
) | ||
self._emit_event( | ||
volumes['volume%s.name' % index], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For cases like brick, where info about both its volume and its cluster is important, have the resource type as brick_status and have an additional parameter under tags called plugin_instance, with its value in accordance with https://github.com/Tendrl/node-monitoring/blob/develop/tendrl/node_monitoring/plugins/tendrl_glusterfs_brick_utilization.py#L286 Now, the reason for not having additional fields like vol_name and brick_path under the tags dict is that collectd does not allow us to have custom additional fields in tags, as the tags attribute for collectd-generated alerts comes directly from collectd based on how the plugin is configured (an example is the above link), and only a few reserved fields can be played around with, which leaves plugin_instance as the best attribute choice left... |
||
current_status, | ||
msg | ||
) | ||
|
||
except etcd.EtcdKeyNotFound: | ||
pass | ||
|
||
vol_brick_path = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s" % ( | ||
NS.tendrl_context.integration_id, | ||
volumes['volume%s.id' % index], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use NS.publisher_id