Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IMPROVEMENT] Tracking for failed alerts #200

Merged
merged 1 commit into from
Dec 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions promgen/management/commands/test-alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,31 @@

from django.core.management.base import BaseCommand
from django.test import override_settings

from promgen import models, tasks, tests


class Command(BaseCommand):
data = tests.PromgenTest.data_json("examples", "alertmanager.json")

def add_arguments(self, parser):
parser.add_argument("--shard", default="Test Shard")
parser.add_argument("--service", default=self.data["commonLabels"]["service"])
parser.add_argument("--project", default=self.data["commonLabels"]["project"])

@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def handle(self, **kwargs):
def handle(self, shard, service, project, **kwargs):
logging._handlers = []
logging.basicConfig(level=logging.DEBUG)

data = tests.PromgenTest.data_json('examples', 'alertmanager.json')

shard, _ = models.Shard.objects.get_or_create(name='Shard Test')
service, _ = models.Service.objects.get_or_create(
shard=shard, name=data['commonLabels']['service']
)
shard, _ = models.Shard.objects.get_or_create(name=shard)
service, _ = models.Service.objects.get_or_create(name=service)
project, _ = models.Project.objects.get_or_create(
service=service, name=data['commonLabels']['project']
name=project, defaults={"shard": shard, "service": service}
)

alert = models.Alert.objects.create(
body=json.dumps(data)
)
alert = models.Alert.objects.create(body=json.dumps(self.data), error_count=1)

tasks.process_alert(alert.pk)

alert.alerterror_set.create(message="Test from CLI")
33 changes: 33 additions & 0 deletions promgen/migrations/0011_notifier_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 2.2.4 on 2019-11-28 02:21

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
('promgen', '0010_app_label_migration'),
]

operations = [
migrations.AddField(
model_name='alert',
name='error_count',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='alert',
name='sent_count',
field=models.PositiveIntegerField(default=0),
),
migrations.CreateModel(
name='AlertError',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('message', models.TextField()),
('created', models.DateTimeField(default=django.utils.timezone.now)),
('alert', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='promgen.Alert')),
],
),
]
8 changes: 8 additions & 0 deletions promgen/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,8 @@ class RuleAnnotation(models.Model):
class Alert(models.Model):
created = models.DateTimeField(default=timezone.now)
body = models.TextField()
sent_count = models.PositiveIntegerField(default=0)
error_count = models.PositiveIntegerField(default=0)

def get_absolute_url(self):
return reverse("alert-detail", kwargs={"pk": self.pk})
Expand Down Expand Up @@ -531,6 +533,12 @@ def json(self):
return json.loads(self.body)


class AlertError(models.Model):
alert = models.ForeignKey(Alert, on_delete=models.CASCADE)
created = models.DateTimeField(default=timezone.now)
message = models.TextField()


class Audit(models.Model):
body = models.TextField()
created = models.DateTimeField()
Expand Down
11 changes: 9 additions & 2 deletions promgen/notification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,21 @@
import logging
import textwrap

from promgen import util

from django import forms
from django.template.loader import render_to_string

from promgen import plugins, util

logger = logging.getLogger(__name__)


def load(name):
for driver in plugins.notifications():
if name == driver.module_name:
return driver.load()()
raise ImportError("Unknown notification plugin %s" % name)


class FormSenderBase(forms.Form):
value = forms.CharField(required=True)
alias = forms.CharField(required=False)
Expand Down
30 changes: 23 additions & 7 deletions promgen/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

from atomicwrites import atomic_write
from celery import shared_task
from django.conf import settings
from promgen import models, plugins, prometheus, util

from promgen import models, prometheus, util, notification

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -62,13 +62,29 @@ def send_alert(sender, target, data, alert_pk=None):
"""
Send alert to specific target

alert_pk is used for debugging purposes
alert_pk is used when quering our alert normally and is missing
when we send a test message. In the case we send a test message
we want to raise any exceptions so that the test function can
handle it
"""
logger.debug("Sending %s %s", sender, target)
for plugin in plugins.notifications():
if sender == plugin.module_name:
instance = plugin.load()()
instance._send(target, data)
try:
notifier = notification.load(sender)
notifier._send(target, data)
except ImportError:
logging.exception("Error loading plugin %s", sender)
if alert_pk is None:
raise
except Exception as e:
logging.exception("Error sending notification")
if alert_pk:
util.inc_for_pk(models.Alert, pk=alert_pk, error_count=1)
models.AlertError.objects.create(alert_id=alert_pk, message=str(e))
else:
raise
else:
if alert_pk:
util.inc_for_pk(models.Alert, pk=alert_pk, sent_count=1)


@shared_task
Expand Down
10 changes: 10 additions & 0 deletions promgen/templates/promgen/alert_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,15 @@
{% block content %}
{% breadcrumb alert %}

<ul class="list-unstyled">
{% for error in alert.alerterror_set.all %}
<li class="alert alert-danger" role="alert">
<span class="glyphicon glyphicon-alert" aria-hidden="true"></span>
{{error.message}}
<span class="pull-right">{{error.created}}</span>
</li>
{% endfor %}
</ul>

<pre>{{alert.json|pretty_json}}</pre>
{% endblock %}
8 changes: 7 additions & 1 deletion promgen/templates/promgen/alert_list.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{% extends "base.html" %}

{% load promgen %}
{% block content %}

{% breadcrumb label="Alerts" %}

{% include 'promgen/pagination_short.html' %}

<table class="table">
Expand All @@ -12,6 +14,8 @@
<th>Service</th>
<th>Project</th>
<th>Job</th>
<th>Sent</th>
<th>Error</th>
</tr>
{% for alert in alert_list %}
<tr>
Expand All @@ -21,6 +25,8 @@
<td>{{alert.json.commonLabels.service}}</td>
<td>{{alert.json.commonLabels.project}}</td>
<td>{{alert.json.commonLabels.job}}</td>
<td {% if alert.sent_count == 0 %}class="warning" {% endif %}>{{alert.sent_count}}</td>
<td {% if alert.error_count %}class="danger" {% endif %}>{{alert.error_count}}</td>
</tr>
{% endfor %}
</table>
Expand Down
5 changes: 4 additions & 1 deletion promgen/templates/promgen/navbar.html
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Misc <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="{% url 'audit-list' %}">Log</a></li>
<li><a href="{% url 'audit-list' %}">Edit History</a></li>
<li><a href="{% url 'alert-list' %}">Alert History</a></li>
<li role="separator" class="divider"></li>

<li><a href="{% url 'farm-list' %}">Farms</a></li>
<li><a href="{% url 'host-list' %}">Hosts</a></li>
<li><a href="{% url 'shard-list' %}">Shard</a></li>
Expand Down
22 changes: 22 additions & 0 deletions promgen/tests/test_webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,25 @@ def test_filter(self, mock_post):

self.assertCount(models.Alert, 1, "Alert should be queued")
self.assertEqual(mock_post.call_count, 1, "One notification should be skipped")

@override_settings(PROMGEN=TEST_SETTINGS)
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
@override_settings(CELERY_TASK_EAGER_PROPAGATES=True)
@mock.patch("promgen.util.post")
def test_failure(self, mock_post):
# When our post results in a failure, then our error_count should be
# properly updated and some errors should be logged to be viewed later
mock_post.side_effect = Exception("Boom!")

response = self.client.post(
reverse("alert"), data=TEST_ALERT, content_type="application/json"
)

self.assertRoute(response, views.Alert, 202)
self.assertCount(models.Alert, 1, "Alert should be queued")
self.assertEqual(mock_post.call_count, 2, "Two posts should be attempted")
self.assertCount(models.AlertError, 2, "Two errors should be logged")

alert = models.Alert.objects.first()
self.assertEqual(alert.sent_count, 0, "No successful sent")
self.assertEqual(alert.error_count, 2, "Error incremented")
7 changes: 6 additions & 1 deletion promgen/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# These sources are released under the terms of the MIT license: see LICENSE

import requests.sessions

from django.db.models import F
from promgen.version import __version__

from django.conf import settings
Expand Down Expand Up @@ -61,3 +61,8 @@ def __init__(self, model):

def __getattr__(self, name):
return self.model._meta.get_field(name).help_text


def inc_for_pk(model, pk, **kwargs):
# key=F('key') + value
model.objects.filter(pk=pk).update(**{key: F(key) + kwargs[key] for key in kwargs})
13 changes: 11 additions & 2 deletions promgen/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,7 @@ def post(self, request, pk):
sender = get_object_or_404(models.Sender, id=pk)
try:
sender.test()
except:
logger.exception('Error sending test message with %s', sender.sender)
except Exception:
messages.warning(request, 'Error sending test message with ' + sender.sender)
else:
messages.info(request, 'Sent test message with ' + sender.sender)
Expand Down Expand Up @@ -1101,6 +1100,16 @@ def collect(self):
except models.Alert.DoesNotExist:
pass

try:
yield CounterMetricFamily(
"promgen_alerts_failed",
"Failed Alerts",
models.AlertError.objects.latest("id").id,
)
except models.AlertError.DoesNotExist:
pass


yield GaugeMetricFamily(
"promgen_shards", "Registered Shards", models.Shard.objects.count()
)
Expand Down