Add "user_was_notified_of_alert_groups" metric #2334

Merged
merged 14 commits on Jun 28, 2023
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Add metric "how many alert groups user was notified of" to Prometheus exporter ([#2334](https://github.com/grafana/oncall/pull/2334/))

## v1.3.2

### Changed
46 changes: 35 additions & 11 deletions docs/sources/insights-and-metrics/_index.md
@@ -21,6 +21,8 @@ It is a gauge, and its name has the suffix `alert_groups_total`
- Response time on alert groups for each integration (mean time between the start and first action of all alert groups
for the last 7 days in selected period). It is a histogram, and its name has the suffix `alert_groups_response_time`
with the histogram suffixes such as `_bucket`, `_sum` and `_count`
- The total count of alert groups each user was notified of. It is a counter, and its name has the suffix
`user_was_notified_of_alert_groups_total`

You can find more information about metrics types in the [Prometheus documentation](https://prometheus.io/docs/concepts/metric_types).

@@ -31,15 +33,17 @@ To retrieve Prometheus metrics use PromQL. If you are not familiar with PromQL,
OnCall application metrics are collected in the preinstalled `grafanacloud_usage` datasource and are available for every
cloud instance.

Metrics have prefix `grafanacloud_oncall_instance`, e.g. `grafanacloud_oncall_instance_alert_groups_total` and
`grafanacloud_oncall_instance_alert_groups_response_time_seconds_bucket`.
Metrics have the prefix `grafanacloud_oncall_instance`, e.g. `grafanacloud_oncall_instance_alert_groups_total`,
`grafanacloud_oncall_instance_alert_groups_response_time_seconds_bucket` and
`grafanacloud_oncall_instance_user_was_notified_of_alert_groups_total`.

### For open source customers

To collect OnCall application metrics, you need to set up Prometheus and add it to your Grafana instance as a datasource.
You can find more information about Prometheus setup in the [OSS documentation](https://github.com/grafana/oncall#readme).

Metrics will have the prefix `oncall`, e.g. `oncall_alert_groups_total` and `oncall_alert_groups_response_time_seconds_bucket`.
Metrics will have the prefix `oncall`, e.g. `oncall_alert_groups_total`, `oncall_alert_groups_response_time_seconds_bucket`
and `oncall_user_was_notified_of_alert_groups_total`.
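
For example, to check how many alert groups a particular user has been notified of in an OSS installation, a query
along these lines should work (assuming the `username` label is exposed the same way as in the Cloud examples below):

```promql
oncall_user_was_notified_of_alert_groups_total{username="alex"}
```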

Your metrics may also have additional labels, such as `pod`, `instance`, `container`, depending on your Prometheus setup.

@@ -86,17 +90,37 @@ in Grafana stack "test_stack":
grafanacloud_oncall_instance_alert_groups_response_time_seconds_bucket{slug="test_stack", integration="Grafana Alerting", le="600"}
```

### Metric Alert groups user was notified of

This metric has the following labels:

| Label Name | Description |
|---------------|:-----------------------------------------------------------------------------:|
| `id` | ID of Grafana instance (stack) |
| `slug` | Slug of Grafana instance (stack) |
| `org_id` | ID of Grafana organization |
| `username` | Username of the user |

**Query example:**

Get the number of alert groups the user with username "alex" was notified of in the Grafana stack "test_stack":

```promql
grafanacloud_oncall_instance_user_was_notified_of_alert_groups_total{slug="test_stack", username="alex"}
```
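
Because this metric is a counter, it only ever goes up; to see how many alert groups the user was notified of within a
given time window, you can wrap it in `increase()`. A sketch for the last 24 hours:

```promql
increase(grafanacloud_oncall_instance_user_was_notified_of_alert_groups_total{slug="test_stack", username="alex"}[24h])
```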

### Dashboard

To import OnCall metrics dashboard go to `Administration` -> `Plugins` page, find OnCall in the plugins list, open
`Dashboards` tab at the OnCall plugin settings page and click "Import" near "OnCall metrics". After that you can find
the "OnCall metrics" dashboard in your dashboards list. In the datasource dropdown select your Prometheus datasource
(for Cloud customers it's `grafanacloud_usage`). You can filter data by your Grafana instances, teams and integrations.
You can find the "OnCall Metrics" dashboard in the list of your dashboards in the folder `General`, it has the tag
`oncall`. In the datasource dropdown select your Prometheus datasource (for Cloud customers it's `grafanacloud_usage`).
You can filter data by your Grafana instances, teams and integrations.

To re-import the OnCall metrics dashboard, go to the `Administration` -> `Plugins` page, find OnCall in the plugins list,
open the `Dashboards` tab on the OnCall plugin settings page and click "Re-import" next to "OnCall Metrics". After that
you can find the "OnCall Metrics" dashboard in your dashboards list.

To update the dashboard to the newest version go to `Dashboards` tab at the OnCall plugin settings page and click
“Re-import”.
Be aware: if you have made changes to the dashboard, they will be deleted after re-importing. To save your changes go
to the dashboard settings, click "Save as" and save a copy of the dashboard.
Be aware: if you have made changes to the dashboard, they will be lost after re-importing or after a plugin update.
To save your changes, go to the "OnCall Metrics" dashboard settings, click "Save as" and save a copy of the dashboard.

## Insight Logs

12 changes: 12 additions & 0 deletions engine/apps/alerts/tasks/notify_user.py
@@ -9,6 +9,7 @@
from apps.alerts.constants import NEXT_ESCALATION_DELAY
from apps.alerts.signals import user_notification_action_triggered_signal
from apps.base.messaging import get_messaging_backend_from_id
from apps.metrics_exporter.helpers import metrics_update_user_cache
from apps.phone_notifications.phone_backend import PhoneBackend
from common.custom_celery_tasks import shared_dedicated_queue_retry_task

@@ -186,6 +187,17 @@ def notify_user_task(
notification_channel=notification_policy.notify_by,
)
if log_record: # log_record is None if user notification policy step is unspecified
# if this is the first notification step and the user hasn't been notified for this alert group yet, update the metric
if (
log_record.type == UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED
and previous_notification_policy_pk is None
and not user.personal_log_records.filter(
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
alert_group_id=alert_group_pk,
).exists()
):
metrics_update_user_cache(user)

log_record.save()
if notify_user_task.request.retries == 0:
transaction.on_commit(lambda: send_user_notification_signal.apply_async((log_record.pk,)))
13 changes: 13 additions & 0 deletions engine/apps/metrics_exporter/constants.py
@@ -1,6 +1,8 @@
import datetime
import typing

from django.conf import settings


class AlertGroupsTotalMetricsDict(typing.TypedDict):
integration_name: str
@@ -25,6 +27,14 @@ class AlertGroupsResponseTimeMetricsDict(typing.TypedDict):
response_time: list


class UserWasNotifiedOfAlertGroupsMetricsDict(typing.TypedDict):
user_username: str
org_id: int
slug: str
id: int
counter: int


class RecalculateMetricsTimer(typing.TypedDict):
recalculate_timeout: int
forced_started: bool
@@ -35,6 +45,9 @@ class RecalculateOrgMetricsDict(typing.TypedDict):
force: bool


METRICS_PREFIX = "oncall_" if settings.IS_OPEN_SOURCE else "grafanacloud_oncall_instance_"
USER_WAS_NOTIFIED_OF_ALERT_GROUPS = METRICS_PREFIX + "user_was_notified_of_alert_groups"

ALERT_GROUPS_TOTAL = "oncall_alert_groups_total"
ALERT_GROUPS_RESPONSE_TIME = "oncall_alert_groups_response_time_seconds"

54 changes: 54 additions & 0 deletions engine/apps/metrics_exporter/helpers.py
@@ -17,8 +17,11 @@
METRICS_RECALCULATION_CACHE_TIMEOUT,
METRICS_RECALCULATION_CACHE_TIMEOUT_DISPERSE,
METRICS_RESPONSE_TIME_CALCULATION_PERIOD,
USER_WAS_NOTIFIED_OF_ALERT_GROUPS,
AlertGroupsResponseTimeMetricsDict,
AlertGroupsTotalMetricsDict,
RecalculateMetricsTimer,
UserWasNotifiedOfAlertGroupsMetricsDict,
)

if typing.TYPE_CHECKING:
@@ -46,6 +49,27 @@ def get_organization_ids():
return organizations_ids


def is_allowed_to_start_metrics_calculation(organization_id, force=False) -> bool:
"""Check if metrics_cache_timer doesn't exist or if recalculation was started by force."""
recalculate_timeout = get_metrics_recalculation_timeout()
metrics_cache_timer_key = get_metrics_cache_timer_key(organization_id)
metrics_cache_timer = cache.get(metrics_cache_timer_key)
if metrics_cache_timer:
if not force or metrics_cache_timer.get("forced_started", False):
return False
else:
metrics_cache_timer["forced_started"] = True
else:
metrics_cache_timer: RecalculateMetricsTimer = {
"recalculate_timeout": recalculate_timeout,
"forced_started": force,
}

metrics_cache_timer["recalculate_timeout"] = recalculate_timeout
cache.set(metrics_cache_timer_key, metrics_cache_timer, timeout=recalculate_timeout)
return True


def get_response_time_period() -> datetime.datetime:
"""Returns period for response time calculation"""
return timezone.now() - METRICS_RESPONSE_TIME_CALCULATION_PERIOD
@@ -87,6 +111,14 @@ def get_metric_alert_groups_response_time_key(organization_id) -> str:
return f"{ALERT_GROUPS_RESPONSE_TIME}_{organization_id}"


def get_metric_user_was_notified_of_alert_groups_key(organization_id) -> str:
return f"{USER_WAS_NOTIFIED_OF_ALERT_GROUPS}_{organization_id}"


def get_metric_calculation_started_key(metric_name) -> str:
return f"calculation_started_for_{metric_name}"


def metrics_update_integration_cache(integration: "AlertReceiveChannel") -> None:
"""Update integration data in metrics cache"""
metrics_cache_timeout = get_metrics_cache_timeout(integration.organization_id)
@@ -239,3 +271,25 @@ def metrics_update_alert_groups_response_time_cache(integrations_response_time,
continue
integration_response_time_metrics["response_time"].extend(integration_response_time)
cache.set(metric_alert_groups_response_time_key, metric_alert_groups_response_time, timeout=metrics_cache_timeout)


def metrics_update_user_cache(user):
"""Update "user_was_notified_of_alert_groups" metric cache."""
metrics_cache_timeout = get_metrics_cache_timeout(user.organization_id)
metric_user_was_notified_key = get_metric_user_was_notified_of_alert_groups_key(user.organization_id)
metric_user_was_notified: typing.Dict[int, UserWasNotifiedOfAlertGroupsMetricsDict] = cache.get(
metric_user_was_notified_key, {}
)

metric_user_was_notified.setdefault(
user.id,
{
"user_username": user.username,
"org_id": user.organization.org_id,
"slug": user.organization.stack_slug,
"id": user.organization.stack_id,
"counter": 0,
},
)["counter"] += 1

cache.set(metric_user_was_notified_key, metric_user_was_notified, timeout=metrics_cache_timeout)