
Make queue monitor more robust
samidalouche committed Jun 11, 2024
1 parent 63509b5 commit 9fdec2e
Showing 1 changed file with 107 additions and 8 deletions.
115 changes: 107 additions & 8 deletions infra/modules/alerts-polling/main.tf
@@ -42,27 +42,127 @@ resource "aws_cloudwatch_metric_alarm" "foobar" {
}
}

resource "datadog_monitor" "queue_size_out_of_control" {
name = "[facebook-connector-${var.stage}] Queue size out of control"

resource "datadog_monitor" "queue_size_growing" {
name = "[${local.app_name}-${var.stage}] Queue size growing"
type = "metric alert"

message = <<EOF
The queue size has grown over the past hour.
The queue size out of control contains more information on the possible root cause.
EOF


query = "change(min(last_1h),last_5m):avg:facebook_connector.QueueSize{stage:${var.stage}} > 0"

monitor_thresholds {
critical = 0
#critical_recovery = 0
#warning = 0
#warning_recovery = 0
}

require_full_window = true
evaluation_delay = 900
notify_no_data = true

renotify_interval = 60
renotify_statuses = [
"alert",
"no data"
]

tags = ["application:${local.app_name}", "stage:${var.stage}"]
}
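For context on the query shared by this monitor and the next one, the sketch below (not part of the commit) names its pieces. It assumes the module's existing var.stage and the usual reading of Datadog's change() syntax, where change(min(last_1h),last_5m) compares the one-hour minimum of the series with the same aggregate taken five minutes earlier.

locals {
  # The queue-depth metric emitted by the connector, scoped to a single stage.
  queue_size_series = "avg:facebook_connector.QueueSize{stage:${var.stage}}"

  # A positive change in the rolling one-hour minimum suggests the queue floor
  # keeps rising, i.e. the queue is growing; == 0 (used below) means it is flat.
  queue_growing_query = "change(min(last_1h),last_5m):${local.queue_size_series} > 0"
}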

resource "datadog_monitor" "queue_size_not_changing" {
name = "[${local.app_name}-${var.stage}] Queue size not changing"
type = "metric alert"

message = <<EOF
The queue size has not been changing over the past hour. This is not a concern by itself
but becomes one if the queue size is not empty.
EOF


query = "change(min(last_1h),last_5m):avg:facebook_connector.QueueSize{stage:${var.stage}} == 0"

monitor_thresholds {
critical = 0
#critical_recovery = 0
#warning = 0
#warning_recovery = 0
}

require_full_window = true
evaluation_delay = 900
notify_no_data = true

renotify_interval = 60
renotify_statuses = [
"alert",
"no data"
]

tags = ["application:${local.app_name}", "stage:${var.stage}"]
}

resource "datadog_monitor" "queue_size_not_empty" {
name = "[${local.app_name}-${var.stage}] Queue size non empty for the past hour"
type = "metric alert"

message = <<EOF
The queue size has grown consistently over the past hour.
The queue size has not been empty for the past hour. This is not a concern by itself
but becomes one if the queue size is either stable or growing.
EOF


query = "min(last_1h):avg:facebook_connector.QueueSize{stage:${var.stage}} > 0"

monitor_thresholds {
critical = 0
#critical_recovery = 0
#warning = 0
#warning_recovery = 0
}

require_full_window = true
evaluation_delay = 900
notify_no_data = true

renotify_interval = 60
renotify_statuses = [
"alert",
"no data"
]

tags = ["application:${local.app_name}", "stage:${var.stage}"]
}
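The three monitors above differ only in name, message, and query; the thresholds, evaluation delay, renotification settings, and tags are repeated three times. One possible way to factor that out, sketched below purely as an alternative shape (the commit does not do this, and the composite monitor further down would then have to reference datadog_monitor.queue["growing"].id and friends), is to drive the monitors from a single map with for_each; the messages are shortened here for brevity.

locals {
  queue_monitor_definitions = {
    growing = {
      title = "Queue size growing"
      query = "change(min(last_1h),last_5m):avg:facebook_connector.QueueSize{stage:${var.stage}} > 0"
    }
    not_changing = {
      title = "Queue size not changing"
      query = "change(min(last_1h),last_5m):avg:facebook_connector.QueueSize{stage:${var.stage}} == 0"
    }
    not_empty = {
      title = "Queue size non-empty for the past hour"
      query = "min(last_1h):avg:facebook_connector.QueueSize{stage:${var.stage}} > 0"
    }
  }
}

resource "datadog_monitor" "queue" {
  for_each = local.queue_monitor_definitions

  name    = "[${local.app_name}-${var.stage}] ${each.value.title}"
  type    = "metric alert"
  message = "See the queue size out of control composite monitor for details."
  query   = each.value.query

  monitor_thresholds {
    critical = 0
  }

  require_full_window = true
  evaluation_delay = 900
  notify_no_data = true
  renotify_interval = 60
  renotify_statuses = ["alert", "no data"]

  tags = ["application:${local.app_name}", "stage:${var.stage}"]
}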

resource "datadog_monitor" "queue_size_out_of_control" {
name = "[${local.app_name}-${var.stage}] Queue size out of control"
type = "composite"

message = <<EOF
The queue size is out of control:
- growing over time, or
- stable, but never being emptied
This could indicate that:
1. The process is stuck / not making progress
2. The process is not able to keep up with the amount of data being transacted
For more details please check the Facebook Connector Dashboard:
For more details please check the Dashboard:
https://app.datadoghq.com/dashboard/h8x-qvp-nij
@slack-auto-techops
@backend-application-warnings@narrative.io
EOF


query = "change(min(last_1h),last_5m):avg:facebook_connector.QueueSize{stage:${var.stage}} > 0"
query = "${datadog_monitor.queue_size_growing.id} || (${datadog_monitor.queue_size_not_empty.id} && ${datadog_monitor.queue_size_not_changing.id})"

monitor_thresholds {
critical = 0
Expand All @@ -72,7 +172,6 @@ EOF
}

require_full_window = true
evaluation_delay = 900
notify_no_data = true

renotify_interval = 60
Expand All @@ -81,5 +180,5 @@ EOF
"no data"
]

tags = ["application:facebook-connector", "stage:${var.stage}"]
tags = ["application:${local.app_name}", "stage:${var.stage}"]
}
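Once the monitor IDs are interpolated, the composite query above evaluates to something of the form 12345 || (12346 && 12347): alert when the queue is growing, or when it is non-empty and not draining. The diff references local.app_name and var.stage without showing where they are defined; the sketch below is only a guess at what those definitions might look like elsewhere in the module, with the app_name value inferred from the hard-coded strings the commit removes.

variable "stage" {
  type        = string
  description = "Deployment stage (for example dev or prod) used to scope metrics and name monitors."
}

locals {
  # Assumed value: the commit replaces hard-coded "facebook-connector" strings
  # with this local, so it presumably resolves to the application name.
  app_name = "facebook-connector"
}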

0 comments on commit 9fdec2e
