-
Notifications
You must be signed in to change notification settings - Fork 0
/
alerts.tf
121 lines (113 loc) · 4.97 KB
/
alerts.tf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Looks up the EC2 instance type that corresponds to the RDS instance class
# (var.instance_class without its "db." prefix). Its memory_size attribute is
# used elsewhere in this file to derive default alarm thresholds.
data "aws_ec2_instance_type" "this" {
  # trimprefix removes only the literal leading "db.". trim() would instead treat
  # "db." as a set of characters and strip any of 'd', 'b', '.' from BOTH ends.
  instance_type = trimprefix(var.instance_class, "db.")
}
# Reads the attributes of the RDS instance identified by var.identifier.
# The explicit dependency on module.db (presumably the module that creates the
# instance) makes Terraform resolve this lookup only after that module.
data "aws_db_instance" "database" {
  depends_on             = [module.db]
  db_instance_identifier = var.identifier
}
locals {
  // Default threshold for the slow-queries alarm.
  // For postgres the SampleCount statistic reports 2 more than the real count,
  // so 7 stands for 5 real slow queries (5 + 2); every other engine uses 5 as is.
  slow_queries_alert_threshold = var.engine != "postgres" ? 5 : 7
}
# CloudWatch alarms for the RDS instance var.identifier, created only when
# var.alarms.enabled is true and delivered to var.alarms.sns_topic.
# Every setting can be overridden through var.alarms.custom_values.<group>.<key>;
# try() falls back to the default shown when the override is not set.
module "cw_alerts" {
  count = var.alarms.enabled ? 1 : 0

  source  = "dasmeta/monitoring/aws//modules/alerts"
  version = "1.3.5"

  sns_topic = var.alarms.sns_topic
  alerts = concat([
    {
      name   = "DB: High CPU Utilization Alert on Instance ${var.identifier}"
      source = "AWS/RDS/CPUUtilization"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      statistic = try(var.alarms.custom_values.cpu.statistic, "avg")
      threshold = try(var.alarms.custom_values.cpu.threshold, "90") # percent
      period    = try(var.alarms.custom_values.cpu.period, "300")
    },
    {
      name   = "DB: Low EBS IO Balance Percentage on Instance ${var.identifier}"
      source = "AWS/RDS/EBSIOBalance%"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      period    = try(var.alarms.custom_values.ebs.IObalance.period, "1800")
      threshold = try(var.alarms.custom_values.ebs.IObalance.threshold, "10") # percent
      equation  = try(var.alarms.custom_values.ebs.IObalance.equation, "lt")
      statistic = try(var.alarms.custom_values.ebs.IObalance.statistic, "avg")
    },
    {
      name   = "DB: Low Freeable Memory Alert on Instance ${var.identifier}"
      source = "AWS/RDS/FreeableMemory"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      period = try(var.alarms.custom_values.memory.period, "1800")
      # default: 5% of the instance type's memory_size, scaled by 1024 * 1024
      # (memory_size is reported in MiB per the AWS provider docs, giving bytes)
      threshold = try(var.alarms.custom_values.memory.threshold, data.aws_ec2_instance_type.this.memory_size * 0.05 * 1024 * 1024)
      equation  = try(var.alarms.custom_values.memory.equation, "lt")
      statistic = try(var.alarms.custom_values.memory.statistic, "avg")
    },
    {
      name   = "DB: High Read Latency Detected on Instance ${var.identifier}"
      source = "AWS/RDS/ReadLatency"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      period    = try(var.alarms.custom_values.network.read.period, "60")
      threshold = try(var.alarms.custom_values.network.read.threshold, "1")
      equation  = try(var.alarms.custom_values.network.read.equation, "gte")
      statistic = try(var.alarms.custom_values.network.read.statistic, "avg")
    },
    {
      name   = "DB: High Write Latency Detected on Instance ${var.identifier}"
      source = "AWS/RDS/WriteLatency"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      period    = try(var.alarms.custom_values.network.write.period, "60")
      threshold = try(var.alarms.custom_values.network.write.threshold, "1")
      equation  = try(var.alarms.custom_values.network.write.equation, "gte")
      statistic = try(var.alarms.custom_values.network.write.statistic, "avg")
    },
    {
      name   = "DB: High Database Connection Usage on Instance ${var.identifier}"
      source = "AWS/RDS/DatabaseConnections"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      # considering https://aws.amazon.com/premiumsupport/knowledge-center/rds-mysql-max-connections/; expecting that only 80% of memory is used for PostgreSQL; warn at 80% connection usage
      # Estimated max connections = min(usable memory in bytes / 9531392, 5000).
      # The 80% warning factor is applied AFTER the 5000 cap, so large instances
      # warn at 4000 connections instead of only at the cap itself.
      # NOTE(review): 9531392 and the 5000 cap look like the PostgreSQL formula;
      # confirm whether other engines need a different default.
      period = try(var.alarms.custom_values.connections.period, "60")
      threshold = try(
        var.alarms.custom_values.connections.threshold,
        ceil(0.8 * min(data.aws_ec2_instance_type.this.memory_size * 0.8 * 1024 * 1024 / 9531392, 5000))
      )
      statistic = try(var.alarms.custom_values.connections.statistic, "avg")
    },
    {
      name   = "DB: Low Free Storage Space on Instance ${var.identifier}"
      source = "AWS/RDS/FreeStorageSpace"
      filters = {
        DBInstanceIdentifier = var.identifier
      }
      period    = try(var.alarms.custom_values.disk.period, "300")
      threshold = try(var.alarms.custom_values.disk.threshold, data.aws_db_instance.database.allocated_storage * 0.1 * 1024 * 1024 * 1024) # 10% of storage in bytes
      equation  = try(var.alarms.custom_values.disk.equation, "lte")
      statistic = try(var.alarms.custom_values.disk.statistic, "avg")
    },
    ],
    // With the defaults, this alarm fires when there are 5 or more slow queries within 5 minutes.
    // It is only added when var.slow_queries.enabled is true.
    var.slow_queries.enabled ? [
      {
        name      = "DB: Excessive Slow Queries on Instance ${var.identifier}"
        source    = "RDSLogBasedMetrics/${var.identifier}-RDSSlowQueries"
        filters   = {}
        period    = try(var.alarms.custom_values.slow-queries.period, "300")
        threshold = try(var.alarms.custom_values.slow-queries.threshold, local.slow_queries_alert_threshold)
        equation  = try(var.alarms.custom_values.slow-queries.equation, "gte")
        statistic = try(var.alarms.custom_values.slow-queries.statistic, "count")
      }
    ] : []
  )

  depends_on = [
    module.db
  ]
}