Merge pull request #16 from evidentid/iops-alerts

Add AWS RDS Read/Write IOPS alarms
lorenzoaiello · Feb 17, 2024 · 54e9f42 · 54e9f42
2 parents 605f984 + 1c7dc5f
commit 54e9f42
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 12 deletions.
diff --git a/main.tf b/main.tf
@@ -201,3 +201,42 @@ resource "aws_cloudwatch_metric_alarm" "maximum_used_transaction_ids_too_high" {
   alarm_actions       = var.actions_alarm
   ok_actions          = var.actions_ok
 }
+
+# SOC2 requirements
+resource "aws_cloudwatch_metric_alarm" "read_iops_too_high" { # Optional alarm: average ReadIOPS above var.read_iops_too_high_threshold
+  count               = var.create_read_iops_alarm ? 1 : 0 # created only when the toggle is true
+  alarm_name          = "${var.prefix}rds-${var.db_instance_id}-read-iops-too-high"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = var.evaluation_period
+  metric_name         = "ReadIOPS"
+  namespace           = "AWS/RDS"
+  period              = var.statistic_period
+  statistic           = "Average"
+  threshold           = var.read_iops_too_high_threshold
+  alarm_description   = "Average Read IO over last ${(var.evaluation_period * var.statistic_period / 60)} minutes too high, performance may suffer"
+  alarm_actions       = var.actions_alarm
+  ok_actions          = var.actions_ok
+
+  dimensions = {
+    DBInstanceIdentifier = var.db_instance_id # bare instance id only: a suffixed value matches no AWS/RDS metric, so the alarm would never leave INSUFFICIENT_DATA
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "write_iops_too_high" { # Optional alarm: average WriteIOPS above var.write_iops_too_high_threshold
+  count               = var.create_write_iops_alarm ? 1 : 0 # created only when the toggle is true
+  alarm_name          = "${var.prefix}rds-${var.db_instance_id}-write-iops-too-high"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = var.evaluation_period
+  metric_name         = "WriteIOPS"
+  namespace           = "AWS/RDS"
+  period              = var.statistic_period
+  statistic           = "Average"
+  threshold           = var.write_iops_too_high_threshold
+  alarm_description   = "Average Write IO over last ${(var.evaluation_period * var.statistic_period / 60)} minutes too high, performance may suffer"
+  alarm_actions       = var.actions_alarm
+  ok_actions          = var.actions_ok
+
+  dimensions = {
+    DBInstanceIdentifier = var.db_instance_id # bare instance id only: the alarm name is not an instance id and matches no AWS/RDS metric, so the alarm would never leave INSUFFICIENT_DATA
+  }
+}
diff --git a/outputs.tf b/outputs.tf
@@ -1,6 +1,6 @@
 output "alarm_cpu_utilization_too_high" {
   # For older terraform support...
-  value       = var.create_high_cpu_alarm ? aws_cloudwatch_metric_alarm.cpu_utilization_too_high[0] : null
+  value = var.create_high_cpu_alarm ? aws_cloudwatch_metric_alarm.cpu_utilization_too_high[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.cpu_utilization_too_high.*)
   description = "The CloudWatch Metric Alarm resource block for high CPU Utilization"
@@ -16,47 +16,47 @@ output "alarm_cpu_credit_balance_too_low" {
 
 output "alarm_disk_queue_depth_too_high" {
   # For older terraform support...
-  value       = var.create_high_queue_depth_alarm ? aws_cloudwatch_metric_alarm.disk_queue_depth_too_high[0] : null
+  value = var.create_high_queue_depth_alarm ? aws_cloudwatch_metric_alarm.disk_queue_depth_too_high[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.disk_queue_depth_too_high.*)
   description = "The CloudWatch Metric Alarm resource block for high Disk Queue Depth"
 }
 
 output "alarm_disk_free_storage_space_too_low" {
   # For older terraform support...
-  value       = var.create_low_disk_space_alarm ? aws_cloudwatch_metric_alarm.disk_free_storage_space_too_low[0] : null
+  value = var.create_low_disk_space_alarm ? aws_cloudwatch_metric_alarm.disk_free_storage_space_too_low[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.disk_free_storage_space_too_low.*)
   description = "The CloudWatch Metric Alarm resource block for low Free Storage Space"
 }
 
 output "alarm_disk_burst_balance_too_low" {
   # For older terraform support...
-  value       = var.create_low_disk_burst_alarm ? aws_cloudwatch_metric_alarm.disk_burst_balance_too_low[0] : null
+  value = var.create_low_disk_burst_alarm ? aws_cloudwatch_metric_alarm.disk_burst_balance_too_low[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.disk_burst_balance_too_low.*)
   description = "The CloudWatch Metric Alarm resource block for low Disk Burst Balance"
 }
 
 output "alarm_memory_freeable_too_low" {
   # For older terraform support...
-  value       = var.create_low_memory_alarm ? aws_cloudwatch_metric_alarm.memory_freeable_too_low[0] : null
+  value = var.create_low_memory_alarm ? aws_cloudwatch_metric_alarm.memory_freeable_too_low[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.memory_freeable_too_low.*)
   description = "The CloudWatch Metric Alarm resource block for low Freeable Memory"
 }
 
 output "alarm_memory_swap_usage_too_high" {
   # For older terraform support...
-  value       = var.create_swap_alarm ? aws_cloudwatch_metric_alarm.memory_swap_usage_too_high[0] : null
+  value = var.create_swap_alarm ? aws_cloudwatch_metric_alarm.memory_swap_usage_too_high[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.memory_swap_usage_too_high.*)
   description = "The CloudWatch Metric Alarm resource block for high Memory Swap Usage"
 }
 
 output "alarm_connection_count_anomalous" {
   # For older terraform support...
-  value       = var.create_anomaly_alarm ? aws_cloudwatch_metric_alarm.connection_count_anomalous[0] : null
+  value = var.create_anomaly_alarm ? aws_cloudwatch_metric_alarm.connection_count_anomalous[0] : null
   # For Terraform 0.15+, eventually use this much nicer code instead...
   # value       = one(aws_cloudwatch_metric_alarm.connection_count_anomalous.*)
   description = "The CloudWatch Metric Alarm resource block for anomalous Connection Count"

diff --git a/variables.tf b/variables.tf
@@ -69,6 +69,18 @@ variable "create_anomaly_alarm" {
   description = "Whether or not to create the fairly noisy anomaly alarm.  Default is to create it (for backwards compatible support), but recommended to disable this for non-production databases"
 }
 
+variable "create_read_iops_alarm" {
+  type        = bool
+  default     = true
+  description = "Whether or not to create the Read IOPS too high alarm. Default is to create it."
+}
+
+variable "create_write_iops_alarm" {
+  type        = bool
+  default     = true
+  description = "Whether or not to create the Write IOPS too high alarm. Default is to create it."
+}
+
 variable "anomaly_period" {
   type        = string
   default     = "600"
@@ -82,13 +94,13 @@ variable "anomaly_band_width" {
 }
 
 variable "actions_alarm" {
-  type        = list
+  type        = list(any)
   default     = []
   description = "A list of actions to take when alarms are triggered. Will likely be an SNS topic for event distribution."
 }
 
 variable "actions_ok" {
-  type        = list
+  type        = list(any)
   default     = []
   description = "A list of actions to take when alarms are cleared. Will likely be an SNS topic for event distribution."
 }
@@ -135,21 +147,33 @@ variable "memory_swap_usage_too_high_threshold" {
   description = "Alarm threshold for the 'highSwapUsage' alarm"
 }
 
+variable "read_iops_too_high_threshold" {
+  type        = string
+  default     = "100"
+  description = "Alarm threshold for the 'read-iops-too-high' alarm"
+}
+
+variable "write_iops_too_high_threshold" {
+  type        = string
+  default     = "10000"
+  description = "Alarm threshold for the 'write-iops-too-high' alarm"
+}
+
 variable "tags" {
   type        = map(string)
   default     = {}
   description = "Tags to attach to each alarm"
 }
 
 variable "db_instance_class" {
-  type      = string
+  type        = string
   description = "The rds instance class, e.g. db.t3.medium"
 }
 
 variable "engine" {
-  type = string
+  type        = string
   description = "The RDS engine being used. Used for postgres or mysql specific alarms"
-  default = ""
+  default     = ""
 }
 
 variable "maximum_used_transaction_ids_too_high_threshold" {