KANBAN-305 Updated rules for non-production server and committed older production rule. #55

Merged · 2 commits · Jan 12, 2023
hosts (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ localhost
 34.207.206.156
 
 [production]
-54.208.219.17
+54.235.4.35
 
 [logs]
 54.91.196.72
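Note: with the `[production]` host swapped to 54.235.4.35, it is worth confirming the new box is reachable before re-running the playbook. A minimal check, assuming SSH access to the host is already configured for this inventory:

```sh
# Ad-hoc ping of every host in the [production] group of this inventory.
ansible production -i hosts -m ping
```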
templates/alertmanager.yml.j2 (45 changes: 43 additions & 2 deletions)
@@ -3,10 +3,51 @@ global:
   slack_api_url: '{{ SLACK_ALERT_WEBHOOK }}'
 
 route:
-  receiver: 'slack-notifications'
+  # Default receiver
+  receiver: 'slack-warnings'
+  routes:
+    - receiver: 'slack-warnings'
+      continue: true
+      match_re:
+        severity: warning
+    - receiver: 'slack-alerts'
+      continue: true
+      match_re:
+        severity: critical
 
 receivers:
-  - name: 'slack-notifications'
+  - name: 'slack-warnings'
     slack_configs:
       - channel: '#system-warnings'
         send_resolved: true
+        icon_url: https://avatars3.githubusercontent.com/u/3380462
+        # text: "summary: {% raw %}{{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}{% endraw %}"
+{% raw %}
+        title: |-
+          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+          {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+          {{" "}}(
+          {{- with .CommonLabels.Remove .GroupLabels.Names }}
+          {{- range $index, $label := .SortedPairs -}}
+          {{ if $index }}, {{ end }}
+          {{- $label.Name }}="{{ $label.Value -}}"
+          {{- end }}
+          {{- end -}}
+          )
+          {{- end }}
+        text: >-
+          {{ range .Alerts -}}
+          *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+          *Description:* {{ .Annotations.description }}
+
+          *Details:*
+          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+          {{ end }}
+          {{ end }}
+{% endraw %}
+
+  - name: 'slack-alerts'
+    slack_configs:
+      - channel: '#system-alerts'
+        send_resolved: true
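Because both child routes set `continue: true`, an alert is checked against each matcher in turn: `severity: warning` lands in `#system-warnings`, `severity: critical` in `#system-alerts`, and anything without a matching severity falls back to the default `slack-warnings` receiver. One way to sanity-check the routing tree is with `amtool` (a sketch, assuming the tool is installed and the template has already been rendered to a plain `alertmanager.yml`):

```sh
# Validate the rendered Alertmanager config file.
amtool check-config alertmanager.yml

# Show which receiver(s) an alert with severity=critical would be routed to.
amtool config routes test --config.file=alertmanager.yml severity=critical
```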
templates/rules.yml.j2 (18 changes: 4 additions & 14 deletions)
@@ -20,16 +20,6 @@ groups:
           summary: Prometheus too many restarts (instance {{ $labels.instance }})
           description: Prometheus has restarted more than twice in the last 15 minutes. It might be crash-looping. VALUE = {{ $value }}
 
-  - name: Endpoints
-    rules:
-      - alert: EndpointDown
-        expr: probe_success == 0
-        for: 10s
-        labels:
-          severity: "critical"
-        annotations:
-          summary: "Endpoint {{ $labels.instance }} down"
-
   - name: Hardware
     rules:
       - alert: HostOutOfMemory
@@ -105,7 +95,7 @@
         expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
         for: 2m
         labels:
-          severity: critical
+          severity: warning
         annotations:
           summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
           description: Filesystem is predicted to run out of space within the next 24 hours at current write rate. VALUE = {{ $value }}
@@ -123,7 +113,7 @@
         expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
         for: 2m
         labels:
-          severity: critical
+          severity: warning
         annotations:
           summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
           description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate. VALUE = {{ $value }}
@@ -219,7 +209,7 @@
         expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
         for: 2m
         labels:
-          severity: critical
+          severity: warning
         annotations:
           summary: Container Volume usage (instance {{ $labels.instance }})
           description: Container Volume usage is above 80%. VALUE = {{ $value }}
@@ -228,7 +218,7 @@
         expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
         for: 2m
         labels:
-          severity: critical
+          severity: warning
         annotations:
           summary: Container Volume IO usage (instance {{ $labels.instance }})
           description: Container Volume IO usage is above 80%. VALUE = {{ $value }}
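With `EndpointDown` removed and the disk, inode, and container-volume alerts downgraded from `critical` to `warning`, non-production servers now only post to `#system-warnings`. The rules can be linted before deploying (a sketch, assuming the Jinja template has been rendered to a plain `rules.yml`):

```sh
# Syntax-check the rendered alerting rules.
promtool check rules rules.yml
```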
templates/rules_production.yml.j2 (12 changes: 11 additions & 1 deletion)
@@ -20,6 +20,16 @@ groups:
           summary: Prometheus too many restarts (instance {{ $labels.instance }})
           description: Prometheus has restarted more than twice in the last 15 minutes. It might be crash-looping. VALUE = {{ $value }} LABELS = { { $labels } }
 
+  - name: Endpoints
+    rules:
+      - alert: EndpointDown
+        expr: probe_success == 0
+        for: 10s
+        labels:
+          severity: "critical"
+        annotations:
+          summary: "Endpoint {{ $labels.instance }} down"
+
   - name: Hardware
     rules:
       - alert: HostOutOfMemory
@@ -231,4 +241,4 @@
         annotations:
           summary: Container high throttle rate (instance {{ $labels.instance }})
           description: Container is being throttled. VALUE = {{ $value }} LABELS = { { $labels } }
-{% endraw %}
\ No newline at end of file
+{% endraw %}
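`EndpointDown` now lives only in the production rules, so blackbox-exporter `probe_success` failures page `#system-alerts` for production targets alone. A hedged `promtool` unit-test sketch for the relocated rule (the test file name, target URL, and rendered rule path are all assumptions):

```sh
# Unit-test the EndpointDown alert against a synthetic failing probe.
cat > endpoint_down_test.yml <<'EOF'
rule_files:
  - rules_production.yml   # rendered from rules_production.yml.j2
evaluation_interval: 10s
tests:
  - interval: 10s
    input_series:
      - series: 'probe_success{instance="https://example.com"}'
        values: '0x5'
    alert_rule_test:
      - eval_time: 30s
        alertname: EndpointDown
        exp_alerts:
          - exp_labels:
              severity: critical
              instance: https://example.com
            exp_annotations:
              summary: "Endpoint https://example.com down"
EOF
promtool test rules endpoint_down_test.yml
```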