Skip to content

Commit

Permalink
feat: Add basic prometheus alerts (#177)
Browse files Browse the repository at this point in the history
Signed-off-by: Tomas Coufal <tcoufal@redhat.com>
  • Loading branch information
tumido authored Jul 18, 2022
1 parent bc589a8 commit 80e1129
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 0 deletions.
1 change: 1 addition & 0 deletions manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ kind: Kustomization
resources:
- controller
- tasks
- monitoring
commonLabels:
app.kubernetes.io/name: peribolos-as-service
app.kubernetes.io/managed-by: sig-services
Expand Down
5 changes: 5 additions & 0 deletions manifests/base/monitoring/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Kustomization for the monitoring component: it bundles the Prometheus
# alerting rules so that a parent kustomization can include this directory
# as a single resource.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Manifests that make up this component.
resources:
  - prometheusrule.yaml
62 changes: 62 additions & 0 deletions manifests/base/monitoring/prometheusrule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---
# Prometheus alerting rules for the service. Two rule groups are defined:
#   * controller - health of the controller pod and the operations it reports
#   * tekton     - failed or stuck pods that belong to Tekton tasks
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: probot
spec:
  groups:
    - name: controller
      rules:
        # Fires as soon as the scrape target of any pod whose name matches
        # `.*-controller-.*` reports `up` below 1 (there is no `for:` delay).
        # NOTE(review): if the target disappears from service discovery the
        # `up` series is presumably absent rather than 0 and this expression
        # returns nothing - confirm whether an `absent(...)` rule is needed.
        - alert: ProbotControllerDown
          annotations:
            description: "Controller {{ $labels.namespace }}/{{ $labels.pod }} is down."
            summary: Controller is down
          expr: 'up{pod=~".*-controller-.*"} < 1'
          labels:
            severity: critical

        # Fires while any `probot_operations_triggered` series with
        # status="Failed" has a value above 0.
        # NOTE(review): if this metric is a cumulative counter, the alert
        # stays firing after the first failure until the series resets -
        # confirm the metric type and consider `increase(...[5m]) > 0`.
        - alert: ProbotOperationFailed
          annotations:
            description: >-
              Controller {{ $labels.namespace }}/{{ $labels.pod }} attempted {{
              $labels.operation }}/{{ $labels.method }} operation on {{
              $labels.install }} installation but failed.
            summary: Controller failed to perform kubernetes operation.
          expr: 'probot_operations_triggered{status="Failed"} > 0'
          labels:
            severity: warning

        # Fires when the reported event loop lag stays above 0.03 s (30 ms)
        # for one minute.
        - alert: ProbotEventLoopLagging
          annotations:
            description: >-
              Controller's time to schedule event processing has increased above
              30ms, this may indicate a problem in controller runtime.
            summary: Controller's event loop is lagging
          expr: probot_nodejs_eventloop_lag_seconds > 0.03
          for: 1m
          labels:
            severity: warning

    - name: tekton
      rules:
        # Both rules below join the pod phase series with `kube_pod_labels`
        # restricted to pods that carry a non-empty `label_tekton_dev_task`
        # label. `group_right` keeps the labels of the `kube_pod_labels` side,
        # which is what makes `label_tekton_dev_task_run` available to the
        # description templates.
        #
        # The join matches on (namespace, pod) rather than on pod alone: pod
        # names are only unique within a namespace, and two same-named pods in
        # different namespaces would otherwise yield duplicate series on the
        # "one" side of the match and make the rule evaluation fail.

        # Fires immediately when a Tekton task pod is in the Failed phase.
        - alert: TaskRunFailed
          annotations:
            description: >-
              TaskRun {{ $labels.label_tekton_dev_task_run }} ({{
              $labels.namespace }}/{{ $labels.pod }}) has failed. Something
              unexpected must have happened during task execution.
            summary: Tekton task execution has failed.
          expr: >-
            kube_pod_status_phase{phase="Failed"} * on(namespace, pod) group_right
            kube_pod_labels{label_tekton_dev_task=~".+"} > 0
          labels:
            severity: critical

        # Fires when a Tekton task pod has been in the Pending phase for two
        # minutes.
        - alert: TaskRunPending
          annotations:
            description: >-
              TaskRun {{ $labels.label_tekton_dev_task_run }} ({{
              $labels.namespace }}/{{ $labels.pod }}) takes too long to be
              scheduled. Investigate for possible scheduling delays.
            summary: TaskRun is in pending state for too long.
          expr: >-
            kube_pod_status_phase{phase="Pending"} * on(namespace, pod) group_right
            kube_pod_labels{label_tekton_dev_task=~".+"} > 0
          for: 2m
          labels:
            severity: warning

0 comments on commit 80e1129

Please sign in to comment.