# Source file: 0000_90_cluster-version-operator_02_servicemonitor.yaml
# ServiceMonitor that scrapes the cluster-version-operator Service in the
# openshift-cluster-version namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    k8s-app: cluster-version-operator
  name: cluster-version-operator
  namespace: openshift-cluster-version
  annotations:
    # Quoted on purpose: annotation values must be strings, not booleans.
    exclude.release.openshift.io/internal-openshift-hosted: "true"
    include.release.openshift.io/self-managed-high-availability: "true"
spec:
  endpoints:
    # Single HTTPS endpoint on the Service port named "metrics", scraped every
    # 30s and authenticated with the mounted service-account token.
    - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      interval: 30s
      port: metrics
      scheme: https
      tlsConfig:
        # The server certificate is checked against this CA bundle and must be
        # valid for the in-cluster Service DNS name given in serverName.
        caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
        serverName: cluster-version-operator.openshift-cluster-version.svc
  # Only look for matching Services in the operator's own namespace.
  namespaceSelector:
    matchNames:
      - openshift-cluster-version
  # Select Services carrying the same k8s-app label as this resource.
  selector:
    matchLabels:
      k8s-app: cluster-version-operator
---
# PrometheusRule defining the alerts for the cluster-version operator
# ("cluster-version" group) and for the cluster operators it reports on
# ("cluster-operators" group).
#
# NOTE(review): the alert messages wrap their Prometheus template expressions
# in an outer quoted double-brace action, which suggests this file is passed
# through a Go-style template renderer before it is applied - confirm. Keep
# every message on ONE physical line and do not re-quote or re-wrap it, so the
# outer quoted strings stay intact.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    k8s-app: cluster-version-operator
  name: cluster-version-operator
  namespace: openshift-cluster-version
  annotations:
    # Quoted on purpose: annotation values must be strings, not booleans.
    exclude.release.openshift.io/internal-openshift-hosted: "true"
    include.release.openshift.io/self-managed-high-availability: "true"
spec:
  groups:
    - name: cluster-version
      rules:
        # Critical: for 10 minutes there has been no "up" series with value 1
        # for the cluster-version-operator job.
        - alert: ClusterVersionOperatorDown
          annotations:
            message: Cluster version operator has disappeared from Prometheus target discovery. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
          expr: |
            absent(up{job="cluster-version-operator"} == 1)
          for: 10m
          labels:
            severity: critical
        # Warning: the last update retrieval is at least 3600 seconds old and a
        # RetrievedUpdates condition series exists for "version" whose reason
        # is not NoChannel. No "for" clause, so it fires on first evaluation.
        # The alert value is the elapsed time in seconds (left side of "and"),
        # which the message renders with humanizeDuration.
        - alert: CannotRetrieveUpdates
          annotations:
            message: Cluster version operator has not retrieved updates in {{ "{{ $value | humanizeDuration }}" }}. Failure reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"RetrievedUpdates\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0)}}{{label \"reason\" $value}} {{end}}{{end}}{{end}}" }}. {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
          expr: |
            (time()-cluster_version_operator_update_retrieval_timestamp_seconds) >= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", condition="RetrievedUpdates", endpoint="metrics", reason!="NoChannel"}
          labels:
            severity: warning
        # Info: cluster_version_available_updates is greater than zero.
        - alert: UpdateAvailable
          annotations:
            message: Your upstream update recommendation service recommends you update your cluster. For more information refer to 'oc adm upgrade'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
          expr: |
            cluster_version_available_updates > 0
          labels:
            severity: info
    - name: cluster-operators
      rules:
        # Warning: the Upgradeable condition series for "version" has been 0
        # for 60 minutes. The message lists every non-empty reason label found
        # on matching series.
        - alert: ClusterNotUpgradeable
          annotations:
            message: One or more cluster operators have been blocking minor version cluster upgrades for at least an hour for reason {{ "{{ with $cluster_operator_conditions := \"cluster_operator_conditions\" | query}}{{range $value := .}}{{if and (eq (label \"name\" $value) \"version\") (eq (label \"condition\" $value) \"Upgradeable\") (eq (label \"endpoint\" $value) \"metrics\") (eq (value $value) 0.0) (ne (len (label \"reason\" $value)) 0) }}{{label \"reason\" $value}}.{{end}}{{end}}{{end}}"}} {{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} For more information refer to {{ label \"url\" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end }}" }}
          expr: |
            max by (name, condition, endpoint) (cluster_operator_conditions{name="version", condition="Upgradeable", endpoint="metrics"} == 0)
          for: 60m
          labels:
            severity: warning
        # Critical: a cluster_operator_up series from the
        # cluster-version-operator job has been 0 for 10 minutes.
        - alert: ClusterOperatorDown
          annotations:
            message: Cluster operator {{ "{{ $labels.name }}" }} has not been available for 10 minutes. Operator may be down or disabled, cluster will not be kept up to date and upgrades will not be possible.
          expr: |
            cluster_operator_up{job="cluster-version-operator"} == 0
          for: 10m
          labels:
            severity: critical
        # Critical: a Degraded condition series has been 1 for 10 minutes. The
        # message reports the series' name and reason labels.
        - alert: ClusterOperatorDegraded
          annotations:
            message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable.
          expr: |
            cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1
          for: 10m
          labels:
            severity: critical
        # Warning: cluster_operator_up changed value more than twice within a
        # 2-minute window, and that has held for 10 minutes.
        - alert: ClusterOperatorFlapping
          annotations:
            message: Cluster operator {{ "{{ $labels.name }}" }} up status is changing often. This might cause upgrades to be unstable.
          expr: |
            changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2
          for: 10m
          labels:
            severity: warning