diff --git a/.werft/observability/monitoring-satellite.ts b/.werft/observability/monitoring-satellite.ts index 43c09625cb5706..e13cd727eadf98 100644 --- a/.werft/observability/monitoring-satellite.ts +++ b/.werft/observability/monitoring-satellite.ts @@ -107,6 +107,10 @@ export class MonitoringSatelliteInstaller { "gitURL": "https://github.com/gitpod-io/observability", "path": "monitoring-satellite/manifests/probers", }, + { + "gitURL": "https://github.com/gitpod-io/gitpod", + "path": "operations/observability/mixins/meta/rules", + }, { "gitURL": "https://github.com/gitpod-io/gitpod", "path": "operations/observability/mixins/IDE/rules", diff --git a/operations/observability/mixins/meta/mixin.libsonnet b/operations/observability/mixins/meta/mixin.libsonnet index 1ccff37afff3dc..154efffdea7505 100644 --- a/operations/observability/mixins/meta/mixin.libsonnet +++ b/operations/observability/mixins/meta/mixin.libsonnet @@ -3,5 +3,4 @@ * Licensed under the MIT License. See License-MIT.txt in the project root for license information. */ -(import './rules.libsonnet') + (import './dashboards.libsonnet') diff --git a/operations/observability/mixins/meta/rules.libsonnet b/operations/observability/mixins/meta/rules.libsonnet deleted file mode 100644 index 9c6e9658e094fd..00000000000000 --- a/operations/observability/mixins/meta/rules.libsonnet +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -(import './rules/SLO/SLOs.libsonnet') + -(import './rules/components/components.libsonnet') diff --git a/operations/observability/mixins/meta/rules/SLO/SLOs.libsonnet b/operations/observability/mixins/meta/rules/SLO/SLOs.libsonnet deleted file mode 100644 index 8c5609c2eddc95..00000000000000 --- a/operations/observability/mixins/meta/rules/SLO/SLOs.libsonnet +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. 
All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -(import './login/alerts.libsonnet') + -(import './login/rules.libsonnet') diff --git a/operations/observability/mixins/meta/rules/SLO/login/alerts.libsonnet b/operations/observability/mixins/meta/rules/SLO/login/alerts.libsonnet deleted file mode 100644 index 5e6332c53348e0..00000000000000 --- a/operations/observability/mixins/meta/rules/SLO/login/alerts.libsonnet +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'gitpod-login-slo-alerts', - rules: [ - // Please read this entire page: https://sre.google/workbook/alerting-on-slos/ - // We are alerting on strategy #6 - { - alert: 'GitpodLoginErrorBudgetBurn', - labels: { - severity: 'critical', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-com/observability/blob/main/runbooks/GitpodLoginErrorBudgetBurn.md', - summary: 'Error budget is being burn too quickly.', - description: 'Error budget is being burn too quickly. 
At this rate, the whole monthly budget will be burnt in less than 2 days.', - }, - expr: ||| - ( - gitpod_server_login_requests_total:1h_failure_ratio > (14.4 * (1 - gitpod_server_login_requests_total:slo_target)) - and - gitpod_server_login_requests_total:5m_failure_ratio > (14.4 * (1 - gitpod_server_login_requests_total:slo_target)) - ) - or - ( - gitpod_server_login_requests_total:6h_failure_ratio > (6 * (1 - gitpod_server_login_requests_total:slo_target)) - and - gitpod_server_login_requests_total:30m_failure_ratio > (6 * (1 - gitpod_server_login_requests_total:slo_target)) - ) - |||, - }, - { - alert: 'GitpodLoginErrorBudgetBurn', - labels: { - severity: 'warning', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-com/observability/blob/main/runbooks/GitpodLoginErrorBudgetBurn.md', - summary: 'Error budget is being burn quickly.', - description: 'Error budget is being burn quickly. At this rate, the whole monthly budget will be burnt in less than 10 days.', - }, - expr: ||| - ( - gitpod_server_login_requests_total:1d_failure_ratio > (3 * (1 - gitpod_server_login_requests_total:slo_target)) - and - gitpod_server_login_requests_total:2h_failure_ratio > (3 * (1 - gitpod_server_login_requests_total:slo_target)) - ) - or - ( - gitpod_server_login_requests_total:3d_failure_ratio > (1 * (1 - gitpod_server_login_requests_total:slo_target)) - and - gitpod_server_login_requests_total:6h_failure_ratio > (1 * (1 - gitpod_server_login_requests_total:slo_target)) - ) - |||, - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/SLO/login/rules.libsonnet b/operations/observability/mixins/meta/rules/SLO/login/rules.libsonnet deleted file mode 100644 index d7037dbedea86e..00000000000000 --- a/operations/observability/mixins/meta/rules/SLO/login/rules.libsonnet +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. 
See License-MIT.txt in the project root for license information. - */ - -{ - prometheusRules+:: { - groups+: [ - { - name: 'gitpod-login-slo-records', - rules: [ - { - record: 'gitpod_server_login_requests_total:5m_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[5m])) - / - sum(rate(gitpod_server_login_requests_total[5m])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:30m_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[30m])) - / - sum(rate(gitpod_server_login_requests_total[30m])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:1h_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[1h])) - / - sum(rate(gitpod_server_login_requests_total[1h])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:2h_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[2h])) - / - sum(rate(gitpod_server_login_requests_total[2h])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:6h_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[6h])) - / - sum(rate(gitpod_server_login_requests_total[6h])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:1d_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[1d])) - / - sum(rate(gitpod_server_login_requests_total[1d])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:3d_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[3d])) - / - sum(rate(gitpod_server_login_requests_total[3d])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:30d_failure_ratio', - expr: ||| - sum(rate(gitpod_server_login_requests_total{status="failed"}[30d])) - / - sum(rate(gitpod_server_login_requests_total[30d])) - |||, - }, - { - record: 'gitpod_server_login_requests_total:slo_target', - expr: '0.95', - }, - { 
- record: 'gitpod_server_login_requests_total:error_budget_remaining', - expr: 'gitpod_server_login_requests_total:monthly_availability - gitpod_server_login_requests_total:slo_target', - }, - { - record: 'gitpod_server_login_requests_total:monthly_availability', - expr: '1 - gitpod_server_login_requests_total:30d_failure_ratio', - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/components/components.libsonnet b/operations/observability/mixins/meta/rules/components/components.libsonnet deleted file mode 100644 index 26e3ac9f72529c..00000000000000 --- a/operations/observability/mixins/meta/rules/components/components.libsonnet +++ /dev/null @@ -1,10 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -(import './nodes/alerts.libsonnet') + -(import './nodes/rules.libsonnet') + -(import './server/alerts.libsonnet') + -(import './messagebus/alerts.libsonnet') + -(import './usage/alerts.libsonnet') diff --git a/operations/observability/mixins/meta/rules/components/messagebus/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/messagebus/alerts.libsonnet deleted file mode 100644 index 93b67c1d69e4b7..00000000000000 --- a/operations/observability/mixins/meta/rules/components/messagebus/alerts.libsonnet +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. 
- */ - -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'gitpod-component-meta-messagebus-alerts', - rules: [ - { - alert: 'GitpodMetaMessagebusTotalQueues', - labels: { - severity: 'critical', - }, - 'for': '2m', - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaMessagebusTotalQueues.md', - summary: 'A messagebus has too many queues in total.', - description: 'messagebus {{ $labels.pod }} is reporting {{ printf "%.2f" $value }} queues in total.', - }, - expr: 'sum by (instance) (rabbitmq_queues) > 10000', - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/components/nodes/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/nodes/alerts.libsonnet deleted file mode 100644 index 2f7f68fa4cda51..00000000000000 --- a/operations/observability/mixins/meta/rules/components/nodes/alerts.libsonnet +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. 
- */ - -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'gitpod-component-meta-node-alerts', - rules: [ - { - alert: 'GitpodMetaNodeOOMKills', - labels: { - severity: 'warning', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaNodeOOMKills.md', - summary: 'A meta node is reporting OOM kills.', - description: 'Meta node {{ $labels.instance }} is reporting {{ printf "%.2f" $value }} Out Of Memory kills in the last 10 minutes.', - }, - expr: 'increase(node_vmstat_oom_kill{instance=~".*meta.*"}[10m]) > 1', - }, - { - alert: 'GitpodMetaNodeCPUSaturation', - labels: { - severity: 'warning', - }, - 'for': '10m', - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaNodeCPUSaturation.md', - summary: 'High CPU Saturation of a meta node.', - description: 'Meta node {{ $labels.instance }} is reporting {{ printf "%.2f" $value }}% CPU usage for more than 10 minutes.', - }, - expr: '(1 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle", instance=~".*meta.*"}[2m])))) * 100 > 75', - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/components/nodes/rules.libsonnet b/operations/observability/mixins/meta/rules/components/nodes/rules.libsonnet deleted file mode 100644 index 270dc27134d883..00000000000000 --- a/operations/observability/mixins/meta/rules/components/nodes/rules.libsonnet +++ /dev/null @@ -1,11 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. 
- */ - -{ - prometheusRules+:: { - groups+: [], - // meta team doesn have any recording rules yet - }, -} diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet deleted file mode 100644 index a9af57c22489f2..00000000000000 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'gitpod-component-meta-server-alerts', - rules: [ - { - alert: 'WebsocketConnectionsNotClosing', - expr: 'sum(server_websocket_connection_count) == 10000', - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md', - summary: 'Open websocket connections are not closing for the last 10 minutes and accumulating.', - description: 'We have accumulated {{ printf "%.2f" $value }} open websocket connections.', - }, - }, - { - alert: 'ServerEventLoopLagTooHigh', - expr: 'avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35', - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md', - summary: 'Server accumulated too much "event loop lag". The webapp will become unresponsive if we don\'t act here.', - description: 'Server has accumulated {{ printf "%.2f" $value }}s event loop lag.', - }, - }, - { - alert: 'InstanceStartFailures', - // Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. 
=> 0.01 - expr: 'sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01', - 'for': '30s', - labels: { - severity: 'critical', - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md', - summary: 'Server tries to start an instance, but cannot for whatever reason. Investigation required.', - description: 'Server cannot start workspace instances on workspace clusters.', - }, - }, - // Rollout alerts - { - alert: 'JsonRpcApiErrorRates', - // Reasoning: the values are taken from past data - expr: 'sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04', - 'for': '5m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md', - summary: 'The error rate of the JSON RPC API is high. Investigation required.', - description: 'JSON RPC API error rate high', - }, - }, - { - alert: 'WebsocketConnectionRateHigh', - // Reasoning: the values are taken from past data - expr: 'sum(rate(gitpod_server_api_connections_total[2m])) by (pod) > 5', - 'for': '10m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionRateHigh.md', - summary: 'The websocket connection rate is higher than usual. Investigation required.', - description: 'Websocket connection rate high', - }, - }, - /** - * TODO(gpl) This will be true for US all the time. Can we exclude that cluster somehow? 
- * { - * alert: 'db-sync not running', - * expr: 'sum (kube_pod_status_phase{pod=~"db-sync.*"}) by (pod) < 1', - * 'for': '5m', - * labels: { - * // sent to the team internal channel until we fine tuned it - * severity: 'warning', - * team: 'webapp' - * }, - * annotations: { - * runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/DbSyncNotRunning.md', - * summary: 'The db-sync pod is not running. Investigation required.', - * description: 'db-sync pod not running', - * }, - * }, - * - */ - { - alert: 'MessagebusNotRunning', - expr: 'up{job="messagebus"} < 1', - 'for': '2m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md', - summary: 'The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.', - description: 'Messagebus pod not running', - }, - }, - { - alert: 'WebAppServicesHighCPUUsage', - // Reasoning: high rates of CPU consumption should only be temporary. - expr: 'sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80', - 'for': '10m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md', - summary: 'WebApp services consume excessive amounts of CPU. Investigation required.', - description: 'WebApp Services execcisve CPU usage', - }, - }, - { - alert: 'WebAppServicesCrashlooping', - // Reasoning: alert if any pod is restarting more than 3 times / 5 minutes. 
- expr: 'increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3', - 'for': '5m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md', - summary: 'Pod is crash looping.', - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes', - }, - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/components/usage/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/usage/alerts.libsonnet deleted file mode 100644 index ab23f186d8369b..00000000000000 --- a/operations/observability/mixins/meta/rules/components/usage/alerts.libsonnet +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (c) 2021 Gitpod GmbH. All rights reserved. - * Licensed under the MIT License. See License-MIT.txt in the project root for license information. - */ - -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'gitpod-component-webapp-usage-alerts', - rules: [ - { - alert: 'GitpodUsageScheduledReconciliationFailures', - expr: 'sum(increase(gitpod_usage_reconcile_completed_duration_seconds_count{outcome!="success"}[1h])) > 1', - 'for': '30m', - labels: { - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageScheduledReconciliationFailures.md', - summary: 'There are failed scheduled reconciliations in the usage component.', - description: 'We have accumulated {{ printf "%.2f" $value }} failures. 
This affects how stale usage data is and/or updating invoices in Stripe.', - }, - }, - ], - }, - ], - }, -} diff --git a/operations/observability/mixins/meta/rules/login-slo.yaml b/operations/observability/mixins/meta/rules/login-slo.yaml new file mode 100644 index 00000000000000..751214bfb89c42 --- /dev/null +++ b/operations/observability/mixins/meta/rules/login-slo.yaml @@ -0,0 +1,114 @@ +# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Licensed under the GNU Affero General Public License (AGPL). +# See License-AGPL.txt in the project root for license information. + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: login-slo-monitoring-rules + namespace: monitoring-satellite +spec: + groups: + - name: login-slo-recording-rules + rules: + - record: gitpod_server_login_requests_total:5m_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[5m])) + / + sum(rate(gitpod_server_login_requests_total[5m])) + + - record: gitpod_server_login_requests_total:30m_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[30m])) + / + sum(rate(gitpod_server_login_requests_total[30m])) + + - record: gitpod_server_login_requests_total:1h_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[1h])) + / + sum(rate(gitpod_server_login_requests_total[1h])) + + - record: gitpod_server_login_requests_total:2h_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[2h])) + / + sum(rate(gitpod_server_login_requests_total[2h])) + + - record: gitpod_server_login_requests_total:6h_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[6h])) + / + sum(rate(gitpod_server_login_requests_total[6h])) + + - record: gitpod_server_login_requests_total:1d_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[1d])) + / + 
sum(rate(gitpod_server_login_requests_total[1d])) + + - record: gitpod_server_login_requests_total:3d_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[3d])) + / + sum(rate(gitpod_server_login_requests_total[3d])) + + - record: gitpod_server_login_requests_total:30d_failure_ratio + expr: | + sum(rate(gitpod_server_login_requests_total{status="failed"}[30d])) + / + sum(rate(gitpod_server_login_requests_total[30d])) + + - record: gitpod_server_login_requests_total:slo_target + expr: "0.95" + + - record: gitpod_server_login_requests_total:error_budget_remaining + expr: gitpod_server_login_requests_total:monthly_availability - gitpod_server_login_requests_total:slo_target + + - record: gitpod_server_login_requests_total:monthly_availability + expr: 1 - gitpod_server_login_requests_total:30d_failure_ratio + + - name: login-slo-alerts + rules: + - alert: GitpodLoginErrorBudgetBurn + labels: + severity: critical + annotations: + runbook_url: https://github.com/gitpod-com/observability/blob/main/runbooks/GitpodLoginErrorBudgetBurn.md + summary: Error budget is being burn too quickly. + description: Error budget is being burn too quickly. At this rate, the whole monthly budget will be burnt in less than 2 days. 
+ expr: | + ( + gitpod_server_login_requests_total:1h_failure_ratio > (14.4 * (1 - gitpod_server_login_requests_total:slo_target)) + and + gitpod_server_login_requests_total:5m_failure_ratio > (14.4 * (1 - gitpod_server_login_requests_total:slo_target)) + ) + or + ( + gitpod_server_login_requests_total:6h_failure_ratio > (6 * (1 - gitpod_server_login_requests_total:slo_target)) + and + gitpod_server_login_requests_total:30m_failure_ratio > (6 * (1 - gitpod_server_login_requests_total:slo_target)) + ) + + - alert: GitpodLoginErrorBudgetBurn + labels: + severity: warning + annotations: + runbook_url: https://github.com/gitpod-com/observability/blob/main/runbooks/GitpodLoginErrorBudgetBurn.md + summary: Error budget is being burn quickly. + description: Error budget is being burn quickly. At this rate, the whole monthly budget will be burnt in less than 10 days. + expr: | + ( + gitpod_server_login_requests_total:1d_failure_ratio > (3 * (1 - gitpod_server_login_requests_total:slo_target)) + and + gitpod_server_login_requests_total:2h_failure_ratio > (3 * (1 - gitpod_server_login_requests_total:slo_target)) + ) + or + ( + gitpod_server_login_requests_total:3d_failure_ratio > (1 * (1 - gitpod_server_login_requests_total:slo_target)) + and + gitpod_server_login_requests_total:6h_failure_ratio > (1 * (1 - gitpod_server_login_requests_total:slo_target)) + ) diff --git a/operations/observability/mixins/meta/rules/messagebus.yaml b/operations/observability/mixins/meta/rules/messagebus.yaml new file mode 100644 index 00000000000000..2a246207ff90bc --- /dev/null +++ b/operations/observability/mixins/meta/rules/messagebus.yaml @@ -0,0 +1,25 @@ +# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Licensed under the GNU Affero General Public License (AGPL). +# See License-AGPL.txt in the project root for license information. 
+ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: messagebus-monitoring-rules + namespace: monitoring-satellite +spec: + groups: + - name: messagebus + rules: + - alert: GitpodMetaMessagebusTotalQueues + labels: + severity: critical + for: 2m + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaMessagebusTotalQueues.md + summary: A messagebus has too many queues in total. + description: messagebus {{ $labels.instance }} is reporting {{ printf "%.2f" $value }} queues in total. + expr: sum by (instance) (rabbitmq_queues) > 10000 diff --git a/operations/observability/mixins/meta/rules/meta-nodes.yaml b/operations/observability/mixins/meta/rules/meta-nodes.yaml new file mode 100644 index 00000000000000..788d6de2505996 --- /dev/null +++ b/operations/observability/mixins/meta/rules/meta-nodes.yaml @@ -0,0 +1,33 @@ +# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Licensed under the GNU Affero General Public License (AGPL). +# See License-AGPL.txt in the project root for license information. + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: meta-nodes-monitoring-rules + namespace: monitoring-satellite +spec: + groups: + - name: meta-nodes + rules: + - alert: GitpodMetaNodeOOMKills + labels: + severity: warning + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaNodeOOMKills.md + summary: A meta node is reporting OOM kills. + description: Meta node {{ $labels.instance }} is reporting {{ printf "%.2f" $value }} Out Of Memory kills in the last 10 minutes.
+ expr: increase(node_vmstat_oom_kill{instance=~".*meta.*"}[10m]) > 1 + - alert: GitpodMetaNodeCPUSaturation + labels: + severity: warning + for: 10m + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodMetaNodeCPUSaturation.md + summary: High CPU Saturation of a meta node. + description: Meta node {{ $labels.instance }} is reporting {{ printf "%.2f" $value }}% CPU usage for more than 10 minutes. + expr: (1 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle", instance=~".*meta.*"}[2m])))) * 100 > 75 diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml new file mode 100644 index 00000000000000..3c363a99c7e7d5 --- /dev/null +++ b/operations/observability/mixins/meta/rules/server.yaml @@ -0,0 +1,125 @@ +# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Licensed under the GNU Affero General Public License (AGPL). +# See License-AGPL.txt in the project root for license information. + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: server-monitoring-rules + namespace: monitoring-satellite +spec: + groups: + - name: server + rules: + - alert: WebsocketConnectionsNotClosing + expr: sum(server_websocket_connection_count) == 10000 + for: 10m + labels: + severity: critical + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md + summary: Open websocket connections are not closing for the last 10 minutes and accumulating. + description: We have accumulated {{ printf "%.2f" $value }} open websocket connections. 
+ + - alert: ServerEventLoopLagTooHigh + expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35 + for: 5m + labels: + severity: critical + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md + summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here. + description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag. + + - alert: InstanceStartFailures + # Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01 + expr: sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01 + for: 30s + labels: + severity: critical + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md + summary: Server tries to start an instance, but cannot for whatever reason. Investigation required. + description: Server cannot start workspace instances on workspace clusters. + + # Rollout alerts + - alert: JsonRpcApiErrorRates + # Reasoning: the values are taken from past data + expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04 + for: 5m + labels: + # sent to the team internal channel until we fine tuned it + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md + summary: The error rate of the JSON RPC API is high. Investigation required. 
+ description: JSON RPC API error rate high + + - alert: WebsocketConnectionRateHigh + # Reasoning: the values are taken from past data + expr: sum(rate(gitpod_server_api_connections_total[2m])) by (pod) > 5 + for: 10m + labels: + # sent to the team internal channel until we fine tuned it + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionRateHigh.md + summary: The websocket connection rate is higher than usual. Investigation required. + description: Websocket connection rate high + + # TODO(gpl) This will be true for US all the time. Can we exclude that cluster somehow? + # + # - alert: db-sync not running + # expr: sum (kube_pod_status_phase{pod=~"db-sync.*"}) by (pod) < 1 + # for: 5m + # labels: + # # sent to the team internal channel until we fine tuned it + # severity: warning + # team: webapp + # annotations: + # runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/DbSyncNotRunning.md + # summary: The db-sync pod is not running. Investigation required. + # description: db-sync pod not running + + - alert: MessagebusNotRunning + expr: up{job="messagebus"} < 1 + for: 2m + labels: + # sent to the team internal channel until we fine tuned it + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md + summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required. + description: Messagebus pod not running + + - alert: WebAppServicesHighCPUUsage + # Reasoning: high rates of CPU consumption should only be temporary. 
+ expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80 + for: 10m + labels: + # sent to the team internal channel until we fine tuned it + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md + summary: WebApp services consume excessive amounts of CPU. Investigation required. + description: WebApp Services execcisve CPU usage + + - alert: WebAppServicesCrashlooping + # Reasoning: alert if any pod is restarting more than 3 times / 5 minutes. + expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3 + for: 5m + labels: + # sent to the team internal channel until we fine tuned it + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md + summary: Pod is crash looping. + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes diff --git a/operations/observability/mixins/meta/rules/usage.yaml b/operations/observability/mixins/meta/rules/usage.yaml new file mode 100644 index 00000000000000..2ff8af560055f9 --- /dev/null +++ b/operations/observability/mixins/meta/rules/usage.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2022 Gitpod GmbH. All rights reserved. +# Licensed under the GNU Affero General Public License (AGPL). +# See License-AGPL.txt in the project root for license information. 
+ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: usage-monitoring-rules + namespace: monitoring-satellite +spec: + groups: + - name: usage + rules: + - alert: GitpodUsageScheduledReconciliationFailures + expr: sum(increase(gitpod_usage_reconcile_completed_duration_seconds_count{outcome!="success"}[1h])) > 1 + for: 30m + labels: + severity: warning + team: webapp + annotations: + runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageScheduledReconciliationFailures.md + summary: There are failed scheduled reconciliations in the usage component. + description: We have accumulated {{ printf "%.2f" $value }} failures. This affects how stale usage data is and/or updating invoices in Stripe.