Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lifecycle latency alerts #2480

Merged
merged 5 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions .github/workflows/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,23 @@ jobs:
uses: actions/checkout@v4

# CI step that feeds the lifecycle alert file and its test file to the
# scality/action-prom-render-test action, together with the template inputs.
# NOTE(review): this is a flattened diff hunk, so both the removed and the added
# `uses:` / `alert_inputs:` lines appear below; presumably only the later one of
# each pair exists in the merged workflow — verify against the real file.
- name: Render and test lifecycle
uses: scality/action-prom-render-test@1.0.2
uses: scality/action-prom-render-test@1.0.3
with:
alert_file_path: monitoring/lifecycle/alerts.yaml
test_file_path: monitoring/lifecycle/alerts.test.yaml
alert_inputs: >-
namespace=zenko,job_lifecycle_producer=artesca-data-backbeat-lifecycle-producer-headless,job_lifecycle_bucket_processor=artesca-data-backbeat-lifecycle-bucket-processor-headless,job_lifecycle_object_processor=artesca-data-backbeat-lifecycle-object-processor-headless,lifecycle_conductor_replicas=1,lifecycle_bucket_replicas=3,lifecycle_object_replicas=3
alert_inputs: |
namespace=zenko
job_lifecycle_producer=artesca-data-backbeat-lifecycle-producer-headless
job_lifecycle_bucket_processor=artesca-data-backbeat-lifecycle-bucket-processor-headless
job_lifecycle_object_processor=artesca-data-backbeat-lifecycle-object-processor-headless
job_lifecycle_transition_processor=artesca-data-backbeat-lifecycle-transition-processor-headless
job_sorbet_forwarder=artesca-data-sorbet-fwd-
lifecycle_conductor_replicas=1
lifecycle_bucket_replicas=3
lifecycle_object_replicas=3
lifecycle_transition_replicas=3
lifecycle_latency_warning_threshold=120
lifecycle_latency_critical_threshold=180
github_token: ${{ secrets.GIT_ACCESS_TOKEN }}

- name: Render and test replication
Expand Down
187 changes: 187 additions & 0 deletions monitoring/lifecycle/alerts.test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,193 @@ tests:
description: "Less than 50% of lifecycle object processors for expiration are up and healthy"
summary: "Degraded lifecycle object processor"

# Replica-count fixture for the transition processor alerts.
# Three pods start healthy; one goes down at 1m and a second at 2m, so the sum of
# `up` goes 3 -> 2 -> 1. Given the rules' 30s `For` delay:
#   - 1m: condition just became true for Degraded, nothing fires yet
#   - 2m: Degraded fires (2 of 3 were up since 1m); Critical only just became true
#   - 3m: both Degraded and Critical fire (1 of 3 up, i.e. less than 50%)
# NOTE(review): assumes the rules are rendered with lifecycle_transition_replicas=3.
# Pods are labelled `transition-N` so the fixture matches the job under test; the
# rules aggregate with sum(), so the pod label does not appear in expectations.
- name: LifecycleTransitionProcessor Replicas
  interval: 1m
  input_series:
    - series: up{namespace="zenko",job="artesca-data-backbeat-lifecycle-transition-processor-headless",pod="transition-1"}
      values: 1 1 1
    - series: up{namespace="zenko",job="artesca-data-backbeat-lifecycle-transition-processor-headless",pod="transition-2"}
      values: 1 1 0
    - series: up{namespace="zenko",job="artesca-data-backbeat-lifecycle-transition-processor-headless",pod="transition-3"}
      values: 1 0 0
  alert_rule_test:
    - alertname: LifecycleTransitionProcessorDegraded
      eval_time: 1m
      exp_alerts: []
    - alertname: LifecycleTransitionProcessorCritical
      eval_time: 1m
      exp_alerts: []
    - alertname: LifecycleTransitionProcessorDegraded
      eval_time: 2m
      exp_alerts:
        - exp_labels:
            severity: warning
          exp_annotations:
            zenko_service: backbeat-lifecycle-transition-processor
            description: "Less than 100% of lifecycle transition processors are up and healthy"
            summary: "Degraded lifecycle transition processor"
    - alertname: LifecycleTransitionProcessorCritical
      eval_time: 2m
      exp_alerts: []
    - alertname: LifecycleTransitionProcessorDegraded
      eval_time: 3m
      exp_alerts:
        - exp_labels:
            severity: warning
          exp_annotations:
            zenko_service: backbeat-lifecycle-transition-processor
            description: "Less than 100% of lifecycle transition processors are up and healthy"
            summary: "Degraded lifecycle transition processor"
    - alertname: LifecycleTransitionProcessorCritical
      eval_time: 3m
      exp_alerts:
        - exp_labels:
            severity: critical
          exp_annotations:
            zenko_service: backbeat-lifecycle-transition-processor
            description: "Less than 50% of lifecycle transition processors are up and healthy"
            summary: "Degraded lifecycle transition processor"

# Scan-age warning fixture. The metric holds the last batch start time in
# milliseconds; the expected description implies a 2-minute warning threshold.
#   - 1m..2m: age is at most exactly 2m, which is not strictly above the threshold
#   - 3m: age is 3m -> alert fires
#   - 4m..6m: a new batch was recorded (240000 ms, then 299000 ms) -> age resets
#   - 7m: the 299000 ms batch is now 121 s old -> alert fires again
- name: LifecycleLateScanWarning
  interval: 1m
  input_series:
    - series: 's3_lifecycle_latest_batch_start_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless"}'
      values: '0 0 0 0 240000 299000 299000 299000'
  alert_rule_test:
    # Below or exactly at the threshold: silent.
    - alertname: LifecycleLateScanWarning
      eval_time: 1m
      exp_alerts: []
    - alertname: LifecycleLateScanWarning
      eval_time: 2m
      exp_alerts: []
    # First breach.
    - alertname: LifecycleLateScanWarning
      eval_time: 3m
      exp_alerts:
        - exp_labels:
            severity: warning
          exp_annotations:
            zenko_service: backbeat-lifecycle-producer
            description: 'Last lifecycle scan was performed more than 2m 0s ago.'
            summary: 'Lifecycle scan not executed in time'
    # A fresh scan was recorded: silent again.
    - alertname: LifecycleLateScanWarning
      eval_time: 4m
      exp_alerts: []
    - alertname: LifecycleLateScanWarning
      eval_time: 5m
      exp_alerts: []
    - alertname: LifecycleLateScanWarning
      eval_time: 6m
      exp_alerts: []
    # Latest scan has aged past the threshold once more.
    - alertname: LifecycleLateScanWarning
      eval_time: 7m
      exp_alerts:
        - exp_labels:
            severity: warning
          exp_annotations:
            zenko_service: backbeat-lifecycle-producer
            description: 'Last lifecycle scan was performed more than 2m 0s ago.'
            summary: 'Lifecycle scan not executed in time'

# Scan-age critical fixture. Same metric as the warning test; the expected
# description implies a 3-minute critical threshold.
#   - 1m..3m: age is at most exactly 3m, which is not strictly above the threshold
#   - 4m: age is 4m -> alert fires
#   - 5m: latest batch is 60000 ms, i.e. still 4m old -> alert keeps firing
#   - 6m..7m: latest batch is exactly 3m old -> alert clears
- name: LifecycleLateScanCritical
  interval: 1m
  input_series:
    - series: 's3_lifecycle_latest_batch_start_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless"}'
      values: '0 0 0 0 0 60000 180000 240000'
  alert_rule_test:
    # Below or exactly at the threshold: silent.
    - alertname: LifecycleLateScanCritical
      eval_time: 1m
      exp_alerts: []
    - alertname: LifecycleLateScanCritical
      eval_time: 2m
      exp_alerts: []
    - alertname: LifecycleLateScanCritical
      eval_time: 3m
      exp_alerts: []
    # Threshold exceeded.
    - alertname: LifecycleLateScanCritical
      eval_time: 4m
      exp_alerts:
        - exp_labels:
            severity: critical
          exp_annotations:
            zenko_service: backbeat-lifecycle-producer
            description: 'Last lifecycle scan was performed more than 3m 0s ago.'
            summary: 'Lifecycle scan not executed in time'
    - alertname: LifecycleLateScanCritical
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            severity: critical
          exp_annotations:
            zenko_service: backbeat-lifecycle-producer
            description: 'Last lifecycle scan was performed more than 3m 0s ago.'
            summary: 'Lifecycle scan not executed in time'
    # Recent enough scans recorded: silent again.
    - alertname: LifecycleLateScanCritical
      eval_time: 6m
      exp_alerts: []
    - alertname: LifecycleLateScanCritical
      eval_time: 7m
      exp_alerts: []

# Latency fixture: cumulative histogram buckets sampled every 10m, the same
# width as the 10m rate() window used by the LifecycleLatency* rules.
#   - 10m, 20m: every new observation lands in le="10"                  -> no alert
#   - 30m: 95 of 100 new observations fall in the (50, 180] bucket      -> warning only
#   - 60m: 95 of 100 new observations fall in the (180, 240] bucket     -> warning + critical
# NOTE(review): assumes the rules are rendered with thresholds of 120 s (warning)
# and 180 s (critical), as passed by the CI workflow — confirm.
# The overflow bucket uses the canonical `+Inf` spelling that Prometheus clients
# expose, so the fixture mirrors real scraped data.
- name: LifecycleLatency
  interval: 10m
  input_series:
    - series: s3_lifecycle_latency_seconds_bucket{le="10", namespace="zenko", type="transition", location="us-east-1", job="artesca-data-backbeat-lifecycle-transition-processor-headless"}
      values: 100 200 300 305 310 315 320 325 330 430
    - series: s3_lifecycle_latency_seconds_bucket{le="50", namespace="zenko", type="transition", location="us-east-1", job="artesca-data-backbeat-lifecycle-transition-processor-headless"}
      values: 100 200 300 305 310 315 320 325 330 430
    - series: s3_lifecycle_latency_seconds_bucket{le="180", namespace="zenko", type="transition", location="us-east-1", job="artesca-data-backbeat-lifecycle-transition-processor-headless"}
      values: 100 200 300 400 500 600 605 610 615 620
    - series: s3_lifecycle_latency_seconds_bucket{le="240", namespace="zenko", type="transition", location="us-east-1", job="artesca-data-backbeat-lifecycle-transition-processor-headless"}
      values: 100 200 300 400 500 600 700 800 900 1000
    - series: s3_lifecycle_latency_seconds_bucket{le="+Inf", namespace="zenko", type="transition", location="us-east-1", job="artesca-data-backbeat-lifecycle-transition-processor-headless"}
      values: 100 200 300 400 500 600 700 800 900 1000
  alert_rule_test:
    - alertname: LifecycleLatencyWarning
      eval_time: 10m
      exp_alerts: []
    - alertname: LifecycleLatencyCritical
      eval_time: 10m
      exp_alerts: []
    - alertname: LifecycleLatencyWarning
      eval_time: 20m
      exp_alerts: []
    - alertname: LifecycleLatencyCritical
      eval_time: 20m
      exp_alerts: []
    - alertname: LifecycleLatencyWarning
      eval_time: 30m
      exp_alerts:
        - exp_labels:
            severity: warning
            type: transition
            location: us-east-1
          exp_annotations:
            description: "Lifecycle latency for `transition` is above the warning threshold on location `us-east-1`."
            summary: "High lifecycle latency"
    - alertname: LifecycleLatencyCritical
      eval_time: 30m
      exp_alerts: []
    - alertname: LifecycleLatencyWarning
      eval_time: 60m
      exp_alerts:
        - exp_labels:
            severity: warning
            type: transition
            location: us-east-1
          exp_annotations:
            description: "Lifecycle latency for `transition` is above the warning threshold on location `us-east-1`."
            summary: "High lifecycle latency"
    - alertname: LifecycleLatencyCritical
      eval_time: 60m
      exp_alerts:
        - exp_labels:
            severity: critical
            type: transition
            location: us-east-1
          exp_annotations:
            description: "Lifecycle latency for `transition` is above the critical threshold on location `us-east-1`."
            summary: "Very high lifecycle latency"

- name: KafkaConsumerSlowTask
interval: 1m
input_series:
Expand Down
126 changes: 126 additions & 0 deletions monitoring/lifecycle/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,24 @@ x-inputs:
type: constant
# Template inputs for the alert rules; where used, an input is referenced as
# `${name}` inside the rule expressions and annotations.
# Job-name inputs feed the `job=` / `job=~` selectors (job_sorbet_forwarder is
# used as a prefix followed by `.*` in the latency rules).
- name: job_lifecycle_bucket_processor
type: constant
- name: job_lifecycle_transition_processor
type: constant
- name: job_sorbet_forwarder
type: constant
# Replica-count inputs — the transition value is compared against sum(up{...})
# in the LifecycleTransitionProcessor* rules; presumably the others are used the
# same way by rules outside this view.
- name: lifecycle_conductor_replicas
type: constant
- name: lifecycle_bucket_replicas
type: constant
- name: lifecycle_object_replicas
type: constant
- name: lifecycle_transition_replicas
type: constant
# Latency thresholds, in seconds, used by the LateScan and Latency rules.
# NOTE(review): the `value` entries look like defaults that the render step can
# override (the CI workflow passes 120 / 180) — TODO confirm.
- name: lifecycle_latency_warning_threshold
type: config
value: 24*60*60 # 24h, in seconds
- name: lifecycle_latency_critical_threshold
type: config
value: 36*60*60 # 36h, in seconds

groups:
- name: LifecycleProducer
Expand All @@ -28,6 +40,40 @@ groups:
description: "Lifecycle producer pod has been down for 30 seconds"
summary: "Lifecycle producer service is down"

# Warning when the most recent lifecycle batch start time is older than the
# warning threshold: the age (now minus the newest start time, which is divided
# by 1000 — presumably a millisecond timestamp) over the threshold exceeds 1.
- alert: LifecycleLateScanWarning
  Expr: |
    (
      time()
      - max(s3_lifecycle_latest_batch_start_time{
          namespace="${namespace}", job="${job_lifecycle_producer}"
        }) / 1000
    ) / ${lifecycle_latency_warning_threshold} > 1
  Labels:
    severity: warning
  Annotations:
    zenko_service: backbeat-lifecycle-producer
    description: >-
      Last lifecycle scan was performed more than
      {{ ${lifecycle_latency_warning_threshold} | humanizeDuration }} ago.
    summary: 'Lifecycle scan not executed in time'

# Critical counterpart of the late-scan warning: same age computation (now minus
# the newest batch start time / 1000), compared against the critical threshold.
- alert: LifecycleLateScanCritical
  Expr: |
    (
      time()
      - max(s3_lifecycle_latest_batch_start_time{
          namespace="${namespace}", job="${job_lifecycle_producer}"
        }) / 1000
    ) / ${lifecycle_latency_critical_threshold} > 1
  Labels:
    severity: critical
  Annotations:
    zenko_service: backbeat-lifecycle-producer
    description: >-
      Last lifecycle scan was performed more than
      {{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} ago.
    summary: 'Lifecycle scan not executed in time'

- name: LifecycleBucketProcessor
rules:

Expand Down Expand Up @@ -126,6 +172,86 @@ groups:
description: "More than 5% of S3 requests by object processors resulting in errors"
summary: "Very high rate of S3 request errors"

# Warning when fewer transition-processor targets report `up` than the
# configured replica count, sustained for 30 seconds.
- alert: LifecycleTransitionProcessorDegraded
  Expr: 'sum(up{namespace="${namespace}", job="${job_lifecycle_transition_processor}"}) < ${lifecycle_transition_replicas}'
  For: '30s'
  Labels:
    severity: warning
  Annotations:
    zenko_service: backbeat-lifecycle-transition-processor
    description: 'Less than 100% of lifecycle transition processors are up and healthy'
    summary: 'Degraded lifecycle transition processor'

# Critical when fewer than half of the configured transition-processor replicas
# report `up` (twice the healthy count is still below the replica count),
# sustained for 30 seconds.
- alert: LifecycleTransitionProcessorCritical
  Expr: 'sum(up{namespace="${namespace}", job="${job_lifecycle_transition_processor}"}) * 2 < ${lifecycle_transition_replicas}'
  For: '30s'
  Labels:
    severity: critical
  Annotations:
    zenko_service: backbeat-lifecycle-transition-processor
    description: 'Less than 50% of lifecycle transition processors are up and healthy'
    summary: 'Degraded lifecycle transition processor'

# Warning when, over a 5m rate window, S3 operations whose status is not "2xx"
# make up at least 3% of all S3 operations issued by the transition processors,
# sustained for 5 minutes.
- alert: LifecycleTransitionProcessorRequestWarning
  Expr: |
    sum(rate(s3_lifecycle_s3_operations_total{namespace="${namespace}", job="${job_lifecycle_transition_processor}", status!="2xx"}[5m]))
      / sum(rate(s3_lifecycle_s3_operations_total{namespace="${namespace}", job="${job_lifecycle_transition_processor}"}[5m]))
      >= 0.03
  For: '5m'
  Labels:
    severity: warning
  Annotations:
    zenko_service: backbeat-lifecycle-transition-processor
    description: 'More than 3% of S3 requests by transition processors resulting in errors'
    summary: 'High rate of S3 request errors'

# Critical variant of the S3 error-ratio alert: the share of non-"2xx" S3
# operations from the transition processors reaches at least 5% over a 5m rate
# window, sustained for 5 minutes.
- alert: LifecycleTransitionProcessorRequestCritical
  Expr: |
    sum(rate(s3_lifecycle_s3_operations_total{namespace="${namespace}", job="${job_lifecycle_transition_processor}", status!="2xx"}[5m]))
      / sum(rate(s3_lifecycle_s3_operations_total{namespace="${namespace}", job="${job_lifecycle_transition_processor}"}[5m]))
      >= 0.05
  For: '5m'
  Labels:
    severity: critical
  Annotations:
    zenko_service: backbeat-lifecycle-transition-processor
    description: 'More than 5% of S3 requests by transition processors resulting in errors'
    summary: 'Very high rate of S3 request errors'

# Warning when the 95th-percentile lifecycle latency, computed per (type,
# location) from the 10m bucket rates, exceeds the warning threshold.
# The job matcher is a regex covering the object processor, the transition
# processor, and any job whose name starts with the sorbet forwarder prefix.
# Selector spacing matches the Critical rule so the two stay diff-friendly.
- alert: LifecycleLatencyWarning
  Expr: |
    histogram_quantile(0.95,
      sum(rate(
        s3_lifecycle_latency_seconds_bucket{
          namespace="${namespace}", job=~"${job_lifecycle_object_processor}|${job_lifecycle_transition_processor}|${job_sorbet_forwarder}.*"
        }[10m]
      )) by(le, type, location)
    ) / ${lifecycle_latency_warning_threshold} > 1
  Labels:
    severity: warning
  Annotations:
    description: >-
      Lifecycle latency for `{{ $labels.type }}` is above the warning threshold on location
      `{{ $labels.location }}`.
    summary: "High lifecycle latency"

# Critical when the 95th-percentile lifecycle latency, computed per (type,
# location) from the 10m bucket rates of the object processor, transition
# processor and sorbet forwarder jobs, exceeds the critical threshold.
- alert: LifecycleLatencyCritical
  Expr: |
    histogram_quantile(0.95,
      sum(rate(
        s3_lifecycle_latency_seconds_bucket{
          namespace="${namespace}", job=~"${job_lifecycle_object_processor}|${job_lifecycle_transition_processor}|${job_sorbet_forwarder}.*"
        }[10m]
      )) by(le, type, location)
    ) / ${lifecycle_latency_critical_threshold} > 1
  Labels:
    severity: critical
  Annotations:
    description: >-
      Lifecycle latency for `{{ $labels.type }}` is above the critical threshold on location
      `{{ $labels.location }}`.
    summary: 'Very high lifecycle latency'

- name: Kafka Messages
rules:

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "backbeat",
"version": "8.6.30",
"version": "8.6.31",
"description": "Asynchronous queue and job manager",
"main": "index.js",
"scripts": {
Expand Down