Skip to content

Commit

Permalink
update mixin
Browse files Browse the repository at this point in the history
Signed-off-by: Xiang Dai <764524258@qq.com>
  • Loading branch information
daixiang0 committed Feb 20, 2020
1 parent cd8a067 commit 4bda22a
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 22 deletions.
42 changes: 42 additions & 0 deletions examples/alerts/alerts.md
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,48 @@ rules:
severity: warning
```
## Replicate
[embedmd]:# (../tmp/thanos-replicate.rules.yaml yaml)
```yaml
# Alerting rules for the Thanos Replicate component.
name: thanos-replicate.rules
rules:
# Fires when no `up` series exists for any job matching the selector.
- alert: ThanosReplicateIsDown
  annotations:
    message: Thanos Replicate has disappeared from Prometheus target discovery.
  expr: |
    absent(up{job=~"thanos-replicate.*"})
  for: 5m
  labels:
    severity: critical
# Fires when at least 10% of replication runs over the last 5m ended in error.
- alert: ThanosReplicateErrorRate
  annotations:
    message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
      failed.
  expr: |
    (
      sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-replicate.*"}[5m]))
    / on (namespace) group_left
      sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-replicate.*"}[5m]))
    ) * 100 >= 10
  for: 5m
  labels:
    severity: critical
# Fires when the 99th percentile replication run duration exceeds 120s while
# runs are actually being observed.
- alert: ThanosReplicateRunLatency
  annotations:
    message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
      $value }} seconds for the replicate operations.
  # The buckets are cumulative counters, so they are wrapped in rate() to make
  # the quantile describe the last 5m; the quantile is 0.99 to match the
  # "99th percentile" wording of the message.
  expr: |
    (
      histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m]))) > 120
    and
      sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m])) > 0
    )
  for: 5m
  labels:
    severity: critical
```
## Extras
### Absent Rules
Expand Down
36 changes: 36 additions & 0 deletions examples/alerts/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -439,3 +439,39 @@ groups:
for: 5m
labels:
severity: critical
# Alerting rules for the Thanos Replicate component.
- name: thanos-replicate.rules
  rules:
  # Fires when no `up` series exists for any job matching the selector.
  - alert: ThanosReplicateIsDown
    annotations:
      message: Thanos Replicate has disappeared from Prometheus target discovery.
    expr: |
      absent(up{job=~"thanos-replicate.*"})
    for: 5m
    labels:
      severity: critical
  # Fires when at least 10% of replication runs over the last 5m ended in error.
  - alert: ThanosReplicateErrorRate
    annotations:
      message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
        failed.
    expr: |
      (
        sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-replicate.*"}[5m]))
      / on (namespace) group_left
        sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-replicate.*"}[5m]))
      ) * 100 >= 10
    for: 5m
    labels:
      severity: critical
  # Fires when the 99th percentile replication run duration exceeds 120s while
  # runs are actually being observed.
  - alert: ThanosReplicateRunLatency
    annotations:
      message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
        $value }} seconds for the replicate operations.
    # The buckets are cumulative counters, so they are wrapped in rate() to make
    # the quantile describe the last 5m; the quantile is 0.99 to match the
    # "99th percentile" wording of the message.
    expr: |
      (
        histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m]))) > 120
      and
        sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-replicate.*"}[5m])) > 0
      )
    for: 5m
    labels:
      severity: critical
2 changes: 2 additions & 0 deletions examples/alerts/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,5 @@ groups:
labels:
quantile: "0.99"
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
- name: thanos-replicate.rules
rules: []
1 change: 1 addition & 0 deletions examples/dashboards/dashboards.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ There exists Grafana dashboards for each component (not all of them complete) ta
- [Thanos Receiver](thanos-receiver.json)
- [Thanos Sidecar](thanos-sidecar.json)
- [Thanos Ruler](thanos-ruler.json)
- [Thanos Replicate](thanos-replicate.json)

You can import them via `Import -> Paste JSON` in Grafana.
These dashboards require Grafana 5 or above; importing them into older versions is known not to work.
Expand Down
5 changes: 5 additions & 0 deletions mixin/thanos/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ This project is intended to be used as a library. You can extend and customize d
selector: 'job=~"%s.*"' % self.jobPrefix,
title: '%(prefix)sSidecar' % $.dashboard.prefix,
},
replicate+:: {
jobPrefix: 'thanos-replicate',
selector: 'job=~"%s.*"' % self.jobPrefix,
title: '%(prefix)sReplicate' % $.dashboard.prefix,
},
overview+:: {
title: '%(prefix)sOverview' % $.dashboard.prefix,
},
Expand Down
3 changes: 2 additions & 1 deletion mixin/thanos/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
(import 'sidecar.libsonnet') +
(import 'store.libsonnet') +
(import 'rule.libsonnet') +
(import 'absent.libsonnet')
(import 'absent.libsonnet') +
(import 'replicate.libsonnet')
8 changes: 4 additions & 4 deletions mixin/thanos/alerts/replicate.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
local thanos = self,
replicator+:: {
replicate+:: {
jobPrefix: error 'must provide job prefix for Thanos Replicate dashboard',
selector: error 'must provide selector for Thanos Replicate dashboard',
},
Expand All @@ -13,7 +13,7 @@
alert: 'ThanosReplicateIsDown',
expr: |||
absent(up{%(selector)s})
||| % thanos.replicator,
||| % thanos.replicate,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -33,7 +33,7 @@
/ on (namespace) group_left
sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
) * 100 >= 10
||| % thanos.replicator,
||| % thanos.replicate,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -50,7 +50,7 @@
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0
)
||| % thanos.replicator,
||| % thanos.replicate,
'for': '5m',
labels: {
severity: 'critical',
Expand Down
1 change: 1 addition & 0 deletions mixin/thanos/dashboards/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
(import 'receive.libsonnet') +
(import 'rule.libsonnet') +
(import 'compact.libsonnet') +
(import 'replicate.libsonnet') +
(import 'overview.libsonnet') +
(import 'defaults.libsonnet')
29 changes: 14 additions & 15 deletions mixin/thanos/dashboards/replicate.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,35 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet';

{
local thanos = self,
replicator+:: {
replicate+:: {
jobPrefix: error 'must provide job prefix for Thanos Replicate dashboard',
selector: error 'must provide selector for Thanos Replicate dashboard',
title: error 'must provide title for Thanos Replicate dashboard',
},
grafanaDashboards+:: {
'replicate.json':
g.dashboard(thanos.replicator.title)
g.dashboard(thanos.replicate.title)
.addRow(
g.row('Replicate Runs')
.addPanel(
g.panel('Rate') +
g.qpsErrTotalPanel(
'thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}' % thanos.replicator,
'thanos_replicate_replication_runs_total{namespace="$namespace",%(selector)s}' % thanos.replicator,
'thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}' % thanos.replicate,
'thanos_replicate_replication_runs_total{namespace="$namespace",%(selector)s}' % thanos.replicate,
)
)
.addPanel(
g.panel('Errors', 'Shows rate of errors.') +
g.queryPanel(
'sum(rate(thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}[$interval])) by (result)' % thanos.replicator,
'sum(rate(thanos_replicate_replication_runs_total{result="error", namespace="$namespace",%(selector)s}[$interval])) by (result)' % thanos.replicate,
'{{result}}'
) +
{ yaxes: g.yaxes('percentunit') } +
g.stack
)
.addPanel(
g.panel('Duration', 'Shows how long has it taken to run a replication cycle.') +
g.latencyPanel('thanos_replicate_replication_run_duration_seconds', 'result="success", namespace="$namespace",%(selector)s' % thanos.replicator)
g.latencyPanel('thanos_replicate_replication_run_duration_seconds', 'result="success", namespace="$namespace",%(selector)s' % thanos.replicate)
)
)
.addRow(
Expand All @@ -39,20 +39,19 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet';
g.panel('Metrics') +
g.queryPanel(
[
'sum(rate(thanos_replicate_origin_iterations_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_origin_meta_loads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_origin_partial_meta_reads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_blocks_already_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_blocks_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_objects_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicator,
'sum(rate(thanos_replicate_origin_iterations_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
'sum(rate(thanos_replicate_origin_meta_loads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
'sum(rate(thanos_replicate_origin_partial_meta_reads_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
'sum(rate(thanos_replicate_blocks_already_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
'sum(rate(thanos_replicate_blocks_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
'sum(rate(thanos_replicate_objects_replicated_total{namespace="$namespace",%(selector)s}[$interval]))' % thanos.replicate,
],
['iterations', 'meta loads', 'partial meta reads', 'already replicated blocks', 'replicated blocks', 'replicated objects']
)
)
)
+
g.template('namespace', 'kube_pod_info') +
g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.replicator, true, '%(jobPrefix)s.*' % thanos.replicator),
g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.replicate, true, '%(jobPrefix)s.*' % thanos.replicate),
},
} +
(import 'defaults.libsonnet')
}
5 changes: 5 additions & 0 deletions mixin/thanos/defaults.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
selector: 'job=~"%s.*"' % self.jobPrefix,
title: '%(prefix)sSidecar' % $.dashboard.prefix,
},
replicate+:: {
jobPrefix: 'thanos-replicate',
selector: 'job=~"%s.*"' % self.jobPrefix,
title: '%(prefix)sReplicate' % $.dashboard.prefix,
},
overview+:: {
title: '%(prefix)sOverview' % $.dashboard.prefix,
},
Expand Down
2 changes: 1 addition & 1 deletion mixin/thanos/rules/replicate.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
local thanos = self,
replicator+:: {
replicate+:: {
selector: error 'must provide selector for Thanos Replicate dashboard',
},
prometheusRules+:: {
Expand Down
3 changes: 2 additions & 1 deletion mixin/thanos/rules/rules.libsonnet
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
(import 'query.libsonnet') +
(import 'receive.libsonnet') +
(import 'store.libsonnet')
(import 'store.libsonnet') +
(import 'replicate.libsonnet')

0 comments on commit 4bda22a

Please sign in to comment.