Skip to content

Commit

Permalink
Add config options for loki dashboards (#2617)
Browse files Browse the repository at this point in the history
  • Loading branch information
periklis authored Sep 18, 2020
1 parent 0b1dbe2 commit 899e8cc
Show file tree
Hide file tree
Showing 9 changed files with 1,123 additions and 260 deletions.
217 changes: 5 additions & 212 deletions production/loki-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,212 +1,5 @@
local g = import 'grafana-builder/grafana.libsonnet';
local utils = import 'mixin-utils/utils.libsonnet';

{
grafanaDashboards+: {
// Dashboards imported from JSON files that sit next to this mixin.
'loki-logs.json': import './dashboard-loki-logs.json',
'loki-operational.json': import './dashboard-loki-operational.json',
// 'Loki / Writes' dashboard: one row per component, each row holding a QPS
// panel and a latency panel.
'loki-writes.json':
  // Query shared by the `cluster` and `namespace` template variables.
  local lokiPods = 'kube_pod_container_info{image=~".*loki.*"}';
  // Regex matcher on `cluster`, reused by the recording-rule latency panels.
  local clusterRe = [utils.selector.re('cluster', '$cluster')];
  // Regex matcher on the ingester job, shared by the Ingester and BigTable rows.
  local ingesterJobRe = utils.selector.re('job', '($namespace)/ingester');
  // Route regex used by the gateway latency panel.
  local pushRoutes = 'api_prom_push|loki_api_v1_push';

  local gatewayRow =
    g.row('Frontend (cortex_gw)')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"api_prom_push|loki_api_v1_push"}')
    )
    .addPanel(
      g.panel('Latency') +
      utils.latencyRecordingRulePanel(
        'loki_request_duration_seconds',
        [
          utils.selector.re('job', '($namespace)/cortex-gw'),
          utils.selector.re('route', pushRoutes),
        ],
        extra_selectors=clusterRe
      )
    );

  local distributorRow =
    g.row('Distributor')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster=~"($cluster)", job=~"($namespace)/distributor"}')
    )
    .addPanel(
      g.panel('Latency') +
      utils.latencyRecordingRulePanel(
        'loki_request_duration_seconds',
        [utils.selector.re('job', '($namespace)/distributor')],
        extra_selectors=clusterRe
      )
    );

  local ingesterRow =
    g.row('Ingester')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/logproto.Pusher/Push"}')
    )
    .addPanel(
      g.panel('Latency') +
      utils.latencyRecordingRulePanel(
        'loki_request_duration_seconds',
        [
          ingesterJobRe,
          utils.selector.eq('route', '/logproto.Pusher/Push'),
        ],
        extra_selectors=clusterRe
      )
    );

  local bigtableRow =
    g.row('BigTable')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="/google.bigtable.v2.Bigtable/MutateRows"}')
    )
    .addPanel(
      g.panel('Latency') +
      // Here the cluster matcher is part of the main selector list rather
      // than `extra_selectors`, exactly as in the other rows' counterpart.
      utils.latencyRecordingRulePanel(
        'cortex_bigtable_request_duration_seconds',
        clusterRe + [
          ingesterJobRe,
          utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows'),
        ]
      )
    );

  local boltdbShipperRow =
    g.row('BoltDB Shipper')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_boltdb_shipper_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="WRITE"}')
    )
    .addPanel(
      g.panel('Latency') +
      g.latencyPanel('loki_boltdb_shipper_request_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/ingester", operation="WRITE"}')
    );

  g.dashboard('Loki / Writes')
  .addMultiTemplate('cluster', lokiPods, 'cluster')
  .addMultiTemplate('namespace', lokiPods, 'namespace')
  .addRow(gatewayRow)
  .addRow(distributorRow)
  .addRow(ingesterRow)
  .addRow(bigtableRow)
  .addRow(boltdbShipperRow),

// Regex alternation of HTTP route names; interpolated into the `route=~` matchers
// of the gateway, query-frontend and querier panels of the reads dashboard.
local http_routes = 'loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values',
// Regex alternation of gRPC routes; interpolated into the `route=~` matchers of the
// ingester panels of the reads dashboard.
local grpc_routes = '/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series',

// 'Loki / Reads' dashboard: one row per component, each row holding a QPS
// panel and a latency panel.
'loki-reads.json':
  // Query shared by the `cluster` and `namespace` template variables.
  local lokiPods = 'kube_pod_container_info{image=~".*loki.*"}';

  // Latency panel used by the first four rows: exact match on `job`, regex
  // match on `route`, exact match on `cluster`, summed by route.
  local routeLatency(job, routes) =
    utils.latencyRecordingRulePanel(
      'loki_request_duration_seconds',
      [
        utils.selector.eq('job', job),
        utils.selector.re('route', routes),
      ],
      extra_selectors=[utils.selector.eq('cluster', '$cluster')],
      sum_by=['route']
    );

  local gatewayRow =
    g.row('Frontend (cortex_gw)')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route=~"%s"}' % http_routes)
    )
    .addPanel(g.panel('Latency') + routeLatency('$namespace/cortex-gw', http_routes));

  local queryFrontendRow =
    g.row('Frontend (query-frontend)')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster="$cluster", job="$namespace/query-frontend", route=~"%s"}' % http_routes)
    )
    .addPanel(g.panel('Latency') + routeLatency('$namespace/query-frontend', http_routes));

  local querierRow =
    g.row('Querier')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster="$cluster", job="$namespace/querier", route=~"%s"}' % http_routes)
    )
    .addPanel(g.panel('Latency') + routeLatency('$namespace/querier', http_routes));

  local ingesterRow =
    g.row('Ingester')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route=~"%s"}' % grpc_routes)
    )
    .addPanel(g.panel('Latency') + routeLatency('$namespace/ingester', grpc_routes));

  local bigtableRow =
    g.row('BigTable')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}')
    )
    .addPanel(
      g.panel('Latency') +
      utils.latencyRecordingRulePanel(
        'cortex_bigtable_request_duration_seconds',
        [
          utils.selector.re('cluster', '$cluster'),
          utils.selector.re('job', '($namespace)/querier'),
          utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows'),
        ]
      )
    );

  local boltdbShipperRow =
    g.row('BoltDB Shipper')
    .addPanel(
      g.panel('QPS') +
      g.qpsPanel('loki_boltdb_shipper_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="QUERY"}')
    )
    .addPanel(
      g.panel('Latency') +
      g.latencyPanel('loki_boltdb_shipper_request_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/querier", operation="QUERY"}')
    );

  g.dashboard('Loki / Reads')
  .addTemplate('cluster', lokiPods, 'cluster')
  .addTemplate('namespace', lokiPods, 'namespace')
  .addRow(gatewayRow)
  .addRow(queryFrontendRow)
  .addRow(querierRow)
  .addRow(ingesterRow)
  .addRow(bigtableRow)
  .addRow(boltdbShipperRow),


// 'Loki / Chunks' dashboard: ingester chunk, flush and chunk-duration panels.
'loki-chunks.json':
  // Query shared by the `cluster` and `namespace` template variables.
  local lokiPods = 'kube_pod_container_info{image=~".*loki.*"}';
  // Label matchers applied to every ingester metric on this dashboard.
  // Keeping them in one place guarantees that both sides of each ratio are
  // filtered identically.
  local ingester = 'cluster="$cluster", job="$namespace/ingester"';

  g.dashboard('Loki / Chunks')
  .addTemplate('cluster', lokiPods, 'cluster')
  .addTemplate('namespace', lokiPods, 'namespace')
  .addRow(
    g.row('Active Series / Chunks')
    .addPanel(
      g.panel('Series') +
      g.queryPanel('sum(loki_ingester_memory_chunks{%s})' % ingester, 'series'),
    )
    .addPanel(
      g.panel('Chunks per series') +
      // Fix: the denominator previously matched only on `job`, so chunks of
      // the selected cluster were divided by streams from every cluster that
      // has the same namespace. Both sides now use the same matchers.
      g.queryPanel(
        'sum(loki_ingester_memory_chunks{%s}) / sum(loki_ingester_memory_streams{%s})' % [ingester, ingester],
        'chunks'
      ),
    )
  )
  .addRow(
    g.row('Flush Stats')
    .addPanel(
      g.panel('Utilization') +
      g.latencyPanel('loki_ingester_chunk_utilization', '{%s}' % ingester, multiplier='1') +
      { yaxes: g.yaxes('percentunit') },
    )
    .addPanel(
      g.panel('Age') +
      g.latencyPanel('loki_ingester_chunk_age_seconds', '{%s}' % ingester),
    ),
  )
  .addRow(
    g.row('Flush Stats')
    .addPanel(
      g.panel('Size') +
      g.latencyPanel('loki_ingester_chunk_entries', '{%s}' % ingester, multiplier='1') +
      { yaxes: g.yaxes('short') },
    )
    .addPanel(
      g.panel('Entries') +
      g.queryPanel(
        'sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [ingester, ingester],
        'entries'
      ),
    ),
  )
  .addRow(
    g.row('Flush Stats')
    .addPanel(
      g.panel('Queue Length') +
      g.queryPanel('cortex_ingester_flush_queue_length{%s}' % ingester, '{{pod}}'),
    )
    .addPanel(
      g.panel('Flush Rate') +
      g.qpsPanel('loki_ingester_chunk_age_seconds_count{%s}' % ingester),
    ),
  )
  .addRow(
    g.row('Duration')
    .addPanel(
      g.panel('Chunk Duration hours (end-start)') +
      // Three series: median, 99th percentile and mean. The legends in the
      // second list are given in the same order as the queries.
      g.queryPanel(
        [
          'histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % ingester,
          'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % ingester,
          'sum(rate(loki_ingester_chunk_bounds_hours_sum{%s}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{%s}[5m]))' % [ingester, ingester],
        ],
        [
          'p50',
          'p99',
          'avg',
        ],
      ),
    )
  ),
},
}
// Compose the dashboards mixin by merging the per-dashboard files with `+`.
// NOTE(review): presumably each imported file contributes to `grafanaDashboards+:`;
// only loki-chunks.libsonnet is visible here — confirm for the others.
(import 'dashboards/loki-chunks.libsonnet') +
(import 'dashboards/loki-logs.libsonnet') +
(import 'dashboards/loki-operational.libsonnet') +
(import 'dashboards/loki-reads.libsonnet') +
(import 'dashboards/loki-writes.libsonnet')
172 changes: 172 additions & 0 deletions production/loki-mixin/dashboards/loki-chunks.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
local g = import 'grafana-builder/grafana.libsonnet';
local utils = import 'mixin-utils/utils.libsonnet';

{
  grafanaDashboards+: {
    local dashboards = self,
    // Rendered ingester selector of the final (possibly overridden) dashboard
    // object. It is resolved through `self`, so overrides of the hidden
    // config fields below are reflected in every panel query.
    local ingester = dashboards['loki-chunks.json'].ingesterSelector,

    // 'Loki / Chunks' dashboard. The first object holds hidden (`::`)
    // configuration fields; the dashboard built with grafana-builder is
    // merged on top of it.
    'loki-chunks.json': {
      local cfg = self,

      // When true, a `cluster` template variable is added and every query
      // is filtered by `clusterLabel`.
      showMultiCluster:: true,
      clusterLabel:: 'cluster',
      clusterMatchers::
        if cfg.showMultiCluster then
          [utils.selector.re(cfg.clusterLabel, '$cluster')]
        else
          [],

      // 'query' resolves the namespace variable through `label_values(...)`;
      // 'custom' uses `namespaceQuery` verbatim as the variable's value.
      namespaceType:: 'query',
      namespaceQuery::
        if cfg.showMultiCluster then
          // Fix: honour the configurable `clusterLabel` instead of the
          // hard-coded label name `cluster`, so the namespace variable keeps
          // working when the label is renamed.
          'kube_pod_container_info{%s="$cluster", image=~".*loki.*"}' % cfg.clusterLabel
        else
          'kube_pod_container_info{image=~".*loki.*"}',

      assert (cfg.namespaceType == 'custom' || cfg.namespaceType == 'query') : "Only types 'query' and 'custom' are allowed for dashboard variable 'namespace'",

      // Per-component label matchers, keyed by matcher id.
      matchers:: {
        ingester: [utils.selector.re('job', '($namespace)/ingester')],
      },

      // Renders a list of matcher objects as a comma-separated selector body.
      local render(matcherList) =
        std.join(',', ['%(label)s%(op)s"%(value)s"' % matcher for matcher in matcherList]),

      // Cluster matchers (if any) followed by the matchers registered under `matcherId`.
      local selector(matcherId) =
        render(cfg.clusterMatchers + cfg.matchers[matcherId]),

      ingesterSelector:: selector('ingester'),
      // Ingester matchers without the cluster matchers. Retained for backward
      // compatibility; the panels in this file no longer use it.
      ingesterSelectorOnly:: render(cfg.matchers.ingester),

      // Descriptions of the dashboard template variables; expanded into
      // `templating.list` entries at the bottom of this file.
      templateLabels:: (
        if cfg.showMultiCluster then [
          {
            variable:: 'cluster',
            label:: cfg.clusterLabel,
            query:: 'kube_pod_container_info{image=~".*loki.*"}',
            type:: 'query',
          },
        ] else []
      ) + [
        {
          variable:: 'namespace',
          label:: 'namespace',
          query:: cfg.namespaceQuery,
          type:: cfg.namespaceType,
        },
      ],
    } +
    g.dashboard('Loki / Chunks')
    .addRow(
      g.row('Active Series / Chunks')
      .addPanel(
        g.panel('Series') +
        g.queryPanel('sum(loki_ingester_memory_chunks{%s})' % ingester, 'series'),
      )
      .addPanel(
        g.panel('Chunks per series') +
        // Fix: the denominator used `ingesterSelectorOnly`, which drops the
        // cluster matcher, so chunks of the selected cluster were divided by
        // streams from every cluster. Both sides now use the same selector.
        g.queryPanel(
          'sum(loki_ingester_memory_chunks{%s}) / sum(loki_ingester_memory_streams{%s})' % [
            ingester,
            ingester,
          ],
          'chunks'
        ),
      )
    )
    .addRow(
      g.row('Flush Stats')
      .addPanel(
        g.panel('Utilization') +
        g.latencyPanel('loki_ingester_chunk_utilization', '{%s}' % ingester, multiplier='1') +
        { yaxes: g.yaxes('percentunit') },
      )
      .addPanel(
        g.panel('Age') +
        g.latencyPanel('loki_ingester_chunk_age_seconds', '{%s}' % ingester),
      ),
    )
    .addRow(
      g.row('Flush Stats')
      .addPanel(
        g.panel('Size') +
        g.latencyPanel('loki_ingester_chunk_entries', '{%s}' % ingester, multiplier='1') +
        { yaxes: g.yaxes('short') },
      )
      .addPanel(
        g.panel('Entries') +
        g.queryPanel(
          'sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [
            ingester,
            ingester,
          ],
          'entries'
        ),
      ),
    )
    .addRow(
      g.row('Flush Stats')
      .addPanel(
        g.panel('Queue Length') +
        g.queryPanel('cortex_ingester_flush_queue_length{%s}' % ingester, '{{pod}}'),
      )
      .addPanel(
        g.panel('Flush Rate') +
        g.qpsPanel('loki_ingester_chunk_age_seconds_count{%s}' % ingester),
      ),
    )
    .addRow(
      g.row('Duration')
      .addPanel(
        g.panel('Chunk Duration hours (end-start)') +
        // Three series: median, 99th percentile and mean. The legends in the
        // second list are given in the same order as the queries.
        g.queryPanel(
          [
            'histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % ingester,
            'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % ingester,
            'sum(rate(loki_ingester_chunk_bounds_hours_sum{%s}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{%s}[5m]))' % [
              ingester,
              ingester,
            ],
          ],
          [
            'p50',
            'p99',
            'avg',
          ],
        ),
      )
    ) {
      // Append one Grafana template variable per `templateLabels` entry.
      templating+: {
        list+: [
          {
            allValue: null,
            // A 'custom' variable is preselected with its literal query string.
            current:
              if l.type == 'custom' then {
                text: l.query,
                value: l.query,
              } else {},
            datasource: '$datasource',
            hide: 0,
            includeAll: false,
            label: l.variable,
            multi: false,
            name: l.variable,
            options: [],
            // A 'query' variable is wrapped in `label_values(<query>, <label>)`;
            // any other type passes the query through unchanged.
            query:
              if l.type == 'query' then
                'label_values(%s, %s)' % [l.query, l.label]
              else
                l.query,
            refresh: 1,
            regex: '',
            sort: 2,
            tagValuesQuery: '',
            tags: [],
            tagsQuery: '',
            type: l.type,
            useTags: false,
          }
          for l in dashboards['loki-chunks.json'].templateLabels
        ],
      },
    },
  },
}
Loading

0 comments on commit 899e8cc

Please sign in to comment.