Skip to content

Commit

Permalink
tdg_dashboard: add GraphQL requests section
Browse files Browse the repository at this point in the history
Add "TDG GraphQL requests" section to TDG dashboard templates.
It consists of panels with the following metrics:
- tdg_graphql_query_time_sum
- tdg_graphql_query_time_count
- tdg_graphql_query_fail
- tdg_graphql_mutation_time_sum
- tdg_graphql_mutation_time_count
- tdg_graphql_mutation_fail

Part of #134
  • Loading branch information
DifferentialOrange committed Jun 8, 2022
1 parent f08e333 commit 05a8923
Show file tree
Hide file tree
Showing 8 changed files with 1,711 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- expirationd panels for TDG dashboard
- Tuples panels for TDG dashboard
- File connectors panels for TDG dashboard
- GraphQL requests panels for TDG dashboard


## [1.1.0] - 2022-05-17
Expand Down
6 changes: 6 additions & 0 deletions dashboard/influxdb_tdg_dashboard.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,10 @@ dashboard.new(
policy=variable.influxdb.policy,
measurement=variable.influxdb.measurement,
)
).addPanels(
section.tdg_graphql(
datasource=variable.datasource.influxdb,
policy=variable.influxdb.policy,
measurement=variable.influxdb.measurement,
)
)
229 changes: 229 additions & 0 deletions dashboard/panels/tdg/graphql.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
local grafana = import 'grafonnet/grafana.libsonnet';

local common_utils = import '../common.libsonnet';

local influxdb = grafana.influxdb;
local prometheus = grafana.prometheus;

{
// Collapsible row header that groups the TDG GraphQL request panels below.
row:: common_utils.row('TDG GraphQL requests'),

local rps_target(
  datasource,
  metric_name,
  job=null,
  rate_time_range=null,
  policy=null,
  measurement=null,
) =
  // Per-second rate target for a cumulative counter metric.
  // Prometheus computes the rate server-side with rate();
  // InfluxDB derives it with a non-negative derivative over the raw values.
  // Results are grouped by GraphQL operation labels so each
  // (operation, schema, entity, instance) combination gets its own series.
  if datasource == '${DS_PROMETHEUS}' then
    prometheus.target(
      expr=std.format('rate(%s{job=~"%s"}[%s])',
        [metric_name, job, rate_time_range]),
      legendFormat='{{operation_name}} ({{schema}}, {{entity}}) — {{alias}}',
    )
  else if datasource == '${DS_INFLUXDB}' then
    local graphql_group_tags = [
      'label_pairs_alias',
      'label_pairs_operation_name',
      'label_pairs_schema',
      'label_pairs_entity',
    ];
    influxdb.target(
      policy=policy,
      measurement=measurement,
      group_tags=graphql_group_tags,
      alias='$tag_label_pairs_operation_name ($tag_label_pairs_schema, $tag_label_pairs_entity) — $tag_label_pairs_alias',
    )
    .where('metric_name', '=', metric_name)
    .selectField('value')
    .addConverter('mean')
    .addConverter('non_negative_derivative', ['1s']),

local average_target(
  datasource,
  metric_name,
  job=null,
  policy=null,
  measurement=null,
) =
  // Average latency target computed as <metric>_sum / <metric>_count.
  // Prometheus divides the two series directly; InfluxDB joins two
  // subqueries (one per collector field) and divides their means,
  // grouped by the GraphQL operation label tags.
  if datasource == '${DS_PROMETHEUS}' then
    prometheus.target(
      expr=std.format(
        |||
          %(metric_name_sum)s{job=~"%(job)s"} /
          %(metric_name_count)s{job=~"%(job)s"}
        |||,
        {
          metric_name_sum: std.join('_', [metric_name, 'sum']),
          metric_name_count: std.join('_', [metric_name, 'count']),
          job: job,
        }
      ),
      legendFormat='{{operation_name}} ({{schema}}, {{entity}}) — {{alias}}'
    )
  else if datasource == '${DS_INFLUXDB}' then
    influxdb.target(
      rawQuery=true,
      query=std.format(|||
        SELECT mean("%(metric_name_sum)s") / mean("%(metric_name_count)s")
        as "average" FROM
        (SELECT "value" as "%(metric_name_sum)s" FROM %(policy_prefix)s"%(measurement)s"
        WHERE ("metric_name" = '%(metric_name_sum)s') AND $timeFilter),
        (SELECT "value" as "%(metric_name_count)s" FROM %(policy_prefix)s"%(measurement)s"
        WHERE ("metric_name" = '%(metric_name_count)s') AND $timeFilter)
        GROUP BY time($__interval), "label_pairs_alias", "label_pairs_operation_name",
        "label_pairs_schema", "label_pairs_entity" fill(none)
      |||, {
        metric_name_sum: std.join('_', [metric_name, 'sum']),
        metric_name_count: std.join('_', [metric_name, 'count']),
        // BUGFIX: std.format with a named %(policy)s placeholder requires an
        // object argument; passing the bare `policy` string raised a runtime
        // error for any non-default retention policy.
        policy_prefix: if policy == 'default' then '' else std.format('"%(policy)s".', { policy: policy }),
        measurement: measurement,
      }),
      alias='$tag_label_pairs_operation_name ($tag_label_pairs_schema, $tag_label_pairs_entity) — $tag_label_pairs_alias'
    ),

query_success_rps(
  title='Success queries',
  description=common_utils.rate_warning(|||
    A number of successfully executed GraphQL queries.
    Graph shows mean requests per second.
  |||, datasource),
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
  rate_time_range=null,
)::
  // Graph panel: per-second rate of successfully executed GraphQL queries,
  // derived from the tdg_graphql_query_time_count counter.
  local panel = common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='request per second',
  );
  panel.addTarget(rps_target(
    datasource=datasource,
    metric_name='tdg_graphql_query_time_count',
    job=job,
    rate_time_range=rate_time_range,
    policy=policy,
    measurement=measurement,
  )),

query_success_latency(
  title='Success query latency',
  // FIX: corrected grammar in the user-facing panel description
  // ("Only success requests are count" -> "Only successful requests are counted").
  description=|||
    Average time of GraphQL query execution.
    Only successful requests are counted.
  |||,
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
)::
  // Graph panel: mean execution time of successful GraphQL queries,
  // shown in microseconds (tdg_graphql_query_time_sum / _count).
  common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='average',
    format='µs',
  ).addTarget(average_target(
    datasource,
    'tdg_graphql_query_time',
    job,
    policy,
    measurement,
  )),

query_error_rps(
  title='Error queries',
  description=common_utils.rate_warning(|||
    A number of GraphQL queries failed to execute.
    Graph shows mean requests per second.
  |||, datasource),
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
  rate_time_range=null,
)::
  // Graph panel: per-second rate of failed GraphQL queries,
  // derived from the tdg_graphql_query_fail counter.
  local panel = common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='request per second',
  );
  panel.addTarget(rps_target(
    datasource=datasource,
    metric_name='tdg_graphql_query_fail',
    job=job,
    rate_time_range=rate_time_range,
    policy=policy,
    measurement=measurement,
  )),

mutation_success_rps(
  title='Success mutations',
  description=common_utils.rate_warning(|||
    A number of successfully executed GraphQL mutations.
    Graph shows mean requests per second.
  |||, datasource),
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
  rate_time_range=null,
)::
  // Graph panel: per-second rate of successfully executed GraphQL mutations,
  // derived from the tdg_graphql_mutation_time_count counter.
  local panel = common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='request per second',
  );
  panel.addTarget(rps_target(
    datasource=datasource,
    metric_name='tdg_graphql_mutation_time_count',
    job=job,
    rate_time_range=rate_time_range,
    policy=policy,
    measurement=measurement,
  )),

mutation_success_latency(
  title='Success mutation latency',
  // FIX: corrected grammar in the user-facing panel description
  // ("Only success requests are count" -> "Only successful requests are counted").
  description=|||
    Average time of GraphQL mutation execution.
    Only successful requests are counted.
  |||,
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
)::
  // Graph panel: mean execution time of successful GraphQL mutations,
  // shown in microseconds (tdg_graphql_mutation_time_sum / _count).
  common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='average',
    format='µs',
  ).addTarget(average_target(
    datasource,
    'tdg_graphql_mutation_time',
    job,
    policy,
    measurement,
  )),

mutation_error_rps(
  title='Error mutations',
  description=common_utils.rate_warning(|||
    A number of GraphQL mutations failed to execute.
    Graph shows mean requests per second.
  |||, datasource),
  datasource=null,
  policy=null,
  measurement=null,
  job=null,
  rate_time_range=null,
)::
  // Graph panel: per-second rate of failed GraphQL mutations,
  // derived from the tdg_graphql_mutation_fail counter.
  local panel = common_utils.default_graph(
    title=title,
    description=description,
    datasource=datasource,
    labelY1='request per second',
  );
  panel.addTarget(rps_target(
    datasource=datasource,
    metric_name='tdg_graphql_mutation_fail',
    job=job,
    rate_time_range=rate_time_range,
    policy=policy,
    measurement=measurement,
  )),
}
6 changes: 6 additions & 0 deletions dashboard/prometheus_tdg_dashboard.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,10 @@ dashboard.new(
job=variable.prometheus.job,
rate_time_range=variable.prometheus.rate_time_range,
)
).addPanels(
section.tdg_graphql(
datasource=variable.datasource.prometheus,
job=variable.prometheus.job,
rate_time_range=variable.prometheus.rate_time_range,
)
)
51 changes: 51 additions & 0 deletions dashboard/section.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ local vinyl = import 'panels/vinyl.libsonnet';

local tdg_expirationd = import 'panels/tdg/expirationd.libsonnet';
local tdg_file_connectors = import 'panels/tdg/file_connectors.libsonnet';
local tdg_graphql = import 'panels/tdg/graphql.libsonnet';
local tdg_kafka_brokers = import 'panels/tdg/kafka/brokers.libsonnet';
local tdg_kafka_common = import 'panels/tdg/kafka/common.libsonnet';
local tdg_kafka_consumer = import 'panels/tdg/kafka/consumer.libsonnet';
Expand Down Expand Up @@ -1789,4 +1790,54 @@ local tdg_tuples = import 'panels/tdg/tuples.libsonnet';
job=job,
),
],

// Build the "TDG GraphQL requests" dashboard section: a row header followed
// by six panels — success rate, success latency and error rate, for both
// GraphQL queries and mutations.
// For an InfluxDB datasource pass `policy` and `measurement`; for a
// Prometheus datasource pass `job` and `rate_time_range`. Options for the
// other datasource are simply left null.
tdg_graphql(datasource, policy=null, measurement=null, job=null, rate_time_range=null):: [
tdg_graphql.row,

tdg_graphql.query_success_rps(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
rate_time_range=rate_time_range,
),

tdg_graphql.query_success_latency(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
),

tdg_graphql.query_error_rps(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
rate_time_range=rate_time_range,
),

tdg_graphql.mutation_success_rps(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
rate_time_range=rate_time_range,
),

tdg_graphql.mutation_success_latency(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
),

tdg_graphql.mutation_error_rps(
datasource=datasource,
policy=policy,
measurement=measurement,
job=job,
rate_time_range=rate_time_range,
),
],
}
5 changes: 4 additions & 1 deletion example_cluster/telegraf/telegraf.tdg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
"label_pairs_request",
"label_pairs_kind",
"label_pairs_thread_name",
"label_pairs_type_name"
"label_pairs_type_name",
"label_pairs_operation_name",
"label_pairs_schema",
"label_pairs_entity"
]
insecure_skip_verify = true
interval = "10s"
Expand Down
Loading

0 comments on commit 05a8923

Please sign in to comment.