Skip to content

Commit 88c1053

Browse files
committed
Added units/descriptions to Core metrics
1 parent 25ca22e commit 88c1053

File tree

5 files changed

+171
-39
lines changed

5 files changed

+171
-39
lines changed

client/src/metrics.rs

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ use std::{
66
time::{Duration, Instant},
77
};
88
use temporal_sdk_core_api::telemetry::metrics::{
9-
CoreMeter, Counter, Histogram, MetricAttributes, MetricKeyValue, TemporalMeter,
9+
CoreMeter, Counter, Histogram, MetricAttributes, MetricKeyValue, MetricParameters,
10+
TemporalMeter,
1011
};
1112
use tonic::{body::BoxBody, transport::Channel};
1213
use tower::Service;
@@ -35,12 +36,36 @@ impl MetricsContext {
3536
Self {
3637
kvs: meter.new_attributes(tm.default_attribs),
3738
poll_is_long: false,
38-
svc_request: meter.counter("request".into()),
39-
svc_request_failed: meter.counter("request_failure".into()),
40-
long_svc_request: meter.counter("long_request".into()),
41-
long_svc_request_failed: meter.counter("long_request_failure".into()),
42-
svc_request_latency: meter.histogram("request_latency".into()),
43-
long_svc_request_latency: meter.histogram("long_request_latency".into()),
39+
svc_request: meter.counter(MetricParameters {
40+
name: "request".into(),
41+
description: "Count of client request successes by rpc name".into(),
42+
unit: "".into(),
43+
}),
44+
svc_request_failed: meter.counter(MetricParameters {
45+
name: "request_failure".into(),
46+
description: "Count of client request failures by rpc name".into(),
47+
unit: "".into(),
48+
}),
49+
long_svc_request: meter.counter(MetricParameters {
50+
name: "long_request".into(),
51+
description: "Count of long-poll request successes by rpc name".into(),
52+
unit: "".into(),
53+
}),
54+
long_svc_request_failed: meter.counter(MetricParameters {
55+
name: "long_request_failure".into(),
56+
description: "Count of long-poll request failures by rpc name".into(),
57+
unit: "".into(),
58+
}),
59+
svc_request_latency: meter.histogram(MetricParameters {
60+
name: "request_latency".into(),
61+
unit: "ms".into(),
62+
description: "Histogram of client request latencies".into(),
63+
}),
64+
long_svc_request_latency: meter.histogram(MetricParameters {
65+
name: "long_request_latency".into(),
66+
unit: "ms".into(),
67+
description: "Histogram of client long-poll request latencies".into(),
68+
}),
4469
}
4570
}
4671

core-api/src/telemetry.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ pub struct PrometheusExporterOptions {
8282
/// A prefix to be applied to all metrics. Defaults to "temporal_".
8383
#[builder(default = "METRIC_PREFIX")]
8484
pub metric_prefix: &'static str,
85+
/// If set true, all counters will include a "_total" suffix
86+
#[builder(default = "false")]
87+
pub counters_total_suffix: bool,
88+
/// If set true, all histograms will include the unit in their name as a suffix.
89+
/// Ex: "_milliseconds".
90+
#[builder(default = "false")]
91+
pub unit_suffix: bool,
8592
}
8693

8794
/// Configuration for the external export of traces

core/src/telemetry/metrics.rs

Lines changed: 119 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -262,34 +262,125 @@ impl MetricsContext {
262262
impl Instruments {
263263
fn new(meter: &dyn CoreMeter) -> Self {
264264
Self {
265-
wf_completed_counter: meter.counter("workflow_completed".into()),
266-
wf_canceled_counter: meter.counter("workflow_canceled".into()),
267-
wf_failed_counter: meter.counter("workflow_failed".into()),
268-
wf_cont_counter: meter.counter("workflow_continue_as_new".into()),
269-
wf_e2e_latency: meter.histogram(WF_E2E_LATENCY_NAME.into()),
270-
wf_task_queue_poll_empty_counter: meter
271-
.counter("workflow_task_queue_poll_empty".into()),
272-
wf_task_queue_poll_succeed_counter: meter
273-
.counter("workflow_task_queue_poll_succeed".into()),
274-
wf_task_execution_failure_counter: meter
275-
.counter("workflow_task_execution_failed".into()),
276-
wf_task_sched_to_start_latency: meter
277-
.histogram(WF_TASK_SCHED_TO_START_LATENCY_NAME.into()),
278-
wf_task_replay_latency: meter.histogram(WF_TASK_REPLAY_LATENCY_NAME.into()),
279-
wf_task_execution_latency: meter.histogram(WF_TASK_EXECUTION_LATENCY_NAME.into()),
280-
act_poll_no_task: meter.counter("activity_poll_no_task".into()),
281-
act_task_received_counter: meter.counter("activity_task_received".into()),
282-
act_execution_failed: meter.counter("activity_execution_failed".into()),
283-
act_sched_to_start_latency: meter.histogram(ACT_SCHED_TO_START_LATENCY_NAME.into()),
284-
act_exec_latency: meter.histogram(ACT_EXEC_LATENCY_NAME.into()),
265+
wf_completed_counter: meter.counter(MetricParameters {
266+
name: "workflow_completed".into(),
267+
description: "Count of successfully completed workflows".into(),
268+
unit: "".into(),
269+
}),
270+
wf_canceled_counter: meter.counter(MetricParameters {
271+
name: "workflow_canceled".into(),
272+
description: "Count of canceled workflows".into(),
273+
unit: "".into(),
274+
}),
275+
wf_failed_counter: meter.counter(MetricParameters {
276+
name: "workflow_failed".into(),
277+
description: "Count of failed workflows".into(),
278+
unit: "".into(),
279+
}),
280+
wf_cont_counter: meter.counter(MetricParameters {
281+
name: "workflow_continue_as_new".into(),
282+
description: "Count of continued-as-new workflows".into(),
283+
unit: "".into(),
284+
}),
285+
wf_e2e_latency: meter.histogram(MetricParameters {
286+
name: WF_E2E_LATENCY_NAME.into(),
287+
unit: "ms".into(),
288+
description: "Histogram of total workflow execution latencies".into(),
289+
}),
290+
wf_task_queue_poll_empty_counter: meter.counter(MetricParameters {
291+
name: "workflow_task_queue_poll_empty".into(),
292+
description: "Count of workflow task queue poll timeouts (no new task)".into(),
293+
unit: "".into(),
294+
}),
295+
wf_task_queue_poll_succeed_counter: meter.counter(MetricParameters {
296+
name: "workflow_task_queue_poll_succeed".into(),
297+
description: "Count of workflow task queue poll successes".into(),
298+
unit: "".into(),
299+
}),
300+
wf_task_execution_failure_counter: meter.counter(MetricParameters {
301+
name: "workflow_task_execution_failed".into(),
302+
description: "Count of workflow task execution failures".into(),
303+
unit: "".into(),
304+
}),
305+
wf_task_sched_to_start_latency: meter.histogram(MetricParameters {
306+
name: WF_TASK_SCHED_TO_START_LATENCY_NAME.into(),
307+
unit: "ms".into(),
308+
description: "Histogram of workflow task schedule-to-start latencies".into(),
309+
}),
310+
wf_task_replay_latency: meter.histogram(MetricParameters {
311+
name: WF_TASK_REPLAY_LATENCY_NAME.into(),
312+
unit: "ms".into(),
313+
description: "Histogram of workflow task replay latencies".into(),
314+
}),
315+
wf_task_execution_latency: meter.histogram(MetricParameters {
316+
name: WF_TASK_EXECUTION_LATENCY_NAME.into(),
317+
unit: "ms".into(),
318+
description: "Histogram of workflow task execution (not replay) latencies".into(),
319+
}),
320+
act_poll_no_task: meter.counter(MetricParameters {
321+
name: "activity_poll_no_task".into(),
322+
description: "Count of activity task queue poll timeouts (no new task)".into(),
323+
unit: "".into(),
324+
}),
325+
act_task_received_counter: meter.counter(MetricParameters {
326+
name: "activity_task_received".into(),
327+
description: "Count of activity task queue poll successes".into(),
328+
unit: "".into(),
329+
}),
330+
act_execution_failed: meter.counter(MetricParameters {
331+
name: "activity_execution_failed".into(),
332+
description: "Count of activity task execution failures".into(),
333+
unit: "".into(),
334+
}),
335+
act_sched_to_start_latency: meter.histogram(MetricParameters {
336+
name: ACT_SCHED_TO_START_LATENCY_NAME.into(),
337+
unit: "ms".into(),
338+
description: "Histogram of activity schedule-to-start latencies".into(),
339+
}),
340+
act_exec_latency: meter.histogram(MetricParameters {
341+
name: ACT_EXEC_LATENCY_NAME.into(),
342+
unit: "ms".into(),
343+
description: "Histogram of activity execution latencies".into(),
344+
}),
285345
// name kept as worker start for compat with old sdk / what users expect
286-
worker_registered: meter.counter("worker_start".into()),
287-
num_pollers: meter.gauge(NUM_POLLERS_NAME.into()),
288-
task_slots_available: meter.gauge(TASK_SLOTS_AVAILABLE_NAME.into()),
289-
sticky_cache_hit: meter.counter("sticky_cache_hit".into()),
290-
sticky_cache_miss: meter.counter("sticky_cache_miss".into()),
291-
sticky_cache_size: meter.gauge(STICKY_CACHE_SIZE_NAME.into()),
292-
sticky_cache_evictions: meter.counter("sticky_cache_total_forced_eviction".into()),
346+
worker_registered: meter.counter(MetricParameters {
347+
name: "worker_start".into(),
348+
description: "Count of the number of initialized workers".into(),
349+
unit: "".into(),
350+
}),
351+
num_pollers: meter.gauge(MetricParameters {
352+
name: NUM_POLLERS_NAME.into(),
353+
description: "Current number of active pollers per queue type".into(),
354+
unit: "".into(),
355+
}),
356+
task_slots_available: meter.gauge(MetricParameters {
357+
name: TASK_SLOTS_AVAILABLE_NAME.into(),
358+
description: "Current number of available slots per task type".into(),
359+
unit: "".into(),
360+
}),
361+
sticky_cache_hit: meter.counter(MetricParameters {
362+
name: "sticky_cache_hit".into(),
363+
description: "Count of times the workflow cache was used for a new workflow task"
364+
.into(),
365+
unit: "".into(),
366+
}),
367+
sticky_cache_miss: meter.counter(MetricParameters {
368+
name: "sticky_cache_miss".into(),
369+
description:
370+
"Count of times the workflow cache was missing a workflow for a sticky task"
371+
.into(),
372+
unit: "".into(),
373+
}),
374+
sticky_cache_size: meter.gauge(MetricParameters {
375+
name: STICKY_CACHE_SIZE_NAME.into(),
376+
description: "Current number of cached workflows".into(),
377+
unit: "".into(),
378+
}),
379+
sticky_cache_evictions: meter.counter(MetricParameters {
380+
name: "sticky_cache_total_forced_eviction".into(),
381+
description: "Count of evictions of cached workflows".into(),
382+
unit: "".into(),
383+
}),
293384
}
294385
}
295386
}
@@ -529,7 +620,7 @@ pub struct StartedPromServer {
529620
pub fn start_prometheus_metric_exporter(
530621
opts: PrometheusExporterOptions,
531622
) -> Result<StartedPromServer, anyhow::Error> {
532-
let (srv, exporter) = PromServer::new(opts.socket_addr, SDKAggSelector::default())?;
623+
let (srv, exporter) = PromServer::new(&opts, SDKAggSelector::default())?;
533624
let meter_provider = augment_meter_provider_with_defaults(
534625
MeterProvider::builder().with_reader(exporter),
535626
&opts.global_tags,

core/src/telemetry/prometheus_server.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use opentelemetry_prometheus::PrometheusExporter;
88
use opentelemetry_sdk::metrics::reader::AggregationSelector;
99
use prometheus::{Encoder, Registry, TextEncoder};
1010
use std::{convert::Infallible, net::SocketAddr};
11+
use temporal_sdk_core_api::telemetry::PrometheusExporterOptions;
1112

1213
/// Exposes prometheus metrics for scraping
1314
pub(super) struct PromServer {
@@ -17,16 +18,25 @@ pub(super) struct PromServer {
1718

1819
impl PromServer {
1920
pub fn new(
20-
addr: SocketAddr,
21+
opts: &PrometheusExporterOptions,
2122
aggregation: impl AggregationSelector + Send + Sync + 'static,
2223
) -> Result<(Self, PrometheusExporter), anyhow::Error> {
2324
let registry = Registry::new();
2425
let exporter = opentelemetry_prometheus::exporter()
2526
.with_aggregation_selector(aggregation)
2627
.without_scope_info()
27-
.without_counter_suffixes()
2828
.with_registry(registry.clone());
29-
let bound_addr = AddrIncoming::bind(&addr)?;
29+
let exporter = if !opts.counters_total_suffix {
30+
exporter.without_counter_suffixes()
31+
} else {
32+
exporter
33+
};
34+
let exporter = if !opts.unit_suffix {
35+
exporter.without_units()
36+
} else {
37+
exporter
38+
};
39+
let bound_addr = AddrIncoming::bind(&opts.socket_addr)?;
3040
Ok((
3141
Self {
3242
bound_addr,

tests/integ_tests/metrics_tests.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,6 @@ async fn query_of_closed_workflow_doesnt_tick_terminal_metric(
417417

418418
// Verify there is still only one tick
419419
let body = get_text(format!("http://{addr}/metrics")).await;
420-
dbg!(&body);
421420
let matching_line = body
422421
.lines()
423422
.find(|l| l.starts_with(metric_name))

0 commit comments

Comments
 (0)