From c32d6c99c9db06d158b2b09f3b866bf640b8269b Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 29 Sep 2025 15:13:53 -0700 Subject: [PATCH 01/23] plumb in memory metrics --- client/src/lib.rs | 23 ++ client/src/worker_registry/mod.rs | 35 +-- core-api/Cargo.toml | 1 + core-api/src/lib.rs | 9 + core-api/src/telemetry/metrics.rs | 405 ++++++++++++++++++++++--- core-api/src/worker.rs | 5 + core-c-bridge/src/metric.rs | 5 + core/src/abstractions.rs | 4 + core/src/core_tests/workers.rs | 4 +- core/src/pollers/poll_buffer.rs | 35 ++- core/src/telemetry/metrics.rs | 95 ++++-- core/src/telemetry/mod.rs | 9 + core/src/telemetry/otel.rs | 13 +- core/src/telemetry/prometheus_meter.rs | 63 +++- core/src/worker/activities.rs | 3 + core/src/worker/client.rs | 19 +- core/src/worker/client/mocks.rs | 4 +- core/src/worker/heartbeat.rs | 121 ++++++-- core/src/worker/mod.rs | 264 +++++++++++++--- core/src/worker/workflow/wft_poller.rs | 7 + sdk/Cargo.toml | 1 + tests/common/mod.rs | 8 +- tests/integ_tests/metrics_tests.rs | 100 +++++- tests/workflow_replay_bench.rs | 4 +- 24 files changed, 1074 insertions(+), 163 deletions(-) diff --git a/client/src/lib.rs b/client/src/lib.rs index 4d6088f87..4c824a75f 100644 --- a/client/src/lib.rs +++ b/client/src/lib.rs @@ -1227,6 +1227,14 @@ pub trait WorkflowClientTrait: NamespacedClient { query: String, ) -> Result; + /// List workers registered with server through worker heartbeats + async fn list_workers( + &self, + page_size: i32, + next_page_token: Vec, + query: String, + ) -> Result; + /// Get Cluster Search Attributes async fn get_search_attributes(&self) -> Result; @@ -1793,6 +1801,21 @@ where .into_inner()) } + async fn list_workers( + &self, + page_size: i32, + next_page_token: Vec, + query: String, + ) -> Result { + Ok(WorkflowService::list_workers(&mut self.clone(), + ListWorkersRequest { + namespace: self.namespace().to_owned(), + page_size, + next_page_token, + query, + }).await?.into_inner()) + } + async fn get_search_attributes(&self) -> Result { Ok(WorkflowService::get_search_attributes(&mut self.clone(), GetSearchAttributesRequest {}, diff --git a/client/src/worker_registry/mod.rs b/client/src/worker_registry/mod.rs index f10b128ce..83f5e270c 100644 --- a/client/src/worker_registry/mod.rs +++ b/client/src/worker_registry/mod.rs @@ -133,14 +133,8 @@ impl ClientWorkerSetImpl { if let Some(w) = self.shared_worker.get_mut(worker.namespace()) { let (callback, is_empty) = w.unregister_callback(worker.worker_instance_key()); - if let Some(cb) = callback { - if is_empty { - self.shared_worker.remove(worker.namespace()); - } - - // To maintain single ownership of the callback, we must re-register the callback - // back to the ClientWorker - worker.register_callback(cb); + if callback.is_some() && is_empty { + self.shared_worker.remove(worker.namespace()); } } @@ -212,14 +206,6 @@ impl ClientWorkerSet { .try_reserve_wft_slot(namespace, task_queue) } - /// Unregisters a local worker, typically when that worker starts shutdown. - pub fn unregister_worker( - &self, - worker_instance_key: Uuid, - ) -> Result, anyhow::Error> { - self.worker_manager.write().unregister(worker_instance_key) - } - /// Register a local worker that can provide WFT processing slots and potentially worker heartbeating. pub fn register_worker( &self, @@ -228,6 +214,14 @@ impl ClientWorkerSet { self.worker_manager.write().register(worker) } + /// Unregisters a local worker, typically when that worker starts shutdown. 
+ pub fn unregister_worker( + &self, + worker_instance_key: Uuid, + ) -> Result, anyhow::Error> { + self.worker_manager.write().unregister(worker_instance_key) + } + /// Returns the worker grouping key, which is unique for each worker. pub fn worker_grouping_key(&self) -> Uuid { self.worker_grouping_key @@ -256,7 +250,7 @@ impl std::fmt::Debug for ClientWorkerSet { } /// Contains a worker heartbeat callback, wrapped for mocking -pub type HeartbeatCallback = Box WorkerHeartbeat + Send + Sync>; +pub type HeartbeatCallback = Arc WorkerHeartbeat + Send + Sync>; /// Represents a complete worker that can handle both slot management /// and worker heartbeat functionality. @@ -289,9 +283,6 @@ pub trait ClientWorker: Send + Sync { fn new_shared_namespace_worker( &self, ) -> Result, anyhow::Error>; - - /// Registers a worker heartbeat callback, typically when a worker is unregistered from a client - fn register_callback(&self, callback: HeartbeatCallback); } #[cfg(test)] @@ -453,7 +444,7 @@ mod tests { if heartbeat_enabled { mock_provider .expect_heartbeat_callback() - .returning(|| Some(Box::new(WorkerHeartbeat::default))); + .returning(|| Some(Arc::new(WorkerHeartbeat::default))); let namespace_clone = namespace.clone(); mock_provider @@ -463,8 +454,6 @@ mod tests { namespace_clone.clone(), ))) }); - - mock_provider.expect_register_callback().returning(|_| {}); } mock_provider diff --git a/core-api/Cargo.toml b/core-api/Cargo.toml index ab2640dca..83042665a 100644 --- a/core-api/Cargo.toml +++ b/core-api/Cargo.toml @@ -31,6 +31,7 @@ tonic = { workspace = true } tracing = "0.1" tracing-core = "0.1" url = "2.5" +uuid = { version = "1.18.1", features = ["v4"] } [dependencies.temporal-sdk-core-protos] path = "../sdk-core-protos" diff --git a/core-api/src/lib.rs b/core-api/src/lib.rs index ca65ccae9..f4811f2ff 100644 --- a/core-api/src/lib.rs +++ b/core-api/src/lib.rs @@ -19,6 +19,7 @@ use temporal_sdk_core_protos::coresdk::{ workflow_activation::WorkflowActivation, workflow_completion::WorkflowActivationCompletion, }; +use uuid::Uuid; /// This trait is the primary way by which language specific SDKs interact with the core SDK. /// It represents one worker, which has a (potentially shared) client for connecting to the service @@ -138,6 +139,10 @@ pub trait Worker: Send + Sync { /// This should be called only after [Worker::shutdown] has resolved and/or both polling /// functions have returned `ShutDown` errors. async fn finalize_shutdown(self); + + /// Unique identifier for this worker instance. + /// This must be stable across the worker's lifetime but unique per instance. + fn worker_instance_key(&self) -> Uuid; } #[async_trait::async_trait] @@ -205,6 +210,10 @@ where async fn finalize_shutdown(self) { panic!("Can't finalize shutdown on Arc'd worker") } + + fn worker_instance_key(&self) -> Uuid { + (**self).worker_instance_key() + } } macro_rules! 
dbg_panic { diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 407603f8d..c1e4adb16 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -1,4 +1,5 @@ use crate::dbg_panic; +use std::sync::atomic::{AtomicU64, Ordering}; use std::{ any::Any, borrow::Cow, @@ -26,6 +27,18 @@ pub trait CoreMeter: Send + Sync + Debug { attribs: NewAttributes, ) -> MetricAttributes; fn counter(&self, params: MetricParameters) -> Counter; + + /// Create a counter with in-memory tracking for dual metrics reporting + fn counter_with_in_memory( + &self, + params: MetricParameters, + in_memory_counter: HeartbeatMetricType, + ) -> Counter { + let primary_counter = self.counter(params.clone()); + + Counter::new_with_in_memory(primary_counter.primary.metric.clone(), in_memory_counter) + } + fn histogram(&self, params: MetricParameters) -> Histogram; fn histogram_f64(&self, params: MetricParameters) -> HistogramF64; /// Create a histogram which records Durations. Implementations should choose to emit in @@ -33,8 +46,135 @@ pub trait CoreMeter: Send + Sync + Debug { /// [MetricParameters::unit] should be overwritten by implementations to be `ms` or `s` /// accordingly. fn histogram_duration(&self, params: MetricParameters) -> HistogramDuration; + + fn histogram_duration_with_in_memory( + &self, + params: MetricParameters, + in_memory_hist: HeartbeatMetricType, + ) -> HistogramDuration { + let primary_hist = self.histogram_duration(params.clone()); + + HistogramDuration::new_with_in_memory(primary_hist.primary.metric.clone(), in_memory_hist) + } fn gauge(&self, params: MetricParameters) -> Gauge; + + /// Create a gauge with in-memory tracking for dual metrics reporting + fn gauge_with_in_memory( + &self, + params: MetricParameters, + in_memory_metrics: HeartbeatMetricType, + ) -> Gauge { + let primary_gauge = self.gauge(params.clone()); + Gauge::new_with_in_memory(primary_gauge.primary.metric.clone(), in_memory_metrics) + } + fn gauge_f64(&self, params: MetricParameters) -> GaugeF64; + + fn in_memory_metrics(&self) -> Arc; +} + +#[derive(Clone, Debug)] +pub enum HeartbeatMetricType { + Regular(Arc), + WithLabel(HashMap>), +} + +fn label_value_from_attributes(attributes: &MetricAttributes, key: &str) -> Option { + match attributes { + MetricAttributes::Prometheus { labels } => labels.as_prom_labels().get(key).cloned(), + #[cfg(feature = "otel_impls")] + MetricAttributes::OTel { kvs } => kvs + .iter() + .find(|kv| kv.key.as_str() == key) + .map(|kv| kv.value.to_string()), + _ => None, + } +} + +#[derive(Default, Debug)] +pub struct NumPollersMetric { + pub wft_current_pollers: Arc, + pub sticky_wft_current_pollers: Arc, + pub activity_current_pollers: Arc, + pub nexus_current_pollers: Arc, +} + +impl NumPollersMetric { + pub fn as_map(&self) -> HashMap> { + let mut map = HashMap::new(); + map.insert( + "workflow_task".to_string(), + self.wft_current_pollers.clone(), + ); + map.insert( + "sticky_workflow_task".to_string(), + self.sticky_wft_current_pollers.clone(), + ); + map.insert( + "activity_task".to_string(), + self.activity_current_pollers.clone(), + ); + map.insert("nexus_task".to_string(), self.nexus_current_pollers.clone()); + map + } +} + +#[derive(Default, Debug)] +pub struct WorkerHeartbeatMetrics { + pub sticky_cache_size: Arc, + pub total_sticky_cache_hit: Arc, + pub total_sticky_cache_miss: Arc, + pub num_pollers: NumPollersMetric, + pub workflow_task_execution_failed: Arc, + pub activity_execution_failed: Arc, + pub 
nexus_task_execution_failed: Arc, + pub local_activity_execution_failed: Arc, + pub activity_execution_latency: Arc, + pub local_activity_execution_latency: Arc, + pub workflow_task_execution_latency: Arc, + pub nexus_task_execution_latency: Arc, +} + +impl WorkerHeartbeatMetrics { + pub fn get_metric(&self, name: &str) -> Option { + match name { + "sticky_cache_size" => { + Some(HeartbeatMetricType::Regular(self.sticky_cache_size.clone())) + } + "sticky_cache_hit" => Some(HeartbeatMetricType::Regular( + self.total_sticky_cache_hit.clone(), + )), + "sticky_cache_miss" => Some(HeartbeatMetricType::Regular( + self.total_sticky_cache_miss.clone(), + )), + "num_pollers" => Some(HeartbeatMetricType::WithLabel(self.num_pollers.as_map())), + "workflow_task_execution_failed" => Some(HeartbeatMetricType::Regular( + self.workflow_task_execution_failed.clone(), + )), + "activity_execution_failed" => Some(HeartbeatMetricType::Regular( + self.activity_execution_failed.clone(), + )), + "nexus_task_execution_failed" => Some(HeartbeatMetricType::Regular( + self.nexus_task_execution_failed.clone(), + )), + "local_activity_execution_failed" => Some(HeartbeatMetricType::Regular( + self.local_activity_execution_failed.clone(), + )), + "activity_execution_latency" => Some(HeartbeatMetricType::Regular( + self.activity_execution_latency.clone(), + )), + "local_activity_execution_latency" => Some(HeartbeatMetricType::Regular( + self.local_activity_execution_latency.clone(), + )), + "workflow_task_execution_latency" => Some(HeartbeatMetricType::Regular( + self.workflow_task_execution_latency.clone(), + )), + "nexus_task_execution_latency" => Some(HeartbeatMetricType::Regular( + self.nexus_task_execution_latency.clone(), + )), + _ => None, + } + } } #[derive(Debug, Clone, derive_builder::Builder)] @@ -108,6 +248,10 @@ impl CoreMeter for Arc { fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { self.as_ref().gauge_f64(params) } + + fn in_memory_metrics(&self) -> Arc { + self.as_ref().in_memory_metrics() + } } /// Attributes which are provided every time a call to record a specific metric is made. 
@@ -227,43 +371,93 @@ impl LazyBoundMetric { pub trait CounterBase: Send + Sync { fn adds(&self, value: u64); } -pub type Counter = LazyBoundMetric< + +pub type CounterType = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct Counter { + primary: CounterType, + in_memory: Option, +} impl Counter { pub fn new(inner: Arc> + Send + Sync>) -> Self { Self { - metric: inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, + } + } + + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), } } + pub fn add(&self, value: u64, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { - Ok(base) => { - base.adds(value); - } + match self.primary.metric.with_attributes(attributes) { + Ok(base) => base.adds(value), Err(e) => { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); + } + } + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.fetch_add(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("No in memory metric should use labels today"); + } } } } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); + } } impl CounterBase for Counter { fn adds(&self, value: u64) { // TODO: Replace all of these with below when stable // https://doc.rust-lang.org/std/sync/struct.OnceLock.html#method.get_or_try_init - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); Arc::new(NoOpInstrument) as Arc }) }); bound.adds(value); + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.fetch_add(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("No in memory metric should use labels today"); + } + } + } } } impl MetricAttributable for Counter { @@ -271,10 +465,15 @@ impl MetricAttributable for Counter { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(Counter { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -390,22 +589,45 @@ impl MetricAttributable for HistogramF64 { pub trait HistogramDurationBase: Send + Sync { fn records(&self, value: Duration); } -pub type HistogramDuration = LazyBoundMetric< + +pub type HistogramDurationType = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct HistogramDuration { + primary: HistogramDurationType, + in_memory: Option, +} impl HistogramDuration { pub fn new( inner: Arc> + Send + Sync>, ) -> Self { Self { - metric: 
inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, + } + } + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), } } pub fn record(&self, value: Duration, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { + match self.primary.metric.with_attributes(attributes) { Ok(base) => { base.records(value); } @@ -413,13 +635,29 @@ impl HistogramDuration { dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); } } + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.fetch_add(1, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("No in memory HistogramDuration should use labels today"); + } + } + } + } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); } } impl HistogramDurationBase for HistogramDuration { fn records(&self, value: Duration) { - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); @@ -427,6 +665,17 @@ impl HistogramDurationBase for HistogramDuration { }) }); bound.records(value); + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.fetch_add(1, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("No in memory HistogramDuration should use labels today"); + } + } + } } } impl MetricAttributable for HistogramDuration { @@ -434,10 +683,15 @@ impl MetricAttributable for HistogramDuration { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(HistogramDuration { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -445,41 +699,103 @@ impl MetricAttributable for HistogramDuration { pub trait GaugeBase: Send + Sync { fn records(&self, value: u64); } -pub type Gauge = LazyBoundMetric< + +pub type GaugeType = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct Gauge { + primary: GaugeType, + in_memory: Option, +} impl Gauge { pub fn new(inner: Arc> + Send + Sync>) -> Self { Self { - metric: inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, + } + } + + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), } } + pub fn record(&self, value: u64, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { - Ok(base) => { - 
base.records(value); - } + match self.primary.metric.with_attributes(attributes) { + Ok(base) => base.records(value), Err(e) => { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); } } + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.store(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(metrics) => { + if let Some(label_value) = + label_value_from_attributes(attributes, "poller_type") + { + if let Some(metric) = metrics.get(&label_value) { + metric.store(value, Ordering::Relaxed); + } + } + } + } + } + } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); } } impl GaugeBase for Gauge { fn records(&self, value: u64) { - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); Arc::new(NoOpInstrument) as Arc }) }); bound.records(value); + + if let Some(ref in_mem) = self.in_memory { + match in_mem { + HeartbeatMetricType::Regular(metric) => { + metric.store(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(metrics) => { + if let Some(label_value) = + label_value_from_attributes(&self.primary.attributes, "poller_type") + { + if let Some(metric) = metrics.get(&label_value) { + metric.store(value, Ordering::Relaxed); + } + } + } + } + } } } impl MetricAttributable for Gauge { @@ -487,10 +803,15 @@ impl MetricAttributable for Gauge { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(Gauge { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -664,6 +985,10 @@ impl CoreMeter for NoOpCoreMeter { fn gauge_f64(&self, _: MetricParameters) -> GaugeF64 { GaugeF64::new(Arc::new(NoOpInstrument)) } + + fn in_memory_metrics(&self) -> Arc { + Arc::new(WorkerHeartbeatMetrics::default()) + } } macro_rules! impl_metric_attributable { diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index d92efeec0..507a0dece 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -11,6 +11,7 @@ use temporal_sdk_core_protos::{ coresdk::{ActivitySlotInfo, LocalActivitySlotInfo, NexusSlotInfo, WorkflowSlotInfo}, temporal, temporal::api::enums::v1::VersioningBehavior, + temporal::api::worker::v1::PluginInfo, }; /// Defines per-worker configuration options @@ -161,6 +162,10 @@ pub struct WorkerConfig { /// A versioning strategy for this worker. 
pub versioning_strategy: WorkerVersioningStrategy, + + /// List of plugins used by lang + #[builder(default)] + pub plugins: Vec, } impl WorkerConfig { diff --git a/core-c-bridge/src/metric.rs b/core-c-bridge/src/metric.rs index 92e46b423..0889dee64 100644 --- a/core-c-bridge/src/metric.rs +++ b/core-c-bridge/src/metric.rs @@ -1,6 +1,7 @@ use crate::{ByteArrayRef, runtime::Runtime}; use std::{any::Any, error::Error, sync::Arc, time::Duration}; use temporal_sdk_core_api::telemetry::metrics; +use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; pub struct MetricMeter { core: metrics::TemporalMeter, @@ -365,6 +366,10 @@ impl metrics::CoreMeter for CustomMetricMeterRef { fn gauge_f64(&self, params: metrics::MetricParameters) -> metrics::GaugeF64 { metrics::GaugeF64::new(Arc::new(self.new_metric(params, MetricKind::GaugeFloat))) } + + fn in_memory_metrics(&self) -> Arc { + todo!() + } } impl CustomMetricMeterRef { diff --git a/core/src/abstractions.rs b/core/src/abstractions.rs index d4b86cb35..bffa1eb9a 100644 --- a/core/src/abstractions.rs +++ b/core/src/abstractions.rs @@ -160,6 +160,10 @@ where }), } } + + pub(crate) fn max_permits(&self) -> Option { + self.max_permits + } } impl MeteredPermitDealer { diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index f5288a442..68314c082 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -321,12 +321,12 @@ async fn worker_shutdown_api(#[case] use_cache: bool, #[case] api_success: bool) if api_success { mock.expect_shutdown_worker() .times(1) - .returning(|_| Ok(ShutdownWorkerResponse {})); + .returning(|_, _| Ok(ShutdownWorkerResponse {})); } else { // worker.shutdown() should succeed even if shutdown_worker fails mock.expect_shutdown_worker() .times(1) - .returning(|_| Err(tonic::Status::unavailable("fake shutdown error"))); + .returning(|_, _| Err(tonic::Status::unavailable("fake shutdown error"))); } } else { mock.expect_shutdown_worker().times(0); diff --git a/core/src/pollers/poll_buffer.rs b/core/src/pollers/poll_buffer.rs index 7bc4311fb..c30bf7120 100644 --- a/core/src/pollers/poll_buffer.rs +++ b/core/src/pollers/poll_buffer.rs @@ -8,6 +8,7 @@ use crate::{ }; use futures_util::{FutureExt, StreamExt, future::BoxFuture}; use governor::{Quota, RateLimiter}; +use std::time::SystemTime; use std::{ cmp, fmt::Debug, @@ -74,9 +75,15 @@ impl LongPollBuffer { shutdown: CancellationToken, num_pollers_handler: Option, options: WorkflowTaskOptions, + last_successful_poll_time: Arc>>, ) -> Self { let is_sticky = sticky_queue.is_some(); - let poll_scaler = PollScaler::new(poller_behavior, num_pollers_handler, shutdown.clone()); + let poll_scaler = PollScaler::new( + poller_behavior, + num_pollers_handler, + shutdown.clone(), + last_successful_poll_time, + ); if let Some(wftps) = options.wft_poller_shared.as_ref() { if is_sticky { wftps.set_sticky_active(poll_scaler.active_rx.clone()); @@ -136,6 +143,7 @@ impl LongPollBuffer { } impl LongPollBuffer { + #[allow(clippy::too_many_arguments)] pub(crate) fn new_activity_task( client: Arc, task_queue: String, @@ -144,6 +152,7 @@ impl LongPollBuffer { shutdown: CancellationToken, num_pollers_handler: Option, options: ActivityTaskOptions, + last_successful_poll_time: Arc>>, ) -> Self { let pre_permit_delay = options .max_worker_acts_per_second @@ -183,7 +192,12 @@ impl LongPollBuffer { } }; - let poll_scaler = PollScaler::new(poller_behavior, num_pollers_handler, shutdown.clone()); + let poll_scaler = PollScaler::new( + poller_behavior, + 
num_pollers_handler, + shutdown.clone(), + last_successful_poll_time, + ); Self::new( poll_fn, permit_dealer, @@ -196,6 +210,7 @@ impl LongPollBuffer { } impl LongPollBuffer { + #[allow(clippy::too_many_arguments)] pub(crate) fn new_nexus_task( client: Arc, task_queue: String, @@ -203,6 +218,7 @@ impl LongPollBuffer { permit_dealer: MeteredPermitDealer, shutdown: CancellationToken, num_pollers_handler: Option, + last_successful_poll_time: Arc>>, send_heartbeat: bool, ) -> Self { let no_retry = if matches!(poller_behavior, PollerBehavior::Autoscaling { .. }) { @@ -232,7 +248,12 @@ impl LongPollBuffer { poll_fn, permit_dealer, shutdown.clone(), - PollScaler::new(poller_behavior, num_pollers_handler, shutdown), + PollScaler::new( + poller_behavior, + num_pollers_handler, + shutdown, + last_successful_poll_time, + ), None:: BoxFuture<'static, ()>>, None::, ) @@ -417,6 +438,7 @@ where behavior: PollerBehavior, num_pollers_handler: Option, shutdown: CancellationToken, + last_successful_poll_time: Arc>>, ) -> Self { let (active_tx, active_rx) = watch::channel(0); let num_pollers_handler = num_pollers_handler.map(Arc::new); @@ -437,6 +459,7 @@ where ingested_this_period: Default::default(), ingested_last_period: Default::default(), scale_up_allowed: AtomicBool::new(true), + last_successful_poll_time, }); let rhc = report_handle.clone(); let ingestor_task = if behavior.is_autoscaling() { @@ -499,6 +522,7 @@ struct PollScalerReportHandle { ingested_this_period: AtomicUsize, ingested_last_period: AtomicUsize, scale_up_allowed: AtomicBool, + last_successful_poll_time: Arc>>, } impl PollScalerReportHandle { @@ -506,6 +530,9 @@ impl PollScalerReportHandle { fn poll_result(&self, res: &Result) -> bool { match res { Ok(res) => { + self.last_successful_poll_time + .lock() + .replace(SystemTime::now()); if let PollerBehavior::SimpleMaximum(_) = self.behavior { // We don't do auto-scaling with the simple max return true; @@ -739,6 +766,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))), }, + Arc::new(parking_lot::Mutex::new(None)), ); // Poll a bunch of times, "interrupting" it each time, we should only actually have polled @@ -794,6 +822,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))), }, + Arc::new(parking_lot::Mutex::new(None)), ); // Should not see error, unwraps should get empty response diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index fe6aec8e1..c955173b5 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -13,7 +13,7 @@ use temporal_sdk_core_api::telemetry::metrics::{ GaugeF64, GaugeF64Base, Histogram, HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, HistogramF64Base, LazyBufferInstrument, MetricAttributable, MetricAttributes, MetricCallBufferer, MetricEvent, MetricKeyValue, MetricKind, MetricParameters, MetricUpdateVal, - NewAttributes, NoOpCoreMeter, TemporalMeter, + NewAttributes, NoOpCoreMeter, TemporalMeter, WorkerHeartbeatMetrics, }; use temporal_sdk_core_protos::temporal::api::{ enums::v1::WorkflowTaskFailedCause, failure::v1::Failure, @@ -64,13 +64,14 @@ struct Instruments { sticky_cache_miss: Counter, sticky_cache_size: Gauge, sticky_cache_forced_evictions: Counter, + last_successful_poll_time: HistogramDuration, } impl MetricsContext { pub(crate) fn no_op() -> Self { let meter = Arc::new(NoOpCoreMeter); let kvs = meter.new_attributes(Default::default()); - let instruments = 
Arc::new(Instruments::new(meter.as_ref())); + let instruments = Arc::new(Instruments::new(meter.as_ref(), None)); Self { kvs, instruments, @@ -80,13 +81,19 @@ impl MetricsContext { #[cfg(test)] pub(crate) fn top_level(namespace: String, tq: String, telemetry: &TelemetryInstance) -> Self { - MetricsContext::top_level_with_meter(namespace, tq, telemetry.get_temporal_metric_meter()) + MetricsContext::top_level_with_meter( + namespace, + tq, + telemetry.get_temporal_metric_meter(), + telemetry.in_memory_metrics(), + ) } pub(crate) fn top_level_with_meter( namespace: String, tq: String, temporal_meter: Option, + in_memory_meter: Option>, ) -> Self { if let Some(mut meter) = temporal_meter { meter @@ -95,7 +102,7 @@ impl MetricsContext { .push(MetricKeyValue::new(KEY_NAMESPACE, namespace)); meter.default_attribs.attributes.push(task_queue(tq)); let kvs = meter.inner.new_attributes(meter.default_attribs); - let mut instruments = Instruments::new(meter.inner.as_ref()); + let mut instruments = Instruments::new(meter.inner.as_ref(), in_memory_meter); instruments.update_attributes(&kvs); Self { kvs, @@ -299,7 +306,37 @@ impl MetricsContext { } impl Instruments { - fn new(meter: &dyn CoreMeter) -> Self { + fn new(meter: &dyn CoreMeter, in_memory: Option>) -> Self { + let create_counter = |params: MetricParameters| -> Counter { + if let Some(in_mem) = in_memory.clone() + && let Some(metric) = in_mem.get_metric(¶ms.name) + { + meter.counter_with_in_memory(params, metric) + } else { + meter.counter(params) + } + }; + + let create_gauge = |params: MetricParameters| -> Gauge { + if let Some(in_mem) = in_memory.clone() + && let Some(metric) = in_mem.get_metric(¶ms.name) + { + meter.gauge_with_in_memory(params, metric) + } else { + meter.gauge(params) + } + }; + + let create_histogram_duration = |params: MetricParameters| -> HistogramDuration { + if let Some(in_mem) = in_memory.clone() + && let Some(metric) = in_mem.get_metric(¶ms.name) + { + meter.histogram_duration_with_in_memory(params, metric) + } else { + meter.histogram_duration(params) + } + }; + Self { wf_completed_counter: meter.counter(MetricParameters { name: "workflow_completed".into(), @@ -331,12 +368,12 @@ impl Instruments { description: "Count of workflow task queue poll timeouts (no new task)".into(), unit: "".into(), }), - wf_task_queue_poll_succeed_counter: meter.counter(MetricParameters { + wf_task_queue_poll_succeed_counter: create_counter(MetricParameters { name: "workflow_task_queue_poll_succeed".into(), description: "Count of workflow task queue poll successes".into(), unit: "".into(), }), - wf_task_execution_failure_counter: meter.counter(MetricParameters { + wf_task_execution_failure_counter: create_counter(MetricParameters { name: "workflow_task_execution_failed".into(), description: "Count of workflow task execution failures".into(), unit: "".into(), @@ -351,7 +388,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of workflow task replay latencies".into(), }), - wf_task_execution_latency: meter.histogram_duration(MetricParameters { + wf_task_execution_latency: create_histogram_duration(MetricParameters { name: WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of workflow task execution (not replay) latencies".into(), @@ -361,12 +398,12 @@ impl Instruments { description: "Count of activity task queue poll timeouts (no new task)".into(), unit: "".into(), }), - act_task_received_counter: meter.counter(MetricParameters { + act_task_received_counter: 
create_counter(MetricParameters { name: "activity_task_received".into(), description: "Count of activity task queue poll successes".into(), unit: "".into(), }), - act_execution_failed: meter.counter(MetricParameters { + act_execution_failed: create_counter(MetricParameters { name: "activity_execution_failed".into(), description: "Count of activity task execution failures".into(), unit: "".into(), @@ -376,7 +413,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of activity schedule-to-start latencies".into(), }), - act_exec_latency: meter.histogram_duration(MetricParameters { + act_exec_latency: create_histogram_duration(MetricParameters { name: ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of activity execution latencies".into(), @@ -397,7 +434,7 @@ impl Instruments { description: "Count of local activity executions that failed".into(), unit: "".into(), }), - la_exec_latency: meter.histogram_duration(MetricParameters { + la_exec_latency: create_histogram_duration(MetricParameters { name: "local_activity_execution_latency".into(), unit: "duration".into(), description: "Histogram of local activity execution latencies".into(), @@ -409,7 +446,7 @@ impl Instruments { "Histogram of local activity execution latencies for successful local activities" .into(), }), - la_total: meter.counter(MetricParameters { + la_total: create_counter(MetricParameters { name: "local_activity_total".into(), description: "Count of local activities executed".into(), unit: "".into(), @@ -429,12 +466,12 @@ impl Instruments { unit: "duration".into(), description: "Histogram of nexus task end-to-end latencies".into(), }), - nexus_task_execution_latency: meter.histogram_duration(MetricParameters { + nexus_task_execution_latency: create_histogram_duration(MetricParameters { name: "nexus_task_execution_latency".into(), unit: "duration".into(), description: "Histogram of nexus task execution latencies".into(), }), - nexus_task_execution_failed: meter.counter(MetricParameters { + nexus_task_execution_failed: create_counter(MetricParameters { name: "nexus_task_execution_failed".into(), description: "Count of nexus task execution failures".into(), unit: "".into(), @@ -445,7 +482,7 @@ impl Instruments { description: "Count of the number of initialized workers".into(), unit: "".into(), }), - num_pollers: meter.gauge(MetricParameters { + num_pollers: create_gauge(MetricParameters { name: NUM_POLLERS_NAME.into(), description: "Current number of active pollers per queue type".into(), unit: "".into(), @@ -460,20 +497,19 @@ impl Instruments { description: "Current number of used slots per task type".into(), unit: "".into(), }), - sticky_cache_hit: meter.counter(MetricParameters { + sticky_cache_hit: create_counter(MetricParameters { name: "sticky_cache_hit".into(), description: "Count of times the workflow cache was used for a new workflow task" .into(), unit: "".into(), }), - sticky_cache_miss: meter.counter(MetricParameters { + sticky_cache_miss: create_counter(MetricParameters { name: "sticky_cache_miss".into(), description: - "Count of times the workflow cache was missing a workflow for a sticky task" - .into(), + "Count of times the workflow cache was missing a workflow for a sticky task".into(), unit: "".into(), }), - sticky_cache_size: meter.gauge(MetricParameters { + sticky_cache_size: create_gauge(MetricParameters { name: STICKY_CACHE_SIZE_NAME.into(), description: "Current number of cached workflows".into(), unit: "".into(), @@ -483,6 +519,11 @@ impl 
Instruments { description: "Count of evictions of cached workflows".into(), unit: "".into(), }), + last_successful_poll_time: meter.histogram_duration(MetricParameters { + name: "last_successful_poll_time".into(), + unit: "duration".into(), + description: "Timestamp of the last successful poll time".into(), + }), } } @@ -555,6 +596,8 @@ impl Instruments { .update_attributes(new_attributes.clone()); self.sticky_cache_forced_evictions .update_attributes(new_attributes.clone()); + self.last_successful_poll_time + .update_attributes(new_attributes.clone()); } } @@ -651,7 +694,7 @@ pub const ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME: &str = "activity_execution_laten pub(super) const NUM_POLLERS_NAME: &str = "num_pollers"; pub(super) const TASK_SLOTS_AVAILABLE_NAME: &str = "worker_task_slots_available"; pub(super) const TASK_SLOTS_USED_NAME: &str = "worker_task_slots_used"; -pub(super) const STICKY_CACHE_SIZE_NAME: &str = "sticky_cache_size"; +pub(crate) const STICKY_CACHE_SIZE_NAME: &str = "sticky_cache_size"; /// Track a failure metric if the failure is not a benign application failure. pub(crate) fn should_record_failure_metric(failure: &Option) -> bool { @@ -841,6 +884,10 @@ where fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { GaugeF64::new(Arc::new(self.new_instrument(params, MetricKind::Gauge))) } + + fn in_memory_metrics(&self) -> Arc { + todo!() + } } impl MetricCallBufferer for MetricsCallBuffer where @@ -1070,6 +1117,10 @@ impl CoreMeter for PrefixedMetricsMeter { params.name = (self.prefix.clone() + &*params.name).into(); self.meter.gauge_f64(params) } + + fn in_memory_metrics(&self) -> Arc { + self.meter.in_memory_metrics() + } } #[cfg(test)] diff --git a/core/src/telemetry/mod.rs b/core/src/telemetry/mod.rs index 4f4536938..c0a16ea3d 100644 --- a/core/src/telemetry/mod.rs +++ b/core/src/telemetry/mod.rs @@ -39,6 +39,7 @@ use std::{ atomic::{AtomicBool, Ordering}, }, }; +pub(crate) use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; use temporal_sdk_core_api::telemetry::{ CoreLog, CoreTelemetry, Logger, TelemetryOptions, TelemetryOptionsBuilder, metrics::{CoreMeter, MetricKeyValue, NewAttributes, TemporalMeter}, @@ -67,6 +68,7 @@ pub struct TelemetryInstance { /// the user has not opted into any tracing configuration. trace_subscriber: Option>, attach_service_name: bool, + in_memory_metrics: Option>, // TODO: Should this even be option? } impl TelemetryInstance { @@ -83,6 +85,7 @@ impl TelemetryInstance { metrics, trace_subscriber, attach_service_name, + in_memory_metrics: None, } } @@ -96,6 +99,7 @@ impl TelemetryInstance { /// Some metric meters cannot be initialized until after a tokio runtime has started and after /// other telemetry has initted (ex: prometheus). They can be attached here. pub fn attach_late_init_metrics(&mut self, meter: Arc) { + self.in_memory_metrics = Some(meter.in_memory_metrics().clone()); self.metrics = Some(meter); } @@ -130,6 +134,11 @@ impl TelemetryInstance { vec![] } } + + /// Returns all in memory metrics, used for worker heartbeating. + pub fn in_memory_metrics(&self) -> Option> { + self.in_memory_metrics.clone() + } } thread_local! 
{ diff --git a/core/src/telemetry/otel.rs b/core/src/telemetry/otel.rs index 410e63a83..6ffbad1c0 100644 --- a/core/src/telemetry/otel.rs +++ b/core/src/telemetry/otel.rs @@ -1,5 +1,5 @@ use super::{ - TELEM_SERVICE_NAME, default_buckets_for, + TELEM_SERVICE_NAME, WorkerHeartbeatMetrics, default_buckets_for, metrics::{ ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME, ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME, DEFAULT_MS_BUCKETS, WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME, @@ -153,13 +153,15 @@ pub fn build_otlp_metric_exporter( MeterProviderBuilder::default().with_reader(reader), &opts.global_tags, opts.use_seconds_for_durations, - opts.histogram_bucket_overrides, + opts.histogram_bucket_overrides.clone(), )? .build(); + Ok::<_, anyhow::Error>(CoreOtelMeter { meter: mp.meter(TELEM_SERVICE_NAME), use_seconds_for_durations: opts.use_seconds_for_durations, _mp: mp, + in_memory_metrics: Arc::new(WorkerHeartbeatMetrics::default()), }) } @@ -170,6 +172,7 @@ pub struct CoreOtelMeter { // we have to hold on to the provider otherwise otel automatically shuts it down on drop // for whatever crazy reason _mp: SdkMeterProvider, + pub in_memory_metrics: Arc, } impl CoreMeter for CoreOtelMeter { @@ -240,6 +243,10 @@ impl CoreMeter for CoreOtelMeter { .build(), )) } + + fn in_memory_metrics(&self) -> Arc { + self.in_memory_metrics.clone() + } } impl CoreOtelMeter { @@ -263,7 +270,7 @@ impl CoreOtelMeter { } } -enum DurationHistogram { +pub(crate) enum DurationHistogram { Milliseconds(opentelemetry::metrics::Histogram), Seconds(opentelemetry::metrics::Histogram), } diff --git a/core/src/telemetry/prometheus_meter.rs b/core/src/telemetry/prometheus_meter.rs index 37810a534..859eecc8f 100644 --- a/core/src/telemetry/prometheus_meter.rs +++ b/core/src/telemetry/prometheus_meter.rs @@ -13,9 +13,10 @@ use std::{ time::Duration, }; use temporal_sdk_core_api::telemetry::metrics::{ - CoreMeter, Counter, CounterBase, Gauge, GaugeBase, GaugeF64, GaugeF64Base, Histogram, - HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, HistogramF64Base, - MetricAttributable, MetricAttributes, MetricParameters, NewAttributes, OrderedPromLabelSet, + CoreMeter, Counter, CounterBase, Gauge, GaugeBase, GaugeF64, GaugeF64Base, HeartbeatMetricType, + Histogram, HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, + HistogramF64Base, MetricAttributable, MetricAttributes, MetricParameters, NewAttributes, + OrderedPromLabelSet, WorkerHeartbeatMetrics, }; #[derive(derive_more::From, derive_more::TryInto, Debug, Clone)] @@ -460,6 +461,7 @@ pub struct CorePrometheusMeter { use_seconds_for_durations: bool, unit_suffix: bool, bucket_overrides: temporal_sdk_core_api::telemetry::HistogramBucketOverrides, + pub in_memory_metrics: Arc, } impl CorePrometheusMeter { @@ -474,6 +476,7 @@ impl CorePrometheusMeter { use_seconds_for_durations, unit_suffix, bucket_overrides, + in_memory_metrics: Arc::new(WorkerHeartbeatMetrics::default()), } } @@ -540,6 +543,20 @@ impl CoreMeter for CorePrometheusMeter { ))) } + fn counter_with_in_memory( + &self, + params: MetricParameters, + in_memory_counter: HeartbeatMetricType, + ) -> Counter { + let metric_name = params.name.to_string(); + let counter = Arc::new(PromMetric::::new( + metric_name, + params.description.to_string(), + self.registry.clone(), + )); + Counter::new_with_in_memory(counter, in_memory_counter) + } + fn histogram(&self, params: MetricParameters) -> Histogram { let hist = self.create_u64_hist(¶ms); Histogram::new(Arc::new(hist)) @@ -560,6 +577,28 @@ impl CoreMeter for 
CorePrometheusMeter { })) } + fn histogram_duration_with_in_memory( + &self, + mut params: MetricParameters, + in_memory_hist: HeartbeatMetricType, + ) -> HistogramDuration { + if self.use_seconds_for_durations { + params.unit = "seconds".into(); + HistogramDuration::new_with_in_memory( + Arc::new(DurationHistogram::Seconds(self.create_f64_hist(¶ms))), + in_memory_hist, + ) + } else { + params.unit = "milliseconds".into(); + HistogramDuration::new_with_in_memory( + Arc::new(DurationHistogram::Milliseconds( + self.create_u64_hist(¶ms), + )), + in_memory_hist, + ) + } + } + fn gauge(&self, params: MetricParameters) -> Gauge { let metric_name = params.name.to_string(); Gauge::new(Arc::new(PromMetric::::new( @@ -569,6 +608,20 @@ impl CoreMeter for CorePrometheusMeter { ))) } + fn gauge_with_in_memory( + &self, + params: MetricParameters, + in_memory_metrics: HeartbeatMetricType, + ) -> Gauge { + let metric_name = params.name.to_string(); + let gauge = Arc::new(PromMetric::::new( + metric_name, + params.description.to_string(), + self.registry.clone(), + )); + Gauge::new_with_in_memory(gauge, in_memory_metrics) + } + fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { let metric_name = params.name.to_string(); GaugeF64::new(Arc::new(PromMetric::::new( @@ -577,6 +630,10 @@ impl CoreMeter for CorePrometheusMeter { self.registry.clone(), ))) } + + fn in_memory_metrics(&self) -> Arc { + self.in_memory_metrics.clone() + } } impl CorePrometheusMeter { diff --git a/core/src/worker/activities.rs b/core/src/worker/activities.rs index 505d5c840..2ded441f6 100644 --- a/core/src/worker/activities.rs +++ b/core/src/worker/activities.rs @@ -773,6 +773,7 @@ mod tests { max_worker_acts_per_second: Some(2.0), max_tps: None, }, + Arc::new(parking_lot::Mutex::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -864,6 +865,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, + Arc::new(parking_lot::Mutex::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -937,6 +939,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, + Arc::new(parking_lot::Mutex::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 5d773330e..d1712fe27 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -9,6 +9,7 @@ use temporal_client::{ RetryClient, WorkflowService, }; use temporal_sdk_core_api::worker::WorkerVersioningStrategy; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; use temporal_sdk_core_protos::{ TaskToken, coresdk::{workflow_commands::QueryResult, workflow_completion}, @@ -211,7 +212,11 @@ pub trait WorkerClient: Sync + Send { /// Describe the namespace async fn describe_namespace(&self) -> Result; /// Shutdown the worker - async fn shutdown_worker(&self, sticky_task_queue: String) -> Result; + async fn shutdown_worker( + &self, + sticky_task_queue: String, + worker_heartbeat: Option, + ) -> Result; /// Record a worker heartbeat async fn record_worker_heartbeat( &self, @@ -640,13 +645,21 @@ impl WorkerClient for WorkerClientBag { .into_inner()) } - async fn shutdown_worker(&self, sticky_task_queue: String) -> Result { + async fn shutdown_worker( + &self, + sticky_task_queue: String, + worker_heartbeat: Option, + ) -> Result { + let mut worker_heartbeat = worker_heartbeat; + if let Some(w) = worker_heartbeat.as_mut() { + w.status = WorkerStatus::Shutdown.into(); + } let request = ShutdownWorkerRequest { namespace: 
self.namespace.clone(), identity: self.identity.clone(), sticky_task_queue, reason: "graceful shutdown".to_string(), - worker_heartbeat: None, + worker_heartbeat, }; Ok( diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index 93984c364..3fb7fed9e 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -30,7 +30,7 @@ pub fn mock_worker_client() -> MockWorkerClient { .returning(|| DEFAULT_WORKERS_REGISTRY.clone()); r.expect_is_mock().returning(|| true); r.expect_shutdown_worker() - .returning(|_| Ok(ShutdownWorkerResponse {})); + .returning(|_, _| Ok(ShutdownWorkerResponse {})); r.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); r.expect_identity() @@ -148,7 +148,7 @@ mockall::mock! { impl Future> + Send + 'b where 'a: 'b, Self: 'b; - fn shutdown_worker<'a, 'b>(&self, sticky_task_queue: String) -> impl Future> + Send + 'b + fn shutdown_worker<'a, 'b>(&self, sticky_task_queue: String, worker_heartbeat: Option) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; fn record_worker_heartbeat<'a, 'b>( diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index 88774647f..c58e34f38 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -3,10 +3,8 @@ use crate::worker::{TaskPollers, WorkerTelemetry}; use parking_lot::Mutex; use prost_types::Duration as PbDuration; use std::collections::HashMap; -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use temporal_client::SharedNamespaceWorkerTrait; use temporal_sdk_core_api::worker::{ PollerBehavior, WorkerConfigBuilder, WorkerVersioningStrategy, @@ -17,7 +15,7 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; /// Callback used to collect heartbeat data from each worker at the time of heartbeat -pub(crate) type HeartbeatFn = Box WorkerHeartbeat + Send + Sync>; +pub(crate) type HeartbeatFn = Arc WorkerHeartbeat + Send + Sync>; /// SharedNamespaceWorker is responsible for polling nexus-delivered worker commands and sending /// worker heartbeats to the server. This invokes callbacks on all workers in the same process that @@ -59,8 +57,6 @@ impl SharedNamespaceWorker { true, )?; - let last_heartbeat_time_map = Mutex::new(HashMap::new()); - let reset_notify = Arc::new(Notify::new()); let cancel = CancellationToken::new(); let cancel_clone = cancel.clone(); @@ -73,15 +69,17 @@ impl SharedNamespaceWorker { tokio::spawn(async move { let mut ticker = tokio::time::interval(heartbeat_interval); + let mut last_heartbeat_time = HashMap::new(); + let mut last_processed_tasks = HashMap::new(); loop { tokio::select! 
{ _ = ticker.tick() => { let mut hb_to_send = Vec::new(); for (instance_key, heartbeat_callback) in heartbeat_map_clone.lock().iter() { let mut heartbeat = heartbeat_callback(); - let mut last_heartbeat_time_map = last_heartbeat_time_map.lock(); + let heartbeat_time = last_heartbeat_time.get(instance_key).cloned(); let now = SystemTime::now(); - let elapsed_since_last_heartbeat = last_heartbeat_time_map.get(instance_key).cloned().map( + let elapsed_since_last_heartbeat = heartbeat_time.map( |hb_time| { let dur = now.duration_since(hb_time).unwrap_or(Duration::ZERO); PbDuration { @@ -94,6 +92,8 @@ impl SharedNamespaceWorker { heartbeat.elapsed_since_last_heartbeat = elapsed_since_last_heartbeat; heartbeat.heartbeat_time = Some(now.into()); + process_slot_info(*instance_key, &mut heartbeat, &mut last_processed_tasks); + // All of these heartbeat details rely on a client. To avoid circular // dependencies, this must be populated from within SharedNamespaceWorker // to get info from the current client @@ -104,7 +104,7 @@ impl SharedNamespaceWorker { hb_to_send.push(heartbeat); - last_heartbeat_time_map.insert(*instance_key, now); + last_heartbeat_time.insert(*instance_key, now); } if let Err(e) = client_clone.record_worker_heartbeat(namespace_clone.clone(), hb_to_send).await { if matches!(e.code(), tonic::Code::Unimplemented) { @@ -137,19 +137,12 @@ impl SharedNamespaceWorkerTrait for SharedNamespaceWorker { self.namespace.clone() } - fn register_callback( - &self, - worker_instance_key: Uuid, - heartbeat_callback: Box WorkerHeartbeat + Send + Sync>, - ) { + fn register_callback(&self, worker_instance_key: Uuid, heartbeat_callback: HeartbeatFn) { self.heartbeat_map .lock() .insert(worker_instance_key, heartbeat_callback); } - fn unregister_callback( - &self, - worker_instance_key: Uuid, - ) -> (Option WorkerHeartbeat + Send + Sync>>, bool) { + fn unregister_callback(&self, worker_instance_key: Uuid) -> (Option, bool) { let mut heartbeat_map = self.heartbeat_map.lock(); let heartbeat_callback = heartbeat_map.remove(&worker_instance_key); if heartbeat_map.is_empty() { @@ -163,6 +156,96 @@ impl SharedNamespaceWorkerTrait for SharedNamespaceWorker { } } +#[derive(Default)] +struct SlotsInfo { + last_interval_processed_tasks: i32, + last_interval_failure_tasks: i32, +} + +#[derive(Default)] +struct HeartbeatSlotsInfo { + workflow_task_slots_info: SlotsInfo, + activity_task_slots_info: SlotsInfo, + nexus_task_slots_info: SlotsInfo, + local_activity_slots_info: SlotsInfo, +} + +fn process_slot_info( + worker_instance_key: Uuid, + heartbeat: &mut WorkerHeartbeat, + slots_map: &mut HashMap, +) { + let slots_info = slots_map.entry(worker_instance_key).or_default(); + if let Some(wft_slot_info) = heartbeat.workflow_task_slots_info.as_mut() { + wft_slot_info.last_interval_processed_tasks = wft_slot_info.total_processed_tasks + - slots_info + .workflow_task_slots_info + .last_interval_processed_tasks; + wft_slot_info.last_interval_failure_tasks = wft_slot_info.total_failed_tasks + - slots_info + .workflow_task_slots_info + .last_interval_failure_tasks; + + slots_info + .workflow_task_slots_info + .last_interval_processed_tasks = wft_slot_info.total_processed_tasks; + slots_info + .workflow_task_slots_info + .last_interval_failure_tasks = wft_slot_info.total_failed_tasks; + } + + if let Some(act_slot_info) = heartbeat.activity_task_slots_info.as_mut() { + act_slot_info.last_interval_processed_tasks = act_slot_info.total_processed_tasks + - slots_info + .activity_task_slots_info + 
.last_interval_processed_tasks; + act_slot_info.last_interval_failure_tasks = act_slot_info.total_failed_tasks + - slots_info + .activity_task_slots_info + .last_interval_failure_tasks; + + slots_info + .activity_task_slots_info + .last_interval_processed_tasks = act_slot_info.total_processed_tasks; + slots_info + .activity_task_slots_info + .last_interval_failure_tasks = act_slot_info.total_failed_tasks; + } + + if let Some(nexus_slot_info) = heartbeat.nexus_task_slots_info.as_mut() { + nexus_slot_info.last_interval_processed_tasks = nexus_slot_info.total_processed_tasks + - slots_info + .nexus_task_slots_info + .last_interval_processed_tasks; + nexus_slot_info.last_interval_failure_tasks = nexus_slot_info.total_failed_tasks + - slots_info.nexus_task_slots_info.last_interval_failure_tasks; + + slots_info + .nexus_task_slots_info + .last_interval_processed_tasks = nexus_slot_info.total_processed_tasks; + slots_info.nexus_task_slots_info.last_interval_failure_tasks = + nexus_slot_info.total_failed_tasks; + } + + if let Some(la_slot_info) = heartbeat.local_activity_slots_info.as_mut() { + la_slot_info.last_interval_processed_tasks = la_slot_info.total_processed_tasks + - slots_info + .local_activity_slots_info + .last_interval_processed_tasks; + la_slot_info.last_interval_failure_tasks = la_slot_info.total_failed_tasks + - slots_info + .local_activity_slots_info + .last_interval_failure_tasks; + + slots_info + .local_activity_slots_info + .last_interval_processed_tasks = la_slot_info.total_processed_tasks; + slots_info + .local_activity_slots_info + .last_interval_failure_tasks = la_slot_info.total_failed_tasks; + } +} + #[cfg(test)] mod tests { use crate::{ diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 5428c067a..3e92a00e1 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -1,6 +1,6 @@ mod activities; pub(crate) mod client; -mod heartbeat; +pub(crate) mod heartbeat; mod nexus; mod slot_provider; pub(crate) mod tuner; @@ -20,6 +20,7 @@ pub(crate) use activities::{ pub(crate) use wft_poller::WFTPollerShared; pub use workflow::LEGACY_QUERY_ID; +use crate::telemetry::WorkerHeartbeatMetrics; use crate::worker::heartbeat::{HeartbeatFn, SharedNamespaceWorker}; use crate::{ ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait, @@ -53,6 +54,7 @@ use futures_util::{StreamExt, stream}; use gethostname::gethostname; use parking_lot::{Mutex, RwLock}; use slot_provider::SlotProvider; +use std::time::SystemTime; use std::{ convert::TryInto, future, @@ -62,16 +64,24 @@ use std::{ }, time::Duration, }; +use sysinfo::System; use temporal_client::{ClientWorker, HeartbeatCallback, Slot as SlotTrait}; use temporal_client::{ ConfiguredClient, SharedNamespaceWorkerTrait, TemporalServiceClientWithMetrics, }; use temporal_sdk_core_api::telemetry::metrics::TemporalMeter; +use temporal_sdk_core_api::worker::{ + ActivitySlotKind, LocalActivitySlotKind, NexusSlotKind, SlotKind, WorkflowSlotKind, +}; use temporal_sdk_core_api::{ errors::{CompleteNexusError, WorkerValidationError}, worker::PollerBehavior, }; -use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; +use temporal_sdk_core_protos::temporal::api::deployment; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::{ + WorkerHeartbeat, WorkerHostInfo, WorkerPollerInfo, WorkerSlotsInfo, +}; use temporal_sdk_core_protos::{ TaskToken, coresdk::{ @@ -131,6 +141,8 @@ pub struct Worker { all_permits_tracker: 
tokio::sync::Mutex, /// Used to track worker client client_worker_registrator: Arc, + /// Status + status: Arc>, } struct AllPermitsTracker { @@ -152,6 +164,7 @@ pub(crate) struct WorkerTelemetry { metric_meter: Option, temporal_metric_meter: Option, trace_subscriber: Option>, + in_memory_meter: Option>, } #[async_trait::async_trait] @@ -249,18 +262,13 @@ impl WorkerTrait for Worker { ); } self.shutdown_token.cancel(); + *self.status.lock() = WorkerStatus::ShuttingDown; // First, unregister worker from the client - if let Err(e) = self - .client - .workers() - .unregister_worker(self.worker_instance_key) - { - error!( - task_queue=%self.config.task_queue, - namespace=%self.config.namespace, - error=%e, - "Failed to unregister worker on shutdown", - ); + if !self.client_worker_registrator.shared_namespace_worker { + let _res = self + .client + .workers() + .unregister_worker(self.worker_instance_key); } // Second, we want to stop polling of both activity and workflow tasks @@ -288,6 +296,10 @@ impl WorkerTrait for Worker { async fn finalize_shutdown(self) { self.finalize_shutdown().await } + + fn worker_instance_key(&self) -> Uuid { + self.worker_instance_key + } } impl Worker { @@ -363,6 +375,7 @@ impl Worker { metric_meter: telem.get_metric_meter(), temporal_metric_meter: telem.get_temporal_metric_meter(), trace_subscriber: telem.trace_subscriber(), + in_memory_meter: telem.in_memory_metrics(), }); Worker::new_with_pollers_inner( @@ -383,14 +396,16 @@ impl Worker { task_pollers: TaskPollers, worker_telemetry: Option, worker_heartbeat_interval: Option, - shared_namespace_worker: bool, + shared_namespace_worker: bool, // TODO: is this unnecessary? ) -> Result { + // let shared_namespace_worker = in_memory_meter.is_none(); let (metrics, meter) = if let Some(wt) = worker_telemetry.as_ref() { ( MetricsContext::top_level_with_meter( config.namespace.clone(), config.task_queue.clone(), wt.temporal_metric_meter.clone(), + wt.in_memory_meter.clone(), ), wt.metric_meter.clone(), ) @@ -434,6 +449,12 @@ impl Worker { ); let act_permits = act_slots.get_extant_count_rcv(); let (external_wft_tx, external_wft_rx) = unbounded_channel(); + + let wf_last_suc_poll_time = Arc::new(Mutex::new(None)); + let wf_sticky_last_suc_poll_time = Arc::new(Mutex::new(None)); + let act_last_suc_poll_time = Arc::new(Mutex::new(None)); + let nexus_last_suc_poll_time = Arc::new(Mutex::new(None)); + let nexus_slots = MeteredPermitDealer::new( tuner.nexus_task_slot_supplier(), metrics.with_new_attrs([nexus_worker_type()]), @@ -450,6 +471,8 @@ impl Worker { &metrics, &shutdown_token, &wft_slots, + wf_last_suc_poll_time.clone(), + wf_sticky_last_suc_poll_time.clone(), ); let wft_stream = if !client.is_mock() { // Some replay tests combine a mock client with real pollers, @@ -464,6 +487,7 @@ impl Worker { None } else { let act_metrics = metrics.with_new_attrs([activity_poller()]); + // activity poller let ap = LongPollBuffer::new_activity_task( client.clone(), config.task_queue.clone(), @@ -475,11 +499,13 @@ impl Worker { max_worker_acts_per_second: config.max_worker_activities_per_second, max_tps: config.max_task_queue_activities_per_second, }, + act_last_suc_poll_time.clone(), ); Some(Box::from(ap) as BoxedActPoller) }; let np_metrics = metrics.with_new_attrs([nexus_poller()]); + let nexus_poll_buffer = Box::new(LongPollBuffer::new_nexus_task( client.clone(), config.task_queue.clone(), @@ -487,6 +513,7 @@ impl Worker { nexus_slots.clone(), shutdown_token.child_token(), Some(move |np| np_metrics.record_num_pollers(np)), + 
nexus_last_suc_poll_time.clone(), shared_namespace_worker, )) as BoxedNexusPoller; @@ -531,13 +558,13 @@ impl Worker { let la_permits = la_permit_dealer.get_extant_count_rcv(); let local_act_mgr = Arc::new(LocalActivityManager::new( config.namespace.clone(), - la_permit_dealer, + la_permit_dealer.clone(), hb_tx, metrics.clone(), )); let at_task_mgr = act_poller.map(|ap| { WorkerActivityTasks::new( - act_slots, + act_slots.clone(), ap, client.clone(), metrics.clone(), @@ -548,7 +575,7 @@ impl Worker { ) }); let poll_on_non_local_activities = at_task_mgr.is_some(); - if !poll_on_non_local_activities { + if !poll_on_non_local_activities && !shared_namespace_worker { info!("Activity polling is disabled for this worker"); }; let la_sink = LAReqSink::new(local_act_mgr.clone()); @@ -567,6 +594,7 @@ impl Worker { external_wft_tx, ); let worker_instance_key = Uuid::new_v4(); + let worker_status = Arc::new(Mutex::new(WorkerStatus::Running)); let sdk_name_and_ver = client.sdk_name_and_version(); let worker_heartbeat = worker_heartbeat_interval.map(|hb_interval| { @@ -575,6 +603,15 @@ impl Worker { worker_instance_key, hb_interval, worker_telemetry.clone(), + wft_slots.clone(), + act_slots, + nexus_slots, + la_permit_dealer, + wf_last_suc_poll_time, + wf_sticky_last_suc_poll_time, + act_last_suc_poll_time, + nexus_last_suc_poll_time, + worker_status.clone(), ) }); @@ -583,6 +620,7 @@ impl Worker { slot_provider: provider, heartbeat_manager: worker_heartbeat, client: RwLock::new(client.clone()), + shared_namespace_worker, }); if !shared_namespace_worker { @@ -650,6 +688,7 @@ impl Worker { }), nexus_mgr, client_worker_registrator, + status: worker_status, }) } @@ -658,8 +697,14 @@ impl Worker { async fn shutdown(&self) { self.initiate_shutdown(); if let Some(name) = self.workflows.get_sticky_queue_name() { + let heartbeat = self + .client_worker_registrator + .heartbeat_manager + .as_ref() + .map(|hm| hm.heartbeat_callback.clone()()); + // This is a best effort call and we can still shutdown the worker if it fails - match self.client.shutdown_worker(name).await { + match self.client.shutdown_worker(name, heartbeat).await { Err(err) if !matches!( err.code(), @@ -955,6 +1000,7 @@ struct ClientWorkerRegistrator { slot_provider: SlotProvider, heartbeat_manager: Option, client: RwLock>, + shared_namespace_worker: bool, } impl ClientWorker for ClientWorkerRegistrator { @@ -979,12 +1025,12 @@ impl ClientWorker for ClientWorkerRegistrator { fn heartbeat_callback(&self) -> Option { if let Some(hb_mgr) = self.heartbeat_manager.as_ref() { - let mut heartbeat_manager = hb_mgr.heartbeat_callback.lock(); - heartbeat_manager.take() + Some(hb_mgr.heartbeat_callback.clone()) } else { None } } + fn new_shared_namespace_worker( &self, ) -> Result, anyhow::Error> { @@ -999,12 +1045,6 @@ impl ClientWorker for ClientWorkerRegistrator { bail!("Shared namespace worker creation never be called without a heartbeat manager"); } } - - fn register_callback(&self, callback: HeartbeatCallback) { - if let Some(hb_mgr) = self.heartbeat_manager.as_ref() { - hb_mgr.heartbeat_callback.lock().replace(callback); - } - } } struct WorkerHeartbeatManager { @@ -1013,33 +1053,66 @@ struct WorkerHeartbeatManager { /// Telemetry instance, needed to initialize [SharedNamespaceWorker] when replacing client telemetry: Option, /// Heartbeat callback - heartbeat_callback: Mutex WorkerHeartbeat + Send + Sync>>>, + heartbeat_callback: Arc WorkerHeartbeat + Send + Sync>, } impl WorkerHeartbeatManager { + #[allow(clippy::too_many_arguments)] fn new( 
config: WorkerConfig, worker_instance_key: Uuid, heartbeat_interval: Duration, telemetry_instance: Option, + wft_slots: MeteredPermitDealer, + act_slots: MeteredPermitDealer, + nexus_slots: MeteredPermitDealer, + la_slots: MeteredPermitDealer, + wf_last_suc_poll_time: Arc>>, + wf_sticky_last_suc_poll_time: Arc>>, + act_last_suc_poll_time: Arc>>, + nexus_last_suc_poll_time: Arc>>, + status: Arc>, ) -> Self { - let worker_instance_key_clone = worker_instance_key.to_string(); let task_queue = config.task_queue.clone(); + let deployment_version = config.computed_deployment_version(); + let deployment_version = + deployment_version.map(|dv| deployment::v1::WorkerDeploymentVersion { + deployment_name: dv.deployment_name, + build_id: dv.build_id, + }); - // TODO: requires the metrics changes to get the rest of these fields - let worker_heartbeat_callback: HeartbeatFn = Box::new(move || { - WorkerHeartbeat { - worker_instance_key: worker_instance_key_clone.clone(), + let telemetry_instance_clone = telemetry_instance.clone(); + + let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { + let mut sys = System::new_all(); + sys.refresh_all(); + std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); + sys.refresh_cpu_usage(); + let current_host_cpu_usage: f32 = + sys.cpus().iter().map(|cpu| cpu.cpu_usage()).sum::() / sys.cpus().len() as f32; + let total_mem = sys.total_memory() as f64; + let used_mem = sys.used_memory() as f64; + let current_host_mem_usage = (used_mem / total_mem) as f32; + + let mut worker_heartbeat = WorkerHeartbeat { + worker_instance_key: worker_instance_key.to_string(), host_info: Some(WorkerHostInfo { host_name: gethostname().to_string_lossy().to_string(), process_id: std::process::id().to_string(), - ..Default::default() + current_host_cpu_usage, + current_host_mem_usage, + + // Set by SharedNamespaceWorker because it relies on the client + process_key: String::new(), }), task_queue: task_queue.clone(), - deployment_version: None, + deployment_version: deployment_version.clone(), + + status: (*status.lock()) as i32, + start_time: Some(SystemTime::now().into()), + plugins: config.plugins.clone(), - status: 0, - start_time: Some(std::time::SystemTime::now().into()), + // Metrics dependent, set below workflow_task_slots_info: None, activity_task_slots_info: None, nexus_task_slots_info: None, @@ -1051,19 +1124,103 @@ impl WorkerHeartbeatManager { total_sticky_cache_hit: 0, total_sticky_cache_miss: 0, current_sticky_cache_size: 0, - plugins: vec![], // sdk_name, sdk_version, and worker_identity must be set by // SharedNamespaceWorker because they rely on the client, and // need to be pulled from the current client used by SharedNamespaceWorker - ..Default::default() + worker_identity: String::new(), + heartbeat_time: None, + elapsed_since_last_heartbeat: None, + sdk_name: String::new(), + sdk_version: String::new(), + }; + + if let Some(telem_instance) = telemetry_instance_clone.as_ref() + && let Some(in_mem) = telem_instance.in_memory_meter.as_ref() + { + worker_heartbeat.total_sticky_cache_hit = + in_mem.total_sticky_cache_hit.load(Ordering::Relaxed) as i32; + worker_heartbeat.total_sticky_cache_miss = + in_mem.total_sticky_cache_miss.load(Ordering::Relaxed) as i32; + worker_heartbeat.current_sticky_cache_size = + in_mem.sticky_cache_size.load(Ordering::Relaxed) as i32; + // TODO: Is this ever not Some()? 
+ worker_heartbeat.workflow_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .wft_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: wf_last_suc_poll_time.lock().map(|time| time.into()), + is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), + }); + + worker_heartbeat.workflow_sticky_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .sticky_wft_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: wf_sticky_last_suc_poll_time + .lock() + .map(|time| time.into()), + is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), + }); + worker_heartbeat.activity_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .activity_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: act_last_suc_poll_time + .lock() + .map(|time| time.into()), + is_autoscaling: config.activity_task_poller_behavior.is_autoscaling(), + }); + worker_heartbeat.nexus_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .nexus_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: nexus_last_suc_poll_time + .lock() + .map(|time| time.into()), + is_autoscaling: config.nexus_task_poller_behavior.is_autoscaling(), + }); + + worker_heartbeat.workflow_task_slots_info = make_slots_info( + &wft_slots, + in_mem + .workflow_task_execution_latency + .load(Ordering::Relaxed), + in_mem + .workflow_task_execution_failed + .load(Ordering::Relaxed), + ); + worker_heartbeat.activity_task_slots_info = make_slots_info( + &act_slots, + in_mem.activity_execution_latency.load(Ordering::Relaxed), + in_mem.activity_execution_failed.load(Ordering::Relaxed), + ); + worker_heartbeat.nexus_task_slots_info = make_slots_info( + &nexus_slots, + in_mem.nexus_task_execution_latency.load(Ordering::Relaxed), + in_mem.nexus_task_execution_failed.load(Ordering::Relaxed), + ); + worker_heartbeat.local_activity_slots_info = make_slots_info( + &la_slots, + in_mem + .local_activity_execution_latency + .load(Ordering::Relaxed), + in_mem + .local_activity_execution_failed + .load(Ordering::Relaxed), + ); } + worker_heartbeat }); WorkerHeartbeatManager { heartbeat_interval, telemetry: telemetry_instance, - heartbeat_callback: Mutex::new(Some(worker_heartbeat_callback)), + heartbeat_callback: worker_heartbeat_callback, } } } @@ -1105,6 +1262,35 @@ fn wft_poller_behavior(config: &WorkerConfig, is_sticky: bool) -> PollerBehavior } } +fn make_slots_info( + dealer: &MeteredPermitDealer, + total_processed: u64, + total_failed: u64, +) -> Option +where + SK: SlotKind + 'static, +{ + let avail_usize = dealer.available_permits()?; + let max_usize = dealer.max_permits()?; + + let avail = i32::try_from(avail_usize).unwrap_or(i32::MAX); + let max = i32::try_from(max_usize).unwrap_or(i32::MAX); + + let used = (max - avail).max(0); + + Some(WorkerSlotsInfo { + current_available_slots: avail, + current_used_slots: used, + slot_supplier_kind: SK::kind().to_string(), + total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MAX), + total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MAX), + + // Filled in by heartbeat later + last_interval_processed_tasks: 0, + last_interval_failure_tasks: 0, + }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/src/worker/workflow/wft_poller.rs b/core/src/worker/workflow/wft_poller.rs index 3cc2da579..5d7faf2dd 100644 --- a/core/src/worker/workflow/wft_poller.rs 
+++ b/core/src/worker/workflow/wft_poller.rs @@ -7,12 +7,15 @@ use crate::{ worker::{client::WorkerClient, wft_poller_behavior}, }; use futures_util::{Stream, stream}; +use parking_lot::Mutex; use std::sync::{Arc, OnceLock}; +use std::time::SystemTime; use temporal_sdk_core_api::worker::{WorkerConfig, WorkflowSlotKind}; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse; use tokio::sync::watch; use tokio_util::sync::CancellationToken; +#[allow(clippy::too_many_arguments)] pub(crate) fn make_wft_poller( config: &WorkerConfig, sticky_queue_name: &Option, @@ -20,6 +23,8 @@ pub(crate) fn make_wft_poller( metrics: &MetricsContext, shutdown_token: &CancellationToken, wft_slots: &MeteredPermitDealer, + last_successful_poll_time: Arc>>, + sticky_last_successful_poll_time: Arc>>, ) -> impl Stream< Item = Result< ( @@ -52,6 +57,7 @@ pub(crate) fn make_wft_poller( WorkflowTaskOptions { wft_poller_shared: wft_poller_shared.clone(), }, + last_successful_poll_time, ); let sticky_queue_poller = sticky_queue_name.as_ref().map(|sqn| { let sticky_metrics = metrics.with_new_attrs([workflow_sticky_poller()]); @@ -66,6 +72,7 @@ pub(crate) fn make_wft_poller( sticky_metrics.record_num_pollers(np); }), WorkflowTaskOptions { wft_poller_shared }, + sticky_last_successful_poll_time, ) }); let wf_task_poll_buffer = Box::new(WorkflowTaskPoller::new( diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 97039c2cb..1c019bcdb 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -22,6 +22,7 @@ tokio = { version = "1.47", features = ["rt", "rt-multi-thread", "parking_lot", tokio-util = { version = "0.7" } tokio-stream = "0.1" tracing = "0.1" +uuid = { version = "1.18.1", features = ["v4"] } [dependencies.temporal-sdk-core] path = "../core" diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 35b2825c3..859629512 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -38,6 +38,7 @@ use temporal_sdk::{ WorkerInterceptor, }, }; +pub(crate) use temporal_sdk_core::test_help::NAMESPACE; use temporal_sdk_core::{ ClientOptions, ClientOptionsBuilder, CoreRuntime, RuntimeOptions, RuntimeOptionsBuilder, WorkerConfigBuilder, init_replay_worker, init_worker, @@ -67,8 +68,7 @@ use temporal_sdk_core_protos::{ use tokio::{sync::OnceCell, task::AbortHandle}; use tracing::{debug, warn}; use url::Url; - -pub(crate) use temporal_sdk_core::test_help::NAMESPACE; +use uuid::Uuid; /// The env var used to specify where the integ tests should point pub(crate) const INTEG_SERVER_TARGET_ENV_VAR: &str = "TEMPORAL_SERVICE_ADDRESS"; pub(crate) const INTEG_NAMESPACE_ENV_VAR: &str = "TEMPORAL_NAMESPACE"; @@ -498,6 +498,10 @@ impl TestWorker { &mut self.inner } + pub(crate) fn worker_instance_key(&self) -> Uuid { + self.core_worker.worker_instance_key() + } + // TODO: Maybe trait-ify? 
pub(crate) fn register_wf>( &mut self, diff --git a/tests/integ_tests/metrics_tests.rs b/tests/integ_tests/metrics_tests.rs index dc7caa812..d325e404b 100644 --- a/tests/integ_tests/metrics_tests.rs +++ b/tests/integ_tests/metrics_tests.rs @@ -22,8 +22,10 @@ use temporal_sdk::{ ActContext, ActivityError, ActivityOptions, CancellableFuture, LocalActivityOptions, NexusOperationOptions, WfContext, }; +use temporal_sdk_core::telemetry::start_prometheus_metric_exporter; use temporal_sdk_core::{ - CoreRuntime, FixedSizeSlotSupplier, TokioRuntimeBuilder, TunerBuilder, init_worker, + CoreRuntime, FixedSizeSlotSupplier, RuntimeOptionsBuilder, TokioRuntimeBuilder, TunerBuilder, + init_worker, telemetry::{WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME, build_otlp_metric_exporter}, }; use temporal_sdk_core_api::{ @@ -1327,3 +1329,99 @@ async fn prometheus_label_nonsense() { assert!(body.contains("some_counter{thing=\"foo\"} 2")); assert!(body.contains("some_counter{blerp=\"baz\"} 2")); } + +// Tests that rely on Prometheus running in a docker container need to start +// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run +#[rstest::rstest] +#[tokio::test] +async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) { + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let mut rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + match backing { + "otel" => { + let url = Some("grpc://localhost:4317") + .map(|x| x.parse::().unwrap()) + .unwrap(); + let mut opts_build = OtelCollectorOptionsBuilder::default(); + let opts = opts_build.url(url).build().unwrap(); + // If wanna add more options: https://github.com/temporalio/sdk-ruby/blob/143e421d82d16e58bd45226998363d55e4bc3bbb/temporalio/ext/src/runtime.rs#L113C21-L135C22 + + rt.telemetry_mut() + .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); + } + "prom" => { + let mut opts_build = PrometheusExporterOptionsBuilder::default(); + opts_build.socket_addr(ANY_PORT.parse().unwrap()); + let opts = opts_build.build().unwrap(); + rt.telemetry_mut() + .attach_late_init_metrics(start_prometheus_metric_exporter(opts).unwrap().meter); + } + _ => unreachable!(), + } + let wf_name = "runtime_new_otel"; + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + starter + .worker_config + .max_outstanding_workflow_tasks(5_usize) + .max_cached_workflows(5_usize) + .max_outstanding_activities(5_usize); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + // Run a workflow + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + println!("STARTING ACTIVITY"); + Ok(i) + }); + + starter.start_with_worker(wf_name, &mut worker).await; + + // for i in 1..5 { + // worker.submit_wf( + // format!("{wf_name}-{i}"), + // wf_name, + // vec![], + // starter.workflow_options.clone(), + // ) + // .await + // .unwrap(); + // } + worker.run_until_done().await.unwrap(); + + // TODO: clone_no_worker() for new worker + + // TODO: ListWorkers + let client = 
starter.get_client().await; + let workers_list = client + .list_workers(100, Vec::new(), String::new()) + .await + .unwrap(); + // println!("workers_list: {workers_list:#?}"); + + // TODO: need to find worker with matching worker_heartbeat + let worker_info = workers_list.workers_info.iter().find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }); + println!("worker_instance_key {worker_instance_key:?}"); + println!("worker_info: {worker_info:#?}"); + + // TODO: add some asserts to ensure data shows up +} diff --git a/tests/workflow_replay_bench.rs b/tests/workflow_replay_bench.rs index d80796b0a..323559bab 100644 --- a/tests/workflow_replay_bench.rs +++ b/tests/workflow_replay_bench.rs @@ -16,7 +16,9 @@ use std::{ time::Duration, }; use temporal_sdk::{WfContext, WorkflowFunction}; -use temporal_sdk_core::{CoreRuntime, replay::HistoryForReplay}; +use temporal_sdk_core::{ + CoreRuntime, replay::HistoryForReplay, +}; use temporal_sdk_core_api::telemetry::metrics::{ MetricKeyValue, MetricParametersBuilder, NewAttributes, }; From 0ad6be2deffea00802df29f86726321bb5cdfb4b Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 30 Sep 2025 15:57:20 -0700 Subject: [PATCH 02/23] simplify worker::new(), fix some heartbeat metrics, new test file --- core-api/src/telemetry/metrics.rs | 162 ++++++------ core-api/src/worker.rs | 8 +- core-c-bridge/src/metric.rs | 4 +- core/src/abstractions.rs | 43 ++++ core/src/core_tests/workflow_tasks.rs | 1 - core/src/lib.rs | 1 - core/src/replay/mod.rs | 2 +- core/src/telemetry/metrics.rs | 5 +- core/src/telemetry/mod.rs | 2 +- core/src/telemetry/otel.rs | 4 +- core/src/worker/client.rs | 101 +++++++- core/src/worker/client/mocks.rs | 1 + core/src/worker/heartbeat.rs | 125 +-------- core/src/worker/mod.rs | 94 +++---- core/src/worker/tuner/fixed_size.rs | 4 + core/src/worker/tuner/resource_based.rs | 4 + tests/integ_tests/metrics_tests.rs | 100 +------- tests/integ_tests/worker_heartbeat_tests.rs | 267 ++++++++++++++++++++ tests/main.rs | 1 + tests/workflow_replay_bench.rs | 4 +- 20 files changed, 552 insertions(+), 381 deletions(-) create mode 100644 tests/integ_tests/worker_heartbeat_tests.rs diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index c1e4adb16..8a8e44400 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -34,7 +34,7 @@ pub trait CoreMeter: Send + Sync + Debug { params: MetricParameters, in_memory_counter: HeartbeatMetricType, ) -> Counter { - let primary_counter = self.counter(params.clone()); + let primary_counter = self.counter(params); Counter::new_with_in_memory(primary_counter.primary.metric.clone(), in_memory_counter) } @@ -47,6 +47,7 @@ pub trait CoreMeter: Send + Sync + Debug { /// accordingly. fn histogram_duration(&self, params: MetricParameters) -> HistogramDuration; + /// Create a histogram duration with in-memory tracking for dual metrics reporting fn histogram_duration_with_in_memory( &self, params: MetricParameters, @@ -73,12 +74,47 @@ pub trait CoreMeter: Send + Sync + Debug { fn in_memory_metrics(&self) -> Arc; } +/// Provides a generic way to record metrics in memory. +/// This can be done either with individual metrics or more fine-grained metrics +/// that vary by a set of labels for the same metric. 
#[derive(Clone, Debug)] pub enum HeartbeatMetricType { - Regular(Arc), + Individual(Arc), WithLabel(HashMap>), } +impl HeartbeatMetricType { + fn record_counter(&self, delta: u64) { + match self { + HeartbeatMetricType::Individual(metric) => { + metric.fetch_add(delta, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("Only gauge should support in-memory metric with labels"); + } + } + } + + fn record_histogram_observation(&self) { + self.record_counter(1); + } + + fn record_gauge(&self, value: u64, attributes: &MetricAttributes) { + match self { + HeartbeatMetricType::Individual(metric) => { + metric.store(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(metrics) => { + if let Some(label_value) = label_value_from_attributes(attributes, "poller_type") { + if let Some(metric) = metrics.get(label_value.as_str()) { + metric.store(value, Ordering::Relaxed); + } + } + } + } + } +} + fn label_value_from_attributes(attributes: &MetricAttributes, key: &str) -> Option { match attributes { MetricAttributes::Prometheus { labels } => labels.as_prom_labels().get(key).cloned(), @@ -101,21 +137,21 @@ pub struct NumPollersMetric { impl NumPollersMetric { pub fn as_map(&self) -> HashMap> { - let mut map = HashMap::new(); - map.insert( - "workflow_task".to_string(), - self.wft_current_pollers.clone(), - ); - map.insert( - "sticky_workflow_task".to_string(), - self.sticky_wft_current_pollers.clone(), - ); - map.insert( - "activity_task".to_string(), - self.activity_current_pollers.clone(), - ); - map.insert("nexus_task".to_string(), self.nexus_current_pollers.clone()); - map + HashMap::from([ + ( + "workflow_task".to_string(), + self.wft_current_pollers.clone(), + ), + ( + "sticky_workflow_task".to_string(), + self.sticky_wft_current_pollers.clone(), + ), + ( + "activity_task".to_string(), + self.activity_current_pollers.clone(), + ), + ("nexus_task".to_string(), self.nexus_current_pollers.clone()), + ]) } } @@ -138,38 +174,38 @@ pub struct WorkerHeartbeatMetrics { impl WorkerHeartbeatMetrics { pub fn get_metric(&self, name: &str) -> Option { match name { - "sticky_cache_size" => { - Some(HeartbeatMetricType::Regular(self.sticky_cache_size.clone())) - } - "sticky_cache_hit" => Some(HeartbeatMetricType::Regular( + "sticky_cache_size" => Some(HeartbeatMetricType::Individual( + self.sticky_cache_size.clone(), + )), + "sticky_cache_hit" => Some(HeartbeatMetricType::Individual( self.total_sticky_cache_hit.clone(), )), - "sticky_cache_miss" => Some(HeartbeatMetricType::Regular( + "sticky_cache_miss" => Some(HeartbeatMetricType::Individual( self.total_sticky_cache_miss.clone(), )), "num_pollers" => Some(HeartbeatMetricType::WithLabel(self.num_pollers.as_map())), - "workflow_task_execution_failed" => Some(HeartbeatMetricType::Regular( + "workflow_task_execution_failed" => Some(HeartbeatMetricType::Individual( self.workflow_task_execution_failed.clone(), )), - "activity_execution_failed" => Some(HeartbeatMetricType::Regular( + "activity_execution_failed" => Some(HeartbeatMetricType::Individual( self.activity_execution_failed.clone(), )), - "nexus_task_execution_failed" => Some(HeartbeatMetricType::Regular( + "nexus_task_execution_failed" => Some(HeartbeatMetricType::Individual( self.nexus_task_execution_failed.clone(), )), - "local_activity_execution_failed" => Some(HeartbeatMetricType::Regular( + "local_activity_execution_failed" => Some(HeartbeatMetricType::Individual( self.local_activity_execution_failed.clone(), )), - "activity_execution_latency" => 
Some(HeartbeatMetricType::Regular( + "activity_execution_latency" => Some(HeartbeatMetricType::Individual( self.activity_execution_latency.clone(), )), - "local_activity_execution_latency" => Some(HeartbeatMetricType::Regular( + "local_activity_execution_latency" => Some(HeartbeatMetricType::Individual( self.local_activity_execution_latency.clone(), )), - "workflow_task_execution_latency" => Some(HeartbeatMetricType::Regular( + "workflow_task_execution_latency" => Some(HeartbeatMetricType::Individual( self.workflow_task_execution_latency.clone(), )), - "nexus_task_execution_latency" => Some(HeartbeatMetricType::Regular( + "nexus_task_execution_latency" => Some(HeartbeatMetricType::Individual( self.nexus_task_execution_latency.clone(), )), _ => None, @@ -417,14 +453,7 @@ impl Counter { } if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.fetch_add(value, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(_) => { - dbg_panic!("No in memory metric should use labels today"); - } - } + in_mem.record_counter(value); } } @@ -449,14 +478,7 @@ impl CounterBase for Counter { bound.adds(value); if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.fetch_add(value, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(_) => { - dbg_panic!("No in memory metric should use labels today"); - } - } + in_mem.record_counter(value); } } } @@ -637,14 +659,7 @@ impl HistogramDuration { } if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.fetch_add(1, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(_) => { - dbg_panic!("No in memory HistogramDuration should use labels today"); - } - } + in_mem.record_histogram_observation(); } } @@ -667,14 +682,7 @@ impl HistogramDurationBase for HistogramDuration { bound.records(value); if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.fetch_add(1, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(_) => { - dbg_panic!("No in memory HistogramDuration should use labels today"); - } - } + in_mem.record_histogram_observation(); } } } @@ -745,20 +753,7 @@ impl Gauge { } if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.store(value, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(metrics) => { - if let Some(label_value) = - label_value_from_attributes(attributes, "poller_type") - { - if let Some(metric) = metrics.get(&label_value) { - metric.store(value, Ordering::Relaxed); - } - } - } - } + in_mem.record_gauge(value, attributes); } } @@ -781,20 +776,7 @@ impl GaugeBase for Gauge { bound.records(value); if let Some(ref in_mem) = self.in_memory { - match in_mem { - HeartbeatMetricType::Regular(metric) => { - metric.store(value, Ordering::Relaxed); - } - HeartbeatMetricType::WithLabel(metrics) => { - if let Some(label_value) = - label_value_from_attributes(&self.primary.attributes, "poller_type") - { - if let Some(metric) = metrics.get(&label_value) { - metric.store(value, Ordering::Relaxed); - } - } - } - } + in_mem.record_gauge(value, &self.primary.attributes); } } } diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index 507a0dece..40257b078 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -1,6 +1,6 @@ use crate::{errors::WorkflowErrorType, telemetry::metrics::TemporalMeter}; use std::{ - any::Any, + any::{Any, 
type_name}, collections::{HashMap, HashSet}, str::FromStr, sync::Arc, @@ -362,6 +362,12 @@ pub trait SlotSupplier { fn available_slots(&self) -> Option { None } + + /// Returns a human-friendly identifier describing this supplier implementation for + /// diagnostics and telemetry. + fn slot_supplier_kind(&self) -> String { + format!("{}", type_name::()) + } } pub trait SlotReservationContext: Send + Sync { diff --git a/core-c-bridge/src/metric.rs b/core-c-bridge/src/metric.rs index 0889dee64..576ab31d7 100644 --- a/core-c-bridge/src/metric.rs +++ b/core-c-bridge/src/metric.rs @@ -2,6 +2,7 @@ use crate::{ByteArrayRef, runtime::Runtime}; use std::{any::Any, error::Error, sync::Arc, time::Duration}; use temporal_sdk_core_api::telemetry::metrics; use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; +use tracing::error; pub struct MetricMeter { core: metrics::TemporalMeter, @@ -368,7 +369,8 @@ impl metrics::CoreMeter for CustomMetricMeterRef { } fn in_memory_metrics(&self) -> Arc { - todo!() + error!("in_memory_metrics() is not supported for CustomMetricMeterRef"); + Arc::new(WorkerHeartbeatMetrics::default()) } } diff --git a/core/src/abstractions.rs b/core/src/abstractions.rs index bffa1eb9a..482b17d27 100644 --- a/core/src/abstractions.rs +++ b/core/src/abstractions.rs @@ -25,6 +25,7 @@ use tokio_util::sync::CancellationToken; #[derive(Clone)] pub(crate) struct MeteredPermitDealer { supplier: Arc + Send + Sync>, + slot_supplier_kind: SlotSupplierKind, /// The number of permit owners who have acquired a permit, but are not yet meaningfully using /// that permit. This is useful for giving a more semantically accurate count of used task /// slots, since we typically wait for a permit first before polling, but that slot isn't used @@ -54,6 +55,35 @@ pub(crate) struct PermitDealerContextData { pub(crate) worker_deployment_version: Option, } +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) enum SlotSupplierKind { + Fixed, + ResourceBased, + Custom(String), +} + +impl SlotSupplierKind { + fn from_label(label: &str) -> Self { + if label == "Fixed" { + SlotSupplierKind::Fixed + } else if label == "ResourceBased" { + SlotSupplierKind::ResourceBased + } else { + SlotSupplierKind::Custom(label.to_string()) + } + } +} + +impl std::fmt::Display for SlotSupplierKind { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SlotSupplierKind::Fixed => f.write_str("Fixed"), + SlotSupplierKind::ResourceBased => f.write_str("ResourceBased"), + SlotSupplierKind::Custom(name) => f.write_str(name.as_str()), + } + } +} + impl MeteredPermitDealer where SK: SlotKind + 'static, @@ -65,8 +95,11 @@ where context_data: Arc, meter: Option, ) -> Self { + let supplier_kind_label = supplier.slot_supplier_kind(); + let slot_supplier_kind = SlotSupplierKind::from_label(supplier_kind_label.as_ref()); Self { supplier, + slot_supplier_kind, unused_claimants: Arc::new(AtomicUsize::new(0)), extant_permits: watch::channel(0), metrics_ctx, @@ -81,6 +114,10 @@ where self.supplier.available_slots() } + pub(crate) fn slot_supplier_kind(&self) -> &SlotSupplierKind { + &self.slot_supplier_kind + } + #[cfg(test)] pub(crate) fn unused_permits(&self) -> Option { self.available_permits() @@ -496,4 +533,10 @@ pub(crate) mod tests { // Now it'll proceed acquire_fut.await; } + + #[test] + fn captures_slot_supplier_kind() { + let dealer = fixed_size_permit_dealer::(1); + assert_eq!(*dealer.slot_supplier_kind(), SlotSupplierKind::Fixed); + } } diff --git a/core/src/core_tests/workflow_tasks.rs 
b/core/src/core_tests/workflow_tasks.rs index e5550938b..8d5df2b7a 100644 --- a/core/src/core_tests/workflow_tasks.rs +++ b/core/src/core_tests/workflow_tasks.rs @@ -2996,7 +2996,6 @@ async fn both_normal_and_sticky_pollers_poll_concurrently() { Arc::new(mock_client), None, None, - false, ) .unwrap(); diff --git a/core/src/lib.rs b/core/src/lib.rs index 3995b7562..0324422e5 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -123,7 +123,6 @@ where client_bag.clone(), Some(&runtime.telemetry), runtime.heartbeat_interval, - false, ) } diff --git a/core/src/replay/mod.rs b/core/src/replay/mod.rs index 1e4990000..03f0003be 100644 --- a/core/src/replay/mod.rs +++ b/core/src/replay/mod.rs @@ -114,7 +114,7 @@ where hist_allow_tx.send("Failed".to_string()).unwrap(); async move { Ok(RespondWorkflowTaskFailedResponse::default()) }.boxed() }); - let mut worker = Worker::new(self.config, None, Arc::new(client), None, None, false)?; + let mut worker = Worker::new(self.config, None, Arc::new(client), None, None)?; worker.set_post_activate_hook(post_activate); shutdown_tok(worker.shutdown_token()); Ok(worker) diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index c955173b5..1a3bb47dc 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -694,7 +694,7 @@ pub const ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME: &str = "activity_execution_laten pub(super) const NUM_POLLERS_NAME: &str = "num_pollers"; pub(super) const TASK_SLOTS_AVAILABLE_NAME: &str = "worker_task_slots_available"; pub(super) const TASK_SLOTS_USED_NAME: &str = "worker_task_slots_used"; -pub(crate) const STICKY_CACHE_SIZE_NAME: &str = "sticky_cache_size"; +pub(super) const STICKY_CACHE_SIZE_NAME: &str = "sticky_cache_size"; /// Track a failure metric if the failure is not a benign application failure. pub(crate) fn should_record_failure_metric(failure: &Option) -> bool { @@ -886,7 +886,8 @@ where } fn in_memory_metrics(&self) -> Arc { - todo!() + error!("in_memory_metrics() is not supported for MetricsCallBuffer"); + Arc::new(WorkerHeartbeatMetrics::default()) } } impl MetricCallBufferer for MetricsCallBuffer diff --git a/core/src/telemetry/mod.rs b/core/src/telemetry/mod.rs index c0a16ea3d..6ae0b2137 100644 --- a/core/src/telemetry/mod.rs +++ b/core/src/telemetry/mod.rs @@ -64,11 +64,11 @@ pub struct TelemetryInstance { metric_prefix: String, logs_out: Option>, metrics: Option>, + in_memory_metrics: Option>, /// The tracing subscriber which is associated with this telemetry instance. May be `None` if /// the user has not opted into any tracing configuration. trace_subscriber: Option>, attach_service_name: bool, - in_memory_metrics: Option>, // TODO: Should this even be option? } impl TelemetryInstance { diff --git a/core/src/telemetry/otel.rs b/core/src/telemetry/otel.rs index 6ffbad1c0..8a2bdb988 100644 --- a/core/src/telemetry/otel.rs +++ b/core/src/telemetry/otel.rs @@ -153,7 +153,7 @@ pub fn build_otlp_metric_exporter( MeterProviderBuilder::default().with_reader(reader), &opts.global_tags, opts.use_seconds_for_durations, - opts.histogram_bucket_overrides.clone(), + opts.histogram_bucket_overrides, )? 
.build(); @@ -270,7 +270,7 @@ impl CoreOtelMeter { } } -pub(crate) enum DurationHistogram { +enum DurationHistogram { Milliseconds(opentelemetry::metrics::Histogram), Seconds(opentelemetry::metrics::Histogram), } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index d1712fe27..74b184887 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -2,7 +2,10 @@ pub(crate) mod mocks; use crate::protosext::legacy_query_failure; -use parking_lot::RwLock; +use parking_lot::{Mutex, RwLock}; +use prost_types::Duration as PbDuration; +use std::collections::HashMap; +use std::time::SystemTime; use std::{sync::Arc, time::Duration}; use temporal_client::{ Client, ClientWorkerSet, IsWorkerTaskLongPoll, Namespace, NamespacedClient, NoRetryOnMatching, @@ -10,6 +13,7 @@ use temporal_client::{ }; use temporal_sdk_core_api::worker::WorkerVersioningStrategy; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerSlotsInfo; use temporal_sdk_core_protos::{ TaskToken, coresdk::{workflow_commands::QueryResult, workflow_completion}, @@ -49,6 +53,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, + worker_heartbeat_map: Arc>>, } impl WorkerClientBag { @@ -63,6 +68,7 @@ impl WorkerClientBag { namespace, identity, worker_versioning_strategy, + worker_heartbeat_map: Arc::new(Mutex::new(HashMap::new())), } } @@ -238,6 +244,9 @@ pub trait WorkerClient: Sync + Send { fn identity(&self) -> String; /// Get worker grouping key fn worker_grouping_key(&self) -> Uuid; + /// Sets the client-reliant fields for WorkerHeartbeat. This also updates client-level tracking + /// of heartbeat fields, like last heartbeat timestamp. 
+ fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat); } /// Configuration options shared by workflow, activity, and Nexus polling calls @@ -653,6 +662,7 @@ impl WorkerClient for WorkerClientBag { let mut worker_heartbeat = worker_heartbeat; if let Some(w) = worker_heartbeat.as_mut() { w.status = WorkerStatus::Shutdown.into(); + self.set_heartbeat_client_fields(w); } let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), @@ -721,6 +731,67 @@ impl WorkerClient for WorkerClientBag { .get_client() .worker_grouping_key() } + + fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat) { + if let Some(host_info) = heartbeat.host_info.as_mut() { + host_info.process_key = self.worker_grouping_key().to_string(); + } + heartbeat.worker_identity = self.identity(); + let sdk_name_and_ver = self.sdk_name_and_version(); + heartbeat.sdk_name = sdk_name_and_ver.0; + heartbeat.sdk_version = sdk_name_and_ver.1; + + let now = SystemTime::now(); + heartbeat.heartbeat_time = Some(now.into()); + let mut heartbeat_map = self.worker_heartbeat_map.lock(); + let client_heartbeat_data = heartbeat_map + .entry(heartbeat.worker_instance_key.clone()) + .or_default(); + let elapsed_since_last_heartbeat = + client_heartbeat_data.last_heartbeat_time.map(|hb_time| { + let dur = now.duration_since(hb_time).unwrap_or(Duration::ZERO); + PbDuration { + seconds: dur.as_secs() as i64, + nanos: dur.subsec_nanos() as i32, + } + }); + heartbeat.elapsed_since_last_heartbeat = elapsed_since_last_heartbeat; + client_heartbeat_data.last_heartbeat_time = Some(now); + + if let Some(wft_slot_info) = heartbeat.workflow_task_slots_info.as_mut() { + wft_slot_info.last_interval_processed_tasks = wft_slot_info.total_processed_tasks + - client_heartbeat_data + .workflow_task_slots_info + .total_processed_tasks; + wft_slot_info.last_interval_failure_tasks = wft_slot_info.total_failed_tasks + - client_heartbeat_data + .workflow_task_slots_info + .total_failed_tasks; + + client_heartbeat_data + .workflow_task_slots_info + .total_processed_tasks = wft_slot_info.total_processed_tasks; + client_heartbeat_data + .workflow_task_slots_info + .total_failed_tasks = wft_slot_info.total_failed_tasks; + } + update_slots( + &mut heartbeat.workflow_task_slots_info, + &mut client_heartbeat_data.workflow_task_slots_info, + ); + update_slots( + &mut heartbeat.activity_task_slots_info, + &mut client_heartbeat_data.activity_task_slots_info, + ); + update_slots( + &mut heartbeat.nexus_task_slots_info, + &mut client_heartbeat_data.nexus_task_slots_info, + ); + update_slots( + &mut heartbeat.local_activity_slots_info, + &mut client_heartbeat_data.local_activity_slots_info, + ); + } } impl NamespacedClient for WorkerClientBag { @@ -758,3 +829,31 @@ pub struct WorkflowTaskCompletion { /// Versioning behavior of the workflow, if any. 
pub versioning_behavior: VersioningBehavior, } + +#[derive(Clone, Default)] +struct SlotsInfo { + total_processed_tasks: i32, + total_failed_tasks: i32, +} + +#[derive(Clone, Default)] +struct ClientHeartbeatData { + last_heartbeat_time: Option, + + workflow_task_slots_info: SlotsInfo, + activity_task_slots_info: SlotsInfo, + nexus_task_slots_info: SlotsInfo, + local_activity_slots_info: SlotsInfo, +} + +fn update_slots(slots_info: &mut Option, client_heartbeat_data: &mut SlotsInfo) { + if let Some(wft_slot_info) = slots_info.as_mut() { + wft_slot_info.last_interval_processed_tasks = + wft_slot_info.total_processed_tasks - client_heartbeat_data.total_processed_tasks; + wft_slot_info.last_interval_failure_tasks = + wft_slot_info.total_failed_tasks - client_heartbeat_data.total_failed_tasks; + + client_heartbeat_data.total_processed_tasks = wft_slot_info.total_processed_tasks; + client_heartbeat_data.total_failed_tasks = wft_slot_info.total_failed_tasks; + } +} diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index 3fb7fed9e..b8ed1de7b 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -164,5 +164,6 @@ mockall::mock! { fn sdk_name_and_version(&self) -> (String, String); fn identity(&self) -> String; fn worker_grouping_key(&self) -> Uuid; + fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat); } } diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index c58e34f38..7ec2f7aa5 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -1,10 +1,9 @@ use crate::WorkerClient; use crate::worker::{TaskPollers, WorkerTelemetry}; use parking_lot::Mutex; -use prost_types::Duration as PbDuration; use std::collections::HashMap; use std::sync::Arc; -use std::time::{Duration, SystemTime}; +use std::time::Duration; use temporal_client::SharedNamespaceWorkerTrait; use temporal_sdk_core_api::worker::{ PollerBehavior, WorkerConfigBuilder, WorkerVersioningStrategy, @@ -47,7 +46,7 @@ impl SharedNamespaceWorker { .nexus_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) .build() .expect("all required fields should be implemented"); - let worker = crate::worker::Worker::new_with_pollers_inner( + let worker = crate::worker::Worker::new_with_pollers( config, None, client.clone(), @@ -69,42 +68,17 @@ impl SharedNamespaceWorker { tokio::spawn(async move { let mut ticker = tokio::time::interval(heartbeat_interval); - let mut last_heartbeat_time = HashMap::new(); - let mut last_processed_tasks = HashMap::new(); loop { tokio::select! { _ = ticker.tick() => { let mut hb_to_send = Vec::new(); - for (instance_key, heartbeat_callback) in heartbeat_map_clone.lock().iter() { + for (_instance_key, heartbeat_callback) in heartbeat_map_clone.lock().iter() { let mut heartbeat = heartbeat_callback(); - let heartbeat_time = last_heartbeat_time.get(instance_key).cloned(); - let now = SystemTime::now(); - let elapsed_since_last_heartbeat = heartbeat_time.map( - |hb_time| { - let dur = now.duration_since(hb_time).unwrap_or(Duration::ZERO); - PbDuration { - seconds: dur.as_secs() as i64, - nanos: dur.subsec_nanos() as i32, - } - } - ); - - heartbeat.elapsed_since_last_heartbeat = elapsed_since_last_heartbeat; - heartbeat.heartbeat_time = Some(now.into()); - - process_slot_info(*instance_key, &mut heartbeat, &mut last_processed_tasks); - // All of these heartbeat details rely on a client. 
To avoid circular // dependencies, this must be populated from within SharedNamespaceWorker // to get info from the current client - heartbeat.worker_identity = client_clone.identity(); - let sdk_name_and_ver = client_clone.sdk_name_and_version(); - heartbeat.sdk_name = sdk_name_and_ver.0; - heartbeat.sdk_version = sdk_name_and_ver.1; - + client_clone.set_heartbeat_client_fields(&mut heartbeat); hb_to_send.push(heartbeat); - - last_heartbeat_time.insert(*instance_key, now); } if let Err(e) = client_clone.record_worker_heartbeat(namespace_clone.clone(), hb_to_send).await { if matches!(e.code(), tonic::Code::Unimplemented) { @@ -156,96 +130,6 @@ impl SharedNamespaceWorkerTrait for SharedNamespaceWorker { } } -#[derive(Default)] -struct SlotsInfo { - last_interval_processed_tasks: i32, - last_interval_failure_tasks: i32, -} - -#[derive(Default)] -struct HeartbeatSlotsInfo { - workflow_task_slots_info: SlotsInfo, - activity_task_slots_info: SlotsInfo, - nexus_task_slots_info: SlotsInfo, - local_activity_slots_info: SlotsInfo, -} - -fn process_slot_info( - worker_instance_key: Uuid, - heartbeat: &mut WorkerHeartbeat, - slots_map: &mut HashMap, -) { - let slots_info = slots_map.entry(worker_instance_key).or_default(); - if let Some(wft_slot_info) = heartbeat.workflow_task_slots_info.as_mut() { - wft_slot_info.last_interval_processed_tasks = wft_slot_info.total_processed_tasks - - slots_info - .workflow_task_slots_info - .last_interval_processed_tasks; - wft_slot_info.last_interval_failure_tasks = wft_slot_info.total_failed_tasks - - slots_info - .workflow_task_slots_info - .last_interval_failure_tasks; - - slots_info - .workflow_task_slots_info - .last_interval_processed_tasks = wft_slot_info.total_processed_tasks; - slots_info - .workflow_task_slots_info - .last_interval_failure_tasks = wft_slot_info.total_failed_tasks; - } - - if let Some(act_slot_info) = heartbeat.activity_task_slots_info.as_mut() { - act_slot_info.last_interval_processed_tasks = act_slot_info.total_processed_tasks - - slots_info - .activity_task_slots_info - .last_interval_processed_tasks; - act_slot_info.last_interval_failure_tasks = act_slot_info.total_failed_tasks - - slots_info - .activity_task_slots_info - .last_interval_failure_tasks; - - slots_info - .activity_task_slots_info - .last_interval_processed_tasks = act_slot_info.total_processed_tasks; - slots_info - .activity_task_slots_info - .last_interval_failure_tasks = act_slot_info.total_failed_tasks; - } - - if let Some(nexus_slot_info) = heartbeat.nexus_task_slots_info.as_mut() { - nexus_slot_info.last_interval_processed_tasks = nexus_slot_info.total_processed_tasks - - slots_info - .nexus_task_slots_info - .last_interval_processed_tasks; - nexus_slot_info.last_interval_failure_tasks = nexus_slot_info.total_failed_tasks - - slots_info.nexus_task_slots_info.last_interval_failure_tasks; - - slots_info - .nexus_task_slots_info - .last_interval_processed_tasks = nexus_slot_info.total_processed_tasks; - slots_info.nexus_task_slots_info.last_interval_failure_tasks = - nexus_slot_info.total_failed_tasks; - } - - if let Some(la_slot_info) = heartbeat.local_activity_slots_info.as_mut() { - la_slot_info.last_interval_processed_tasks = la_slot_info.total_processed_tasks - - slots_info - .local_activity_slots_info - .last_interval_processed_tasks; - la_slot_info.last_interval_failure_tasks = la_slot_info.total_failed_tasks - - slots_info - .local_activity_slots_info - .last_interval_failure_tasks; - - slots_info - .local_activity_slots_info - .last_interval_processed_tasks = 
la_slot_info.total_processed_tasks; - slots_info - .local_activity_slots_info - .last_interval_failure_tasks = la_slot_info.total_failed_tasks; - } -} - #[cfg(test)] mod tests { use crate::{ @@ -308,7 +192,6 @@ mod tests { client.clone(), None, Some(Duration::from_millis(100)), - false, ) .unwrap(); diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 3e92a00e1..c2e35ad7a 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -141,7 +141,7 @@ pub struct Worker { all_permits_tracker: tokio::sync::Mutex, /// Used to track worker client client_worker_registrator: Arc, - /// Status + /// Status of the worker status: Arc>, } @@ -313,18 +313,24 @@ impl Worker { client: Arc, telem_instance: Option<&TelemetryInstance>, worker_heartbeat_interval: Option, - shared_namespace_worker: bool, ) -> Result { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); + let worker_telemetry = telem_instance.map(|telem| WorkerTelemetry { + metric_meter: telem.get_metric_meter(), + temporal_metric_meter: telem.get_temporal_metric_meter(), + trace_subscriber: telem.trace_subscriber(), + in_memory_meter: telem.in_memory_metrics(), + }); + Self::new_with_pollers( config, sticky_queue_name, client, TaskPollers::Real, - telem_instance, + worker_telemetry, worker_heartbeat_interval, - shared_namespace_worker, + false, ) } @@ -359,46 +365,18 @@ impl Worker { #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self { - Self::new(config, None, Arc::new(client), None, None, false).unwrap() + Self::new(config, None, Arc::new(client), None, None).unwrap() } pub(crate) fn new_with_pollers( - config: WorkerConfig, - sticky_queue_name: Option, - client: Arc, - task_pollers: TaskPollers, - telem_instance: Option<&TelemetryInstance>, - worker_heartbeat_interval: Option, - shared_namespace_worker: bool, - ) -> Result { - let worker_telemetry = telem_instance.map(|telem| WorkerTelemetry { - metric_meter: telem.get_metric_meter(), - temporal_metric_meter: telem.get_temporal_metric_meter(), - trace_subscriber: telem.trace_subscriber(), - in_memory_meter: telem.in_memory_metrics(), - }); - - Worker::new_with_pollers_inner( - config, - sticky_queue_name, - client, - task_pollers, - worker_telemetry, - worker_heartbeat_interval, - shared_namespace_worker, - ) - } - - pub(crate) fn new_with_pollers_inner( config: WorkerConfig, sticky_queue_name: Option, client: Arc, task_pollers: TaskPollers, worker_telemetry: Option, worker_heartbeat_interval: Option, - shared_namespace_worker: bool, // TODO: is this unnecessary? 
+ shared_namespace_worker: bool, ) -> Result { - // let shared_namespace_worker = in_memory_meter.is_none(); let (metrics, meter) = if let Some(wt) = worker_telemetry.as_ref() { ( MetricsContext::top_level_with_meter( @@ -487,7 +465,6 @@ impl Worker { None } else { let act_metrics = metrics.with_new_attrs([activity_poller()]); - // activity poller let ap = LongPollBuffer::new_activity_task( client.clone(), config.task_queue.clone(), @@ -1073,26 +1050,16 @@ impl WorkerHeartbeatManager { nexus_last_suc_poll_time: Arc>>, status: Arc>, ) -> Self { - let task_queue = config.task_queue.clone(); - let deployment_version = config.computed_deployment_version(); - let deployment_version = - deployment_version.map(|dv| deployment::v1::WorkerDeploymentVersion { - deployment_name: dv.deployment_name, - build_id: dv.build_id, - }); - let telemetry_instance_clone = telemetry_instance.clone(); - let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { - let mut sys = System::new_all(); - sys.refresh_all(); - std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); - sys.refresh_cpu_usage(); - let current_host_cpu_usage: f32 = - sys.cpus().iter().map(|cpu| cpu.cpu_usage()).sum::() / sys.cpus().len() as f32; - let total_mem = sys.total_memory() as f64; - let used_mem = sys.used_memory() as f64; - let current_host_mem_usage = (used_mem / total_mem) as f32; + let deployment_version = config.computed_deployment_version().map(|dv| { + deployment::v1::WorkerDeploymentVersion { + deployment_name: dv.deployment_name, + build_id: dv.build_id, + } + }); + + let (current_host_cpu_usage, current_host_mem_usage) = get_host_data(); let mut worker_heartbeat = WorkerHeartbeat { worker_instance_key: worker_instance_key.to_string(), @@ -1105,8 +1072,8 @@ impl WorkerHeartbeatManager { // Set by SharedNamespaceWorker because it relies on the client process_key: String::new(), }), - task_queue: task_queue.clone(), - deployment_version: deployment_version.clone(), + task_queue: config.task_queue.clone(), + deployment_version, status: (*status.lock()) as i32, start_time: Some(SystemTime::now().into()), @@ -1144,6 +1111,7 @@ impl WorkerHeartbeatManager { in_mem.total_sticky_cache_miss.load(Ordering::Relaxed) as i32; worker_heartbeat.current_sticky_cache_size = in_mem.sticky_cache_size.load(Ordering::Relaxed) as i32; + // TODO: Is this ever not Some()? 
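+ // The poller info below is read from the in-memory `num_pollers` gauge (split by
+ // poller type) plus the last-successful-poll timestamps recorded by each poller;
+ // `is_autoscaling` reflects the configured poller behavior.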
worker_heartbeat.workflow_poller_info = Some(WorkerPollerInfo { current_pollers: in_mem @@ -1153,7 +1121,6 @@ impl WorkerHeartbeatManager { last_successful_poll_time: wf_last_suc_poll_time.lock().map(|time| time.into()), is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), }); - worker_heartbeat.workflow_sticky_poller_info = Some(WorkerPollerInfo { current_pollers: in_mem .num_pollers @@ -1281,7 +1248,7 @@ where Some(WorkerSlotsInfo { current_available_slots: avail, current_used_slots: used, - slot_supplier_kind: SK::kind().to_string(), + slot_supplier_kind: dealer.slot_supplier_kind().to_string(), total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MAX), total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MAX), @@ -1291,6 +1258,19 @@ where }) } +fn get_host_data() -> (f32, f32) { + let mut sys = System::new_all(); + sys.refresh_all(); + std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); + sys.refresh_cpu_usage(); + let current_host_cpu_usage: f32 = + sys.cpus().iter().map(|cpu| cpu.cpu_usage()).sum::() / sys.cpus().len() as f32; + let total_mem = sys.total_memory() as f64; + let used_mem = sys.used_memory() as f64; + let current_host_mem_usage = (used_mem / total_mem) as f32; + (current_host_cpu_usage, current_host_mem_usage) +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/src/worker/tuner/fixed_size.rs b/core/src/worker/tuner/fixed_size.rs index aa737dc8b..e1bf53d6e 100644 --- a/core/src/worker/tuner/fixed_size.rs +++ b/core/src/worker/tuner/fixed_size.rs @@ -50,4 +50,8 @@ where fn available_slots(&self) -> Option { Some(self.sem.available_permits()) } + + fn slot_supplier_kind(&self) -> String { + "Fixed".to_string() + } } diff --git a/core/src/worker/tuner/resource_based.rs b/core/src/worker/tuner/resource_based.rs index 173418413..3dbed0aef 100644 --- a/core/src/worker/tuner/resource_based.rs +++ b/core/src/worker/tuner/resource_based.rs @@ -314,6 +314,10 @@ where } } } + + fn slot_supplier_kind(&self) -> String { + "ResourceBased".to_string() + } } impl ResourceBasedSlotsForType diff --git a/tests/integ_tests/metrics_tests.rs b/tests/integ_tests/metrics_tests.rs index d325e404b..dc7caa812 100644 --- a/tests/integ_tests/metrics_tests.rs +++ b/tests/integ_tests/metrics_tests.rs @@ -22,10 +22,8 @@ use temporal_sdk::{ ActContext, ActivityError, ActivityOptions, CancellableFuture, LocalActivityOptions, NexusOperationOptions, WfContext, }; -use temporal_sdk_core::telemetry::start_prometheus_metric_exporter; use temporal_sdk_core::{ - CoreRuntime, FixedSizeSlotSupplier, RuntimeOptionsBuilder, TokioRuntimeBuilder, TunerBuilder, - init_worker, + CoreRuntime, FixedSizeSlotSupplier, TokioRuntimeBuilder, TunerBuilder, init_worker, telemetry::{WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME, build_otlp_metric_exporter}, }; use temporal_sdk_core_api::{ @@ -1329,99 +1327,3 @@ async fn prometheus_label_nonsense() { assert!(body.contains("some_counter{thing=\"foo\"} 2")); assert!(body.contains("some_counter{blerp=\"baz\"} 2")); } - -// Tests that rely on Prometheus running in a docker container need to start -// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run -#[rstest::rstest] -#[tokio::test] -async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) { - let runtimeopts = RuntimeOptionsBuilder::default() - .telemetry_options(get_integ_telem_options()) - .heartbeat_interval(Some(Duration::from_millis(100))) - .build() - .unwrap(); - let mut rt = 
CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); - match backing { - "otel" => { - let url = Some("grpc://localhost:4317") - .map(|x| x.parse::().unwrap()) - .unwrap(); - let mut opts_build = OtelCollectorOptionsBuilder::default(); - let opts = opts_build.url(url).build().unwrap(); - // If wanna add more options: https://github.com/temporalio/sdk-ruby/blob/143e421d82d16e58bd45226998363d55e4bc3bbb/temporalio/ext/src/runtime.rs#L113C21-L135C22 - - rt.telemetry_mut() - .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); - } - "prom" => { - let mut opts_build = PrometheusExporterOptionsBuilder::default(); - opts_build.socket_addr(ANY_PORT.parse().unwrap()); - let opts = opts_build.build().unwrap(); - rt.telemetry_mut() - .attach_late_init_metrics(start_prometheus_metric_exporter(opts).unwrap().meter); - } - _ => unreachable!(), - } - let wf_name = "runtime_new_otel"; - let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); - starter - .worker_config - .max_outstanding_workflow_tasks(5_usize) - .max_cached_workflows(5_usize) - .max_outstanding_activities(5_usize); - let mut worker = starter.worker().await; - let worker_instance_key = worker.worker_instance_key(); - - // Run a workflow - worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - ctx.activity(ActivityOptions { - activity_type: "pass_fail_act".to_string(), - input: "pass".as_json_payload().expect("serializes fine"), - start_to_close_timeout: Some(Duration::from_secs(1)), - ..Default::default() - }) - .await; - Ok(().into()) - }); - worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { - println!("STARTING ACTIVITY"); - Ok(i) - }); - - starter.start_with_worker(wf_name, &mut worker).await; - - // for i in 1..5 { - // worker.submit_wf( - // format!("{wf_name}-{i}"), - // wf_name, - // vec![], - // starter.workflow_options.clone(), - // ) - // .await - // .unwrap(); - // } - worker.run_until_done().await.unwrap(); - - // TODO: clone_no_worker() for new worker - - // TODO: ListWorkers - let client = starter.get_client().await; - let workers_list = client - .list_workers(100, Vec::new(), String::new()) - .await - .unwrap(); - // println!("workers_list: {workers_list:#?}"); - - // TODO: need to find worker with matching worker_heartbeat - let worker_info = workers_list.workers_info.iter().find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }); - println!("worker_instance_key {worker_instance_key:?}"); - println!("worker_info: {worker_info:#?}"); - - // TODO: add some asserts to ensure data shows up -} diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs new file mode 100644 index 000000000..3c1999212 --- /dev/null +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -0,0 +1,267 @@ +use crate::common::{ANY_PORT, CoreWfStarter, get_integ_telem_options}; +use prost_types::Duration as PbDuration; +use prost_types::Timestamp; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use temporal_client::WorkflowClientTrait; +use temporal_sdk::{ActContext, ActivityOptions, WfContext}; +use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; +use temporal_sdk_core::{CoreRuntime, RuntimeOptionsBuilder}; +use temporal_sdk_core_api::telemetry::{ + OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, +}; +use 
temporal_sdk_core_protos::coresdk::AsJsonPayloadExt; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use tokio::sync::Semaphore; +use url::Url; + +fn within_two_minutes_ts(ts: Timestamp) -> bool { + let ts_time = UNIX_EPOCH + Duration::new(ts.seconds as u64, ts.nanos as u32); + + let now = SystemTime::now(); + // ts should be at most 2 minutes before the current time + now.duration_since(ts_time).unwrap() <= Duration::from_secs(2 * 60) +} + +fn within_duration(dur: PbDuration, threshold: Duration) -> bool { + let std_dur = Duration::new(dur.seconds as u64, dur.nanos as u32); + std_dur <= threshold +} + +// Tests that rely on Prometheus running in a docker container need to start +// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run +#[rstest::rstest] +#[tokio::test] +async fn docker_worker_heartbeat_basic( + #[values( + "otel", + // "prom" +)] + backing: &str, +) { + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let mut rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + match backing { + "otel" => { + let url = Some("grpc://localhost:4317") + .map(|x| x.parse::().unwrap()) + .unwrap(); + let mut opts_build = OtelCollectorOptionsBuilder::default(); + let opts = opts_build.url(url).build().unwrap(); + // If wanna add more options: https://github.com/temporalio/sdk-ruby/blob/143e421d82d16e58bd45226998363d55e4bc3bbb/temporalio/ext/src/runtime.rs#L113C21-L135C22 + + rt.telemetry_mut() + .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); + } + "prom" => { + let mut opts_build = PrometheusExporterOptionsBuilder::default(); + opts_build.socket_addr(ANY_PORT.parse().unwrap()); + let opts = opts_build.build().unwrap(); + rt.telemetry_mut() + .attach_late_init_metrics(start_prometheus_metric_exporter(opts).unwrap().meter); + } + _ => unreachable!(), + } + let wf_name = format!("worker_heartbeat_basic_{backing}"); + static ACTS_STARTED: Semaphore = Semaphore::const_new(0); + static ACTS_DONE: Semaphore = Semaphore::const_new(0); + let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); + starter + .worker_config + .max_outstanding_workflow_tasks(5_usize) + .max_cached_workflows(5_usize) + .max_outstanding_activities(5_usize); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + // Run a workflow + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + println!("asdfasdfasdfasdfasdfasdf"); + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + println!("pass_fail_act"); + ACTS_STARTED.add_permits(1); + let _ = ACTS_DONE.acquire().await.unwrap(); + Ok(i) + }); + + starter + .start_with_worker(wf_name.clone(), &mut worker) + .await; + + let test_fut = async { + let _ = ACTS_STARTED.acquire().await.unwrap(); + let client = starter.get_client().await; + let workers_list = client + .list_workers(100, Vec::new(), String::new()) + .await + .unwrap(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == 
worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + println!("1231231231231: {heartbeat:#?}"); + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert_eq!(workflow_poller_info.current_pollers, 1); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert_eq!(sticky_poller_info.current_pollers, 3); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert_eq!(nexus_poller_info.current_pollers, 0); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert_eq!(activity_poller_info.current_pollers, 4); + assert_eq!(heartbeat.current_sticky_cache_size, 1); + ACTS_DONE.add_permits(1); + }; + + // for i in 1..5 { + // worker.submit_wf( + // format!("{wf_name}-{i}"), + // wf_name, + // vec![], + // starter.workflow_options.clone(), + // ) + // .await + // .unwrap(); + // } + + let runner = async move { + worker.run_until_done().await.unwrap(); + }; + tokio::join!(test_fut, runner); + + // TODO: clone_no_worker() for new worker + let client = starter.get_client().await; + let workers_list = client + .list_workers(100, Vec::new(), String::new()) + .await + .unwrap(); + // Since list_workers finds all workers in the namespace, must find specific worker used in this + // test + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + println!("heartbeat: {heartbeat:#?}"); + assert!(heartbeat.task_queue.starts_with(&wf_name)); + assert_eq!(heartbeat.worker_identity, "integ_tester"); + assert_eq!(heartbeat.sdk_name, "temporal-core"); + assert_eq!(heartbeat.sdk_version, "0.1.0"); + assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); + assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); + assert!(within_duration( + heartbeat.elapsed_since_last_heartbeat.unwrap(), + Duration::from_secs(1) + )); + + // TODO: + // workflow_task_slots_info: Some( + // WorkerSlotsInfo { + // current_available_slots: 5, + // current_used_slots: 0, + // slot_supplier_kind: "Workflow", + // total_processed_tasks: 2, + // total_failed_tasks: 0, + // last_interval_processed_tasks: 0, + // last_interval_failure_tasks: 0, + // }, + // ), + // activity_task_slots_info: None, + // nexus_task_slots_info: None, + // local_activity_slots_info: None, + + // workflow_poller_info: Some( + // WorkerPollerInfo { + // current_pollers: 0, + // last_successful_poll_time: Some( + // Timestamp { + // seconds: 1759209070, + // nanos: 909377000, + // }, + // ), + // is_autoscaling: false, + // }, + // ), + // workflow_sticky_poller_info: Some( + // WorkerPollerInfo { + // current_pollers: 0, + // last_successful_poll_time: Some( + // Timestamp { + // seconds: 1759209072, + // nanos: 281956000, + // }, + // ), + // is_autoscaling: false, + // }, + // ), + // activity_poller_info: Some( + // WorkerPollerInfo { + // current_pollers: 0, + // last_successful_poll_time: Some( + // Timestamp { + // seconds: 1759209071, + // nanos: 737598000, + // }, + // ), + // is_autoscaling: false, + // }, + // ), + // nexus_poller_info: Some( + // WorkerPollerInfo { + // current_pollers: 0, + // last_successful_poll_time: None, + // is_autoscaling: false, + // }, 
+ // ), + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert!(!workflow_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + workflow_poller_info.last_successful_poll_time.unwrap() + )); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert!(!sticky_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + sticky_poller_info.last_successful_poll_time.unwrap() + )); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert!(!nexus_poller_info.is_autoscaling); + assert!(nexus_poller_info.last_successful_poll_time.is_none()); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert!(!activity_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + activity_poller_info.last_successful_poll_time.unwrap() + )); + + assert_eq!(heartbeat.total_sticky_cache_hit, 2); + // total_sticky_cache_miss: 0, + // current_sticky_cache_size: 0, + // plugins: [], +} diff --git a/tests/main.rs b/tests/main.rs index 8a71f03ca..d48a68bd5 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -22,6 +22,7 @@ mod integ_tests { mod queries_tests; mod update_tests; mod visibility_tests; + mod worker_heartbeat_tests; mod worker_tests; mod worker_versioning_tests; mod workflow_tests; diff --git a/tests/workflow_replay_bench.rs b/tests/workflow_replay_bench.rs index 323559bab..d80796b0a 100644 --- a/tests/workflow_replay_bench.rs +++ b/tests/workflow_replay_bench.rs @@ -16,9 +16,7 @@ use std::{ time::Duration, }; use temporal_sdk::{WfContext, WorkflowFunction}; -use temporal_sdk_core::{ - CoreRuntime, replay::HistoryForReplay, -}; +use temporal_sdk_core::{CoreRuntime, replay::HistoryForReplay}; use temporal_sdk_core_api::telemetry::metrics::{ MetricKeyValue, MetricParametersBuilder, NewAttributes, }; From d314a354213b984624e67766f313f2de0f3d686b Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 1 Oct 2025 11:31:50 -0700 Subject: [PATCH 03/23] CounterImpl, final_heartbeat, more specific metric label dbg_panic msg, counter_with_in_mem and and_then() --- core-api/src/telemetry/metrics.rs | 23 ++++++---- core/src/telemetry/metrics.rs | 70 ++++++++++++++----------------- core/src/worker/client.rs | 10 ++--- core/src/worker/mod.rs | 4 +- 4 files changed, 55 insertions(+), 52 deletions(-) diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 8a8e44400..26f41b256 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -90,13 +90,20 @@ impl HeartbeatMetricType { metric.fetch_add(delta, Ordering::Relaxed); } HeartbeatMetricType::WithLabel(_) => { - dbg_panic!("Only gauge should support in-memory metric with labels"); + dbg_panic!("Counter does not support in-memory metric with labels"); } } } fn record_histogram_observation(&self) { - self.record_counter(1); + match self { + HeartbeatMetricType::Individual(metric) => { + metric.fetch_add(1, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel(_) => { + dbg_panic!("Histogram does not support in-memory metric with labels"); + } + } } fn record_gauge(&self, value: u64, attributes: &MetricAttributes) { @@ -408,14 +415,14 @@ pub trait CounterBase: Send + Sync { fn adds(&self, value: u64); } -pub type CounterType = LazyBoundMetric< +pub type CounterImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; #[derive(Clone)] pub struct Counter { - primary: CounterType, + primary: CounterImpl, in_memory: Option, } impl Counter { @@ -612,14 +619,14 @@ pub trait HistogramDurationBase: Send + Sync { fn 
records(&self, value: Duration); } -pub type HistogramDurationType = LazyBoundMetric< +pub type HistogramDurationImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; #[derive(Clone)] pub struct HistogramDuration { - primary: HistogramDurationType, + primary: HistogramDurationImpl, in_memory: Option, } impl HistogramDuration { @@ -708,14 +715,14 @@ pub trait GaugeBase: Send + Sync { fn records(&self, value: u64); } -pub type GaugeType = LazyBoundMetric< +pub type GaugeImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; #[derive(Clone)] pub struct Gauge { - primary: GaugeType, + primary: GaugeImpl, in_memory: Option, } impl Gauge { diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index 1a3bb47dc..4726fe89d 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -307,34 +307,28 @@ impl MetricsContext { impl Instruments { fn new(meter: &dyn CoreMeter, in_memory: Option>) -> Self { - let create_counter = |params: MetricParameters| -> Counter { - if let Some(in_mem) = in_memory.clone() - && let Some(metric) = in_mem.get_metric(¶ms.name) - { - meter.counter_with_in_memory(params, metric) - } else { - meter.counter(params) - } + let counter_with_in_mem = |params: MetricParameters| -> Counter { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.counter_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.counter(params)) }; - let create_gauge = |params: MetricParameters| -> Gauge { - if let Some(in_mem) = in_memory.clone() - && let Some(metric) = in_mem.get_metric(¶ms.name) - { - meter.gauge_with_in_memory(params, metric) - } else { - meter.gauge(params) - } + let gauge_with_in_mem = |params: MetricParameters| -> Gauge { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.gauge_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.gauge(params)) }; - let create_histogram_duration = |params: MetricParameters| -> HistogramDuration { - if let Some(in_mem) = in_memory.clone() - && let Some(metric) = in_mem.get_metric(¶ms.name) - { - meter.histogram_duration_with_in_memory(params, metric) - } else { - meter.histogram_duration(params) - } + let histogram_with_in_mem = |params: MetricParameters| -> HistogramDuration { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.histogram_duration_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.histogram_duration(params)) }; Self { @@ -368,12 +362,12 @@ impl Instruments { description: "Count of workflow task queue poll timeouts (no new task)".into(), unit: "".into(), }), - wf_task_queue_poll_succeed_counter: create_counter(MetricParameters { + wf_task_queue_poll_succeed_counter: counter_with_in_mem(MetricParameters { name: "workflow_task_queue_poll_succeed".into(), description: "Count of workflow task queue poll successes".into(), unit: "".into(), }), - wf_task_execution_failure_counter: create_counter(MetricParameters { + wf_task_execution_failure_counter: counter_with_in_mem(MetricParameters { name: "workflow_task_execution_failed".into(), description: "Count of workflow task execution failures".into(), unit: "".into(), @@ -388,7 +382,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of workflow task replay latencies".into(), }), - wf_task_execution_latency: create_histogram_duration(MetricParameters { + wf_task_execution_latency: histogram_with_in_mem(MetricParameters { name: 
WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of workflow task execution (not replay) latencies".into(), @@ -398,12 +392,12 @@ impl Instruments { description: "Count of activity task queue poll timeouts (no new task)".into(), unit: "".into(), }), - act_task_received_counter: create_counter(MetricParameters { + act_task_received_counter: counter_with_in_mem(MetricParameters { name: "activity_task_received".into(), description: "Count of activity task queue poll successes".into(), unit: "".into(), }), - act_execution_failed: create_counter(MetricParameters { + act_execution_failed: counter_with_in_mem(MetricParameters { name: "activity_execution_failed".into(), description: "Count of activity task execution failures".into(), unit: "".into(), @@ -413,7 +407,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of activity schedule-to-start latencies".into(), }), - act_exec_latency: create_histogram_duration(MetricParameters { + act_exec_latency: histogram_with_in_mem(MetricParameters { name: ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of activity execution latencies".into(), @@ -434,7 +428,7 @@ impl Instruments { description: "Count of local activity executions that failed".into(), unit: "".into(), }), - la_exec_latency: create_histogram_duration(MetricParameters { + la_exec_latency: histogram_with_in_mem(MetricParameters { name: "local_activity_execution_latency".into(), unit: "duration".into(), description: "Histogram of local activity execution latencies".into(), @@ -446,7 +440,7 @@ impl Instruments { "Histogram of local activity execution latencies for successful local activities" .into(), }), - la_total: create_counter(MetricParameters { + la_total: counter_with_in_mem(MetricParameters { name: "local_activity_total".into(), description: "Count of local activities executed".into(), unit: "".into(), @@ -466,12 +460,12 @@ impl Instruments { unit: "duration".into(), description: "Histogram of nexus task end-to-end latencies".into(), }), - nexus_task_execution_latency: create_histogram_duration(MetricParameters { + nexus_task_execution_latency: histogram_with_in_mem(MetricParameters { name: "nexus_task_execution_latency".into(), unit: "duration".into(), description: "Histogram of nexus task execution latencies".into(), }), - nexus_task_execution_failed: create_counter(MetricParameters { + nexus_task_execution_failed: counter_with_in_mem(MetricParameters { name: "nexus_task_execution_failed".into(), description: "Count of nexus task execution failures".into(), unit: "".into(), @@ -482,7 +476,7 @@ impl Instruments { description: "Count of the number of initialized workers".into(), unit: "".into(), }), - num_pollers: create_gauge(MetricParameters { + num_pollers: gauge_with_in_mem(MetricParameters { name: NUM_POLLERS_NAME.into(), description: "Current number of active pollers per queue type".into(), unit: "".into(), @@ -497,19 +491,19 @@ impl Instruments { description: "Current number of used slots per task type".into(), unit: "".into(), }), - sticky_cache_hit: create_counter(MetricParameters { + sticky_cache_hit: counter_with_in_mem(MetricParameters { name: "sticky_cache_hit".into(), description: "Count of times the workflow cache was used for a new workflow task" .into(), unit: "".into(), }), - sticky_cache_miss: create_counter(MetricParameters { + sticky_cache_miss: counter_with_in_mem(MetricParameters { name: "sticky_cache_miss".into(), description: "Count of times 
the workflow cache was missing a workflow for a sticky task".into(), unit: "".into(), }), - sticky_cache_size: create_gauge(MetricParameters { + sticky_cache_size: gauge_with_in_mem(MetricParameters { name: STICKY_CACHE_SIZE_NAME.into(), description: "Current number of cached workflows".into(), unit: "".into(), diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 74b184887..65250fc58 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -221,7 +221,7 @@ pub trait WorkerClient: Sync + Send { async fn shutdown_worker( &self, sticky_task_queue: String, - worker_heartbeat: Option, + final_heartbeat: Option, ) -> Result; /// Record a worker heartbeat async fn record_worker_heartbeat( @@ -657,10 +657,10 @@ impl WorkerClient for WorkerClientBag { async fn shutdown_worker( &self, sticky_task_queue: String, - worker_heartbeat: Option, + final_heartbeat: Option, ) -> Result { - let mut worker_heartbeat = worker_heartbeat; - if let Some(w) = worker_heartbeat.as_mut() { + let mut final_heartbeat = final_heartbeat; + if let Some(w) = final_heartbeat.as_mut() { w.status = WorkerStatus::Shutdown.into(); self.set_heartbeat_client_fields(w); } @@ -669,7 +669,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), sticky_task_queue, reason: "graceful shutdown".to_string(), - worker_heartbeat, + worker_heartbeat: final_heartbeat, }; Ok( diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index c2e35ad7a..a44458488 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -262,7 +262,9 @@ impl WorkerTrait for Worker { ); } self.shutdown_token.cancel(); - *self.status.lock() = WorkerStatus::ShuttingDown; + { + *self.status.lock() = WorkerStatus::ShuttingDown; + } // First, unregister worker from the client if !self.client_worker_registrator.shared_namespace_worker { let _res = self From b00b80e016658d8cc70e98d97b86d09fd2f0da2a Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 1 Oct 2025 16:42:36 -0700 Subject: [PATCH 04/23] Support in-mem metrics when metrics aren't configured --- core-api/src/telemetry/metrics.rs | 113 +++++++++++++++++++++---- core-c-bridge/src/metric.rs | 28 +++--- core/src/telemetry/metrics.rs | 35 ++++---- core/src/telemetry/mod.rs | 8 -- core/src/telemetry/otel.rs | 6 -- core/src/telemetry/prometheus_meter.rs | 56 ------------ core/src/worker/client.rs | 2 + core/src/worker/mod.rs | 11 +-- sdk/Cargo.toml | 1 - 9 files changed, 135 insertions(+), 125 deletions(-) diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 26f41b256..30ed6a7f3 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -70,8 +70,6 @@ pub trait CoreMeter: Send + Sync + Debug { } fn gauge_f64(&self, params: MetricParameters) -> GaugeF64; - - fn in_memory_metrics(&self) -> Arc; } /// Provides a generic way to record metrics in memory. 
@@ -130,6 +128,10 @@ fn label_value_from_attributes(attributes: &MetricAttributes, key: &str) -> Opti .iter() .find(|kv| kv.key.as_str() == key) .map(|kv| kv.value.to_string()), + MetricAttributes::Buffer(buffer_attrs) => buffer_attrs.get().label_value(key), + MetricAttributes::Dynamic(custom_metrics_attribute) => { + custom_metrics_attribute.label_value(key) + } _ => None, } } @@ -291,10 +293,6 @@ impl CoreMeter for Arc { fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { self.as_ref().gauge_f64(params) } - - fn in_memory_metrics(&self) -> Arc { - self.as_ref().in_memory_metrics() - } } /// Attributes which are provided every time a call to record a specific metric is made. @@ -319,6 +317,11 @@ pub trait CustomMetricAttributes: Debug + Send + Sync { /// Must be implemented to work around existing type system restrictions, see /// [here](https://internals.rust-lang.org/t/downcast-not-from-any-but-from-any-trait/16736/12) fn as_any(self: Arc) -> Arc; + + /// Return the stringified value for a label key, if available. + fn label_value(&self, _key: &str) -> Option { + None + } } /// Options that are attached to metrics on a per-call basis @@ -943,12 +946,32 @@ impl LazyRef { #[derive(Debug)] pub struct NoOpCoreMeter; impl CoreMeter for NoOpCoreMeter { - fn new_attributes(&self, _: NewAttributes) -> MetricAttributes { - MetricAttributes::Dynamic(Arc::new(NoOpAttributes)) + fn new_attributes(&self, attribs: NewAttributes) -> MetricAttributes { + MetricAttributes::Dynamic(Arc::new(InMemoryMetricAttributes::from_new_attributes( + attribs.attributes, + ))) } - fn extend_attributes(&self, existing: MetricAttributes, _: NewAttributes) -> MetricAttributes { - existing + fn extend_attributes( + &self, + existing: MetricAttributes, + attribs: NewAttributes, + ) -> MetricAttributes { + let new_attrs = InMemoryMetricAttributes::from_new_attributes(attribs.attributes); + let merged = if let MetricAttributes::Dynamic(existing_attrs) = existing { + if let Ok(in_mem) = existing_attrs + .clone() + .as_any() + .downcast::() + { + in_mem.merge(&new_attrs) + } else { + new_attrs + } + } else { + new_attrs + }; + MetricAttributes::Dynamic(Arc::new(merged)) } fn counter(&self, _: MetricParameters) -> Counter { @@ -974,10 +997,6 @@ impl CoreMeter for NoOpCoreMeter { fn gauge_f64(&self, _: MetricParameters) -> GaugeF64 { GaugeF64::new(Arc::new(NoOpInstrument)) } - - fn in_memory_metrics(&self) -> Arc { - Arc::new(WorkerHeartbeatMetrics::default()) - } } macro_rules! 
impl_metric_attributable { @@ -1015,12 +1034,72 @@ impl_no_op!(HistogramDurationBase, Duration); impl_no_op!(GaugeBase, u64); impl_no_op!(GaugeF64Base, f64); -#[derive(Debug, Clone)] -pub struct NoOpAttributes; -impl CustomMetricAttributes for NoOpAttributes { +#[derive(Debug, Clone, Default)] +struct InMemoryMetricAttributes { + labels: HashMap, +} + +impl InMemoryMetricAttributes { + fn from_new_attributes(attributes: Vec) -> Self { + let mut labels = HashMap::new(); + for kv in attributes { + labels.insert(kv.key, kv.value.to_string()); + } + Self { labels } + } + + fn merge(&self, other: &InMemoryMetricAttributes) -> Self { + let mut labels = self.labels.clone(); + for (key, value) in &other.labels { + labels.insert(key.clone(), value.clone()); + } + Self { labels } + } +} + +impl CustomMetricAttributes for InMemoryMetricAttributes { fn as_any(self: Arc) -> Arc { self as Arc } + + fn label_value(&self, key: &str) -> Option { + self.labels.get(key).cloned() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{ + collections::HashMap, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, + }; + + #[test] + fn in_memory_attributes_provide_label_values() { + let meter = NoOpCoreMeter; + let base_attrs = meter.new_attributes(NewAttributes::default()); + let attrs = meter.extend_attributes( + base_attrs, + NewAttributes::from(vec![MetricKeyValue::new("poller_type", "workflow_task")]), + ); + + let value = Arc::new(AtomicU64::new(0)); + let mut metrics = HashMap::new(); + metrics.insert("workflow_task".to_string(), value.clone()); + let heartbeat_metric = HeartbeatMetricType::WithLabel(metrics); + + heartbeat_metric.record_gauge(3, &attrs); + + assert_eq!(value.load(Ordering::Relaxed), 3); + assert_eq!( + label_value_from_attributes(&attrs, "poller_type").as_deref(), + Some("workflow_task") + ); + } } #[cfg(feature = "otel_impls")] diff --git a/core-c-bridge/src/metric.rs b/core-c-bridge/src/metric.rs index 576ab31d7..3f27b2e71 100644 --- a/core-c-bridge/src/metric.rs +++ b/core-c-bridge/src/metric.rs @@ -1,5 +1,5 @@ use crate::{ByteArrayRef, runtime::Runtime}; -use std::{any::Any, error::Error, sync::Arc, time::Duration}; +use std::{any::Any, collections::HashMap, error::Error, sync::Arc, time::Duration}; use temporal_sdk_core_api::telemetry::metrics; use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; use tracing::error; @@ -367,11 +367,6 @@ impl metrics::CoreMeter for CustomMetricMeterRef { fn gauge_f64(&self, params: metrics::MetricParameters) -> metrics::GaugeF64 { metrics::GaugeF64::new(Arc::new(self.new_metric(params, MetricKind::GaugeFloat))) } - - fn in_memory_metrics(&self) -> Arc { - error!("in_memory_metrics() is not supported for CustomMetricMeterRef"); - Arc::new(WorkerHeartbeatMetrics::default()) - } } impl CustomMetricMeterRef { @@ -388,16 +383,21 @@ impl CustomMetricMeterRef { ) -> metrics::MetricAttributes { unsafe { let meter = &*(self.meter_impl.0); - let append_from = match append_from { + let (append_from, mut label_cache) = match append_from { Some(metrics::MetricAttributes::Dynamic(v)) => { - v.clone() + let existing = v + .clone() .as_any() .downcast::() - .expect("Attributes not CustomMetricAttributes as expected") - .attributes + .expect("Attributes not CustomMetricAttributes as expected"); + (existing.attributes, existing.labels.as_ref().clone()) } - _ => std::ptr::null(), + _ => (std::ptr::null(), HashMap::new()), }; + for kv in &attribs.attributes { + label_cache.insert(kv.key.clone(), kv.value.to_string()); + } + let label_cache = 
Arc::new(label_cache); // Build a set of CustomMetricAttributes with _references_ to the // pieces in attribs. We count on both this vec and the attribs vec // living beyond the callback invocation. @@ -442,6 +442,7 @@ impl CustomMetricMeterRef { metrics::MetricAttributes::Dynamic(Arc::new(CustomMetricAttributes { meter_impl: self.meter_impl.clone(), attributes: raw_attrs, + labels: label_cache, })) } } @@ -486,6 +487,7 @@ impl Drop for CustomMetricMeterImpl { struct CustomMetricAttributes { meter_impl: Arc, attributes: *const libc::c_void, + labels: Arc>, } unsafe impl Send for CustomMetricAttributes {} @@ -495,6 +497,10 @@ impl metrics::CustomMetricAttributes for CustomMetricAttributes { fn as_any(self: Arc) -> Arc { self as Arc } + + fn label_value(&self, key: &str) -> Option { + self.labels.get(key).cloned() + } } impl Drop for CustomMetricAttributes { diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index 4726fe89d..e3527c7c2 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -25,6 +25,7 @@ pub(crate) struct MetricsContext { meter: Arc, kvs: MetricAttributes, instruments: Arc, + in_memory_metrics: Option>, } #[derive(Clone)] @@ -71,29 +72,25 @@ impl MetricsContext { pub(crate) fn no_op() -> Self { let meter = Arc::new(NoOpCoreMeter); let kvs = meter.new_attributes(Default::default()); - let instruments = Arc::new(Instruments::new(meter.as_ref(), None)); + let in_memory_metrics = Some(Arc::new(WorkerHeartbeatMetrics::default())); + let instruments = Arc::new(Instruments::new(meter.as_ref(), in_memory_metrics.clone())); Self { kvs, instruments, meter, + in_memory_metrics, } } #[cfg(test)] pub(crate) fn top_level(namespace: String, tq: String, telemetry: &TelemetryInstance) -> Self { - MetricsContext::top_level_with_meter( - namespace, - tq, - telemetry.get_temporal_metric_meter(), - telemetry.in_memory_metrics(), - ) + MetricsContext::top_level_with_meter(namespace, tq, telemetry.get_temporal_metric_meter()) } pub(crate) fn top_level_with_meter( namespace: String, tq: String, temporal_meter: Option, - in_memory_meter: Option>, ) -> Self { if let Some(mut meter) = temporal_meter { meter @@ -102,12 +99,14 @@ impl MetricsContext { .push(MetricKeyValue::new(KEY_NAMESPACE, namespace)); meter.default_attribs.attributes.push(task_queue(tq)); let kvs = meter.inner.new_attributes(meter.default_attribs); - let mut instruments = Instruments::new(meter.inner.as_ref(), in_memory_meter); + let in_memory_metrics = Some(Arc::new(WorkerHeartbeatMetrics::default())); + let mut instruments = Instruments::new(meter.inner.as_ref(), in_memory_metrics.clone()); instruments.update_attributes(&kvs); Self { kvs, instruments: Arc::new(instruments), meter: meter.inner, + in_memory_metrics, } } else { Self::no_op() @@ -128,9 +127,14 @@ impl MetricsContext { instruments: Arc::new(instruments), kvs, meter: self.meter.clone(), + in_memory_metrics: self.in_memory_metrics.clone(), } } + pub(crate) fn in_memory_meter(&self) -> Option> { + self.in_memory_metrics.clone() + } + /// A workflow task queue poll succeeded pub(crate) fn wf_tq_poll_ok(&self) { self.instruments.wf_task_queue_poll_succeed_counter.adds(1); @@ -878,11 +882,6 @@ where fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { GaugeF64::new(Arc::new(self.new_instrument(params, MetricKind::Gauge))) } - - fn in_memory_metrics(&self) -> Arc { - error!("in_memory_metrics() is not supported for MetricsCallBuffer"); - Arc::new(WorkerHeartbeatMetrics::default()) - } } impl MetricCallBufferer for 
MetricsCallBuffer where @@ -1112,10 +1111,6 @@ impl CoreMeter for PrefixedMetricsMeter { params.name = (self.prefix.clone() + &*params.name).into(); self.meter.gauge_f64(params) } - - fn in_memory_metrics(&self) -> Arc { - self.meter.in_memory_metrics() - } } #[cfg(test)] @@ -1134,6 +1129,10 @@ mod tests { fn as_any(self: Arc) -> Arc { self as Arc } + + fn label_value(&self, _key: &str) -> Option { + None + } } impl DummyCustomAttrs { fn as_id(ba: &BufferAttributes) -> usize { diff --git a/core/src/telemetry/mod.rs b/core/src/telemetry/mod.rs index 6ae0b2137..94457f697 100644 --- a/core/src/telemetry/mod.rs +++ b/core/src/telemetry/mod.rs @@ -64,7 +64,6 @@ pub struct TelemetryInstance { metric_prefix: String, logs_out: Option>, metrics: Option>, - in_memory_metrics: Option>, /// The tracing subscriber which is associated with this telemetry instance. May be `None` if /// the user has not opted into any tracing configuration. trace_subscriber: Option>, @@ -85,7 +84,6 @@ impl TelemetryInstance { metrics, trace_subscriber, attach_service_name, - in_memory_metrics: None, } } @@ -99,7 +97,6 @@ impl TelemetryInstance { /// Some metric meters cannot be initialized until after a tokio runtime has started and after /// other telemetry has initted (ex: prometheus). They can be attached here. pub fn attach_late_init_metrics(&mut self, meter: Arc) { - self.in_memory_metrics = Some(meter.in_memory_metrics().clone()); self.metrics = Some(meter); } @@ -134,11 +131,6 @@ impl TelemetryInstance { vec![] } } - - /// Returns all in memory metrics, used for worker heartbeating. - pub fn in_memory_metrics(&self) -> Option> { - self.in_memory_metrics.clone() - } } thread_local! { diff --git a/core/src/telemetry/otel.rs b/core/src/telemetry/otel.rs index 8a2bdb988..46234b444 100644 --- a/core/src/telemetry/otel.rs +++ b/core/src/telemetry/otel.rs @@ -161,7 +161,6 @@ pub fn build_otlp_metric_exporter( meter: mp.meter(TELEM_SERVICE_NAME), use_seconds_for_durations: opts.use_seconds_for_durations, _mp: mp, - in_memory_metrics: Arc::new(WorkerHeartbeatMetrics::default()), }) } @@ -172,7 +171,6 @@ pub struct CoreOtelMeter { // we have to hold on to the provider otherwise otel automatically shuts it down on drop // for whatever crazy reason _mp: SdkMeterProvider, - pub in_memory_metrics: Arc, } impl CoreMeter for CoreOtelMeter { @@ -243,10 +241,6 @@ impl CoreMeter for CoreOtelMeter { .build(), )) } - - fn in_memory_metrics(&self) -> Arc { - self.in_memory_metrics.clone() - } } impl CoreOtelMeter { diff --git a/core/src/telemetry/prometheus_meter.rs b/core/src/telemetry/prometheus_meter.rs index 859eecc8f..f825c9922 100644 --- a/core/src/telemetry/prometheus_meter.rs +++ b/core/src/telemetry/prometheus_meter.rs @@ -461,7 +461,6 @@ pub struct CorePrometheusMeter { use_seconds_for_durations: bool, unit_suffix: bool, bucket_overrides: temporal_sdk_core_api::telemetry::HistogramBucketOverrides, - pub in_memory_metrics: Arc, } impl CorePrometheusMeter { @@ -476,7 +475,6 @@ impl CorePrometheusMeter { use_seconds_for_durations, unit_suffix, bucket_overrides, - in_memory_metrics: Arc::new(WorkerHeartbeatMetrics::default()), } } @@ -543,20 +541,6 @@ impl CoreMeter for CorePrometheusMeter { ))) } - fn counter_with_in_memory( - &self, - params: MetricParameters, - in_memory_counter: HeartbeatMetricType, - ) -> Counter { - let metric_name = params.name.to_string(); - let counter = Arc::new(PromMetric::::new( - metric_name, - params.description.to_string(), - self.registry.clone(), - )); - Counter::new_with_in_memory(counter, 
in_memory_counter) - } - fn histogram(&self, params: MetricParameters) -> Histogram { let hist = self.create_u64_hist(¶ms); Histogram::new(Arc::new(hist)) @@ -577,28 +561,6 @@ impl CoreMeter for CorePrometheusMeter { })) } - fn histogram_duration_with_in_memory( - &self, - mut params: MetricParameters, - in_memory_hist: HeartbeatMetricType, - ) -> HistogramDuration { - if self.use_seconds_for_durations { - params.unit = "seconds".into(); - HistogramDuration::new_with_in_memory( - Arc::new(DurationHistogram::Seconds(self.create_f64_hist(¶ms))), - in_memory_hist, - ) - } else { - params.unit = "milliseconds".into(); - HistogramDuration::new_with_in_memory( - Arc::new(DurationHistogram::Milliseconds( - self.create_u64_hist(¶ms), - )), - in_memory_hist, - ) - } - } - fn gauge(&self, params: MetricParameters) -> Gauge { let metric_name = params.name.to_string(); Gauge::new(Arc::new(PromMetric::::new( @@ -608,20 +570,6 @@ impl CoreMeter for CorePrometheusMeter { ))) } - fn gauge_with_in_memory( - &self, - params: MetricParameters, - in_memory_metrics: HeartbeatMetricType, - ) -> Gauge { - let metric_name = params.name.to_string(); - let gauge = Arc::new(PromMetric::::new( - metric_name, - params.description.to_string(), - self.registry.clone(), - )); - Gauge::new_with_in_memory(gauge, in_memory_metrics) - } - fn gauge_f64(&self, params: MetricParameters) -> GaugeF64 { let metric_name = params.name.to_string(); GaugeF64::new(Arc::new(PromMetric::::new( @@ -630,10 +578,6 @@ impl CoreMeter for CorePrometheusMeter { self.registry.clone(), ))) } - - fn in_memory_metrics(&self) -> Arc { - self.in_memory_metrics.clone() - } } impl CorePrometheusMeter { diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 65250fc58..c522d5331 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -664,6 +664,7 @@ impl WorkerClient for WorkerClientBag { w.status = WorkerStatus::Shutdown.into(); self.set_heartbeat_client_fields(w); } + println!("AAA shutdown worker heartbeat: {final_heartbeat:#?}"); let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), identity: self.identity.clone(), @@ -689,6 +690,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_heartbeat, }; + println!("AAA record_worker_heartbeat: {request:#?}"); Ok(self .cloned_client() .record_worker_heartbeat(request) diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index a44458488..408bb3343 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -164,7 +164,6 @@ pub(crate) struct WorkerTelemetry { metric_meter: Option, temporal_metric_meter: Option, trace_subscriber: Option>, - in_memory_meter: Option>, } #[async_trait::async_trait] @@ -322,7 +321,6 @@ impl Worker { metric_meter: telem.get_metric_meter(), temporal_metric_meter: telem.get_temporal_metric_meter(), trace_subscriber: telem.trace_subscriber(), - in_memory_meter: telem.in_memory_metrics(), }); Self::new_with_pollers( @@ -385,7 +383,6 @@ impl Worker { config.namespace.clone(), config.task_queue.clone(), wt.temporal_metric_meter.clone(), - wt.in_memory_meter.clone(), ), wt.metric_meter.clone(), ) @@ -582,6 +579,7 @@ impl Worker { worker_instance_key, hb_interval, worker_telemetry.clone(), + metrics.in_memory_meter(), wft_slots.clone(), act_slots, nexus_slots, @@ -1042,6 +1040,7 @@ impl WorkerHeartbeatManager { worker_instance_key: Uuid, heartbeat_interval: Duration, telemetry_instance: Option, + in_mem_metrics: Option>, wft_slots: MeteredPermitDealer, act_slots: MeteredPermitDealer, 
nexus_slots: MeteredPermitDealer, @@ -1052,7 +1051,6 @@ impl WorkerHeartbeatManager { nexus_last_suc_poll_time: Arc>>, status: Arc>, ) -> Self { - let telemetry_instance_clone = telemetry_instance.clone(); let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { let deployment_version = config.computed_deployment_version().map(|dv| { deployment::v1::WorkerDeploymentVersion { @@ -1104,9 +1102,7 @@ impl WorkerHeartbeatManager { sdk_version: String::new(), }; - if let Some(telem_instance) = telemetry_instance_clone.as_ref() - && let Some(in_mem) = telem_instance.in_memory_meter.as_ref() - { + if let Some(in_mem) = in_mem_metrics.as_ref() { worker_heartbeat.total_sticky_cache_hit = in_mem.total_sticky_cache_hit.load(Ordering::Relaxed) as i32; worker_heartbeat.total_sticky_cache_miss = @@ -1114,7 +1110,6 @@ impl WorkerHeartbeatManager { worker_heartbeat.current_sticky_cache_size = in_mem.sticky_cache_size.load(Ordering::Relaxed) as i32; - // TODO: Is this ever not Some()? worker_heartbeat.workflow_poller_info = Some(WorkerPollerInfo { current_pollers: in_mem .num_pollers diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 1c019bcdb..97039c2cb 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -22,7 +22,6 @@ tokio = { version = "1.47", features = ["rt", "rt-multi-thread", "parking_lot", tokio-util = { version = "0.7" } tokio-stream = "0.1" tracing = "0.1" -uuid = { version = "1.18.1", features = ["v4"] } [dependencies.temporal-sdk-core] path = "../core" From 894d916fd2d48ba57bbaa6be413f574a6b0ce891 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Fri, 3 Oct 2025 15:56:33 -0700 Subject: [PATCH 05/23] Move sys_info refresh to dedicated thread, use tuner's existing sys info --- core-api/src/worker.rs | 2 +- core-c-bridge/src/metric.rs | 2 - core/src/abstractions.rs | 4 - core/src/lib.rs | 7 +- core/src/telemetry/otel.rs | 2 +- core/src/telemetry/prometheus_meter.rs | 7 +- core/src/worker/client.rs | 2 - core/src/worker/mod.rs | 129 ++++---- core/src/worker/tuner.rs | 19 +- core/src/worker/tuner/resource_based.rs | 123 +++++--- tests/integ_tests/worker_heartbeat_tests.rs | 311 ++++++++++++-------- 11 files changed, 357 insertions(+), 251 deletions(-) diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index 40257b078..817ab1e90 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -366,7 +366,7 @@ pub trait SlotSupplier { /// Returns a human-friendly identifier describing this supplier implementation for /// diagnostics and telemetry. 
fn slot_supplier_kind(&self) -> String { - format!("{}", type_name::()) + type_name::().to_string() } } diff --git a/core-c-bridge/src/metric.rs b/core-c-bridge/src/metric.rs index 3f27b2e71..9d06aab00 100644 --- a/core-c-bridge/src/metric.rs +++ b/core-c-bridge/src/metric.rs @@ -1,8 +1,6 @@ use crate::{ByteArrayRef, runtime::Runtime}; use std::{any::Any, collections::HashMap, error::Error, sync::Arc, time::Duration}; use temporal_sdk_core_api::telemetry::metrics; -use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; -use tracing::error; pub struct MetricMeter { core: metrics::TemporalMeter, diff --git a/core/src/abstractions.rs b/core/src/abstractions.rs index 482b17d27..0d5a53206 100644 --- a/core/src/abstractions.rs +++ b/core/src/abstractions.rs @@ -197,10 +197,6 @@ where }), } } - - pub(crate) fn max_permits(&self) -> Option { - self.max_permits - } } impl MeteredPermitDealer { diff --git a/core/src/lib.rs b/core/src/lib.rs index 0324422e5..a73eb43ad 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -41,10 +41,9 @@ pub use temporal_sdk_core_protos as protos; pub use temporal_sdk_core_protos::TaskToken; pub use url::Url; pub use worker::{ - FixedSizeSlotSupplier, RealSysInfo, ResourceBasedSlotsOptions, - ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, - TunerBuilder, TunerHolder, TunerHolderOptions, TunerHolderOptionsBuilder, Worker, WorkerConfig, - WorkerConfigBuilder, + FixedSizeSlotSupplier, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, + ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, TunerBuilder, TunerHolder, + TunerHolderOptions, TunerHolderOptionsBuilder, Worker, WorkerConfig, WorkerConfigBuilder, }; /// Expose [WorkerClient] symbols diff --git a/core/src/telemetry/otel.rs b/core/src/telemetry/otel.rs index 46234b444..50ecddd39 100644 --- a/core/src/telemetry/otel.rs +++ b/core/src/telemetry/otel.rs @@ -1,5 +1,5 @@ use super::{ - TELEM_SERVICE_NAME, WorkerHeartbeatMetrics, default_buckets_for, + TELEM_SERVICE_NAME, default_buckets_for, metrics::{ ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME, ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME, DEFAULT_MS_BUCKETS, WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME, diff --git a/core/src/telemetry/prometheus_meter.rs b/core/src/telemetry/prometheus_meter.rs index f825c9922..37810a534 100644 --- a/core/src/telemetry/prometheus_meter.rs +++ b/core/src/telemetry/prometheus_meter.rs @@ -13,10 +13,9 @@ use std::{ time::Duration, }; use temporal_sdk_core_api::telemetry::metrics::{ - CoreMeter, Counter, CounterBase, Gauge, GaugeBase, GaugeF64, GaugeF64Base, HeartbeatMetricType, - Histogram, HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, - HistogramF64Base, MetricAttributable, MetricAttributes, MetricParameters, NewAttributes, - OrderedPromLabelSet, WorkerHeartbeatMetrics, + CoreMeter, Counter, CounterBase, Gauge, GaugeBase, GaugeF64, GaugeF64Base, Histogram, + HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, HistogramF64Base, + MetricAttributable, MetricAttributes, MetricParameters, NewAttributes, OrderedPromLabelSet, }; #[derive(derive_more::From, derive_more::TryInto, Debug, Clone)] diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index c522d5331..65250fc58 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -664,7 +664,6 @@ impl WorkerClient for WorkerClientBag { w.status = WorkerStatus::Shutdown.into(); self.set_heartbeat_client_fields(w); } - println!("AAA shutdown worker 
heartbeat: {final_heartbeat:#?}"); let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), identity: self.identity.clone(), @@ -690,7 +689,6 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_heartbeat, }; - println!("AAA record_worker_heartbeat: {request:#?}"); Ok(self .cloned_client() .record_worker_heartbeat(request) diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 408bb3343..b0774c871 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -8,10 +8,11 @@ mod workflow; pub use temporal_sdk_core_api::worker::{WorkerConfig, WorkerConfigBuilder}; pub use tuner::{ - FixedSizeSlotSupplier, RealSysInfo, ResourceBasedSlotsOptions, - ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, - TunerBuilder, TunerHolder, TunerHolderOptions, TunerHolderOptionsBuilder, + FixedSizeSlotSupplier, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, + ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, TunerBuilder, TunerHolder, + TunerHolderOptions, TunerHolderOptionsBuilder, }; +pub(crate) use tuner::{RealSysInfo, SystemResourceInfo}; pub(crate) use activities::{ ExecutingLAId, LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution, @@ -64,7 +65,6 @@ use std::{ }, time::Duration, }; -use sysinfo::System; use temporal_client::{ClientWorker, HeartbeatCallback, Slot as SlotTrait}; use temporal_client::{ ConfiguredClient, SharedNamespaceWorkerTrait, TemporalServiceClientWithMetrics, @@ -390,11 +390,13 @@ impl Worker { (MetricsContext::no_op(), None) }; - let tuner = config - .tuner - .as_ref() - .cloned() - .unwrap_or_else(|| Arc::new(TunerBuilder::from_config(&config).build())); + let mut sys_info = None; + let tuner = config.tuner.as_ref().cloned().unwrap_or_else(|| { + let mut tuner_builder = TunerBuilder::from_config(&config); + sys_info = tuner_builder.get_sys_info(); + Arc::new(tuner_builder.build()) + }); + let sys_info = sys_info.unwrap_or_else(|| Arc::new(RealSysInfo::new())); metrics.worker_registered(); let shutdown_token = CancellationToken::new(); @@ -574,21 +576,25 @@ impl Worker { let sdk_name_and_ver = client.sdk_name_and_version(); let worker_heartbeat = worker_heartbeat_interval.map(|hb_interval| { - WorkerHeartbeatManager::new( - config.clone(), - worker_instance_key, - hb_interval, - worker_telemetry.clone(), - metrics.in_memory_meter(), - wft_slots.clone(), + let hb_metrics = WorkerHeartbeatManagerMetrics { + in_mem_metrics: metrics.in_memory_meter(), + wft_slots: wft_slots.clone(), act_slots, nexus_slots, - la_permit_dealer, + la_slots: la_permit_dealer, wf_last_suc_poll_time, wf_sticky_last_suc_poll_time, act_last_suc_poll_time, nexus_last_suc_poll_time, - worker_status.clone(), + status: worker_status.clone(), + sys_info, + }; + WorkerHeartbeatManager::new( + config.clone(), + worker_instance_key, + hb_interval, + worker_telemetry.clone(), + hb_metrics, ) }); @@ -1024,6 +1030,21 @@ impl ClientWorker for ClientWorkerRegistrator { } } +// TODO: better name? 
+struct WorkerHeartbeatManagerMetrics { + in_mem_metrics: Option>, + wft_slots: MeteredPermitDealer, + act_slots: MeteredPermitDealer, + nexus_slots: MeteredPermitDealer, + la_slots: MeteredPermitDealer, + wf_last_suc_poll_time: Arc>>, + wf_sticky_last_suc_poll_time: Arc>>, + act_last_suc_poll_time: Arc>>, + nexus_last_suc_poll_time: Arc>>, + status: Arc>, + sys_info: Arc, +} + struct WorkerHeartbeatManager { /// Heartbeat interval, defaults to 60s heartbeat_interval: Duration, @@ -1034,22 +1055,12 @@ struct WorkerHeartbeatManager { } impl WorkerHeartbeatManager { - #[allow(clippy::too_many_arguments)] fn new( config: WorkerConfig, worker_instance_key: Uuid, heartbeat_interval: Duration, telemetry_instance: Option, - in_mem_metrics: Option>, - wft_slots: MeteredPermitDealer, - act_slots: MeteredPermitDealer, - nexus_slots: MeteredPermitDealer, - la_slots: MeteredPermitDealer, - wf_last_suc_poll_time: Arc>>, - wf_sticky_last_suc_poll_time: Arc>>, - act_last_suc_poll_time: Arc>>, - nexus_last_suc_poll_time: Arc>>, - status: Arc>, + heartbeat_manager_metrics: WorkerHeartbeatManagerMetrics, ) -> Self { let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { let deployment_version = config.computed_deployment_version().map(|dv| { @@ -1059,15 +1070,15 @@ impl WorkerHeartbeatManager { } }); - let (current_host_cpu_usage, current_host_mem_usage) = get_host_data(); - let mut worker_heartbeat = WorkerHeartbeat { worker_instance_key: worker_instance_key.to_string(), host_info: Some(WorkerHostInfo { host_name: gethostname().to_string_lossy().to_string(), process_id: std::process::id().to_string(), - current_host_cpu_usage, - current_host_mem_usage, + current_host_cpu_usage: heartbeat_manager_metrics.sys_info.used_cpu_percent() + as f32, + current_host_mem_usage: heartbeat_manager_metrics.sys_info.used_mem_percent() + as f32, // Set by SharedNamespaceWorker because it relies on the client process_key: String::new(), @@ -1075,7 +1086,7 @@ impl WorkerHeartbeatManager { task_queue: config.task_queue.clone(), deployment_version, - status: (*status.lock()) as i32, + status: (*heartbeat_manager_metrics.status.lock()) as i32, start_time: Some(SystemTime::now().into()), plugins: config.plugins.clone(), @@ -1102,7 +1113,7 @@ impl WorkerHeartbeatManager { sdk_version: String::new(), }; - if let Some(in_mem) = in_mem_metrics.as_ref() { + if let Some(in_mem) = heartbeat_manager_metrics.in_mem_metrics.as_ref() { worker_heartbeat.total_sticky_cache_hit = in_mem.total_sticky_cache_hit.load(Ordering::Relaxed) as i32; worker_heartbeat.total_sticky_cache_miss = @@ -1115,7 +1126,10 @@ impl WorkerHeartbeatManager { .num_pollers .wft_current_pollers .load(Ordering::Relaxed) as i32, - last_successful_poll_time: wf_last_suc_poll_time.lock().map(|time| time.into()), + last_successful_poll_time: heartbeat_manager_metrics + .wf_last_suc_poll_time + .lock() + .map(|time| time.into()), is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), }); worker_heartbeat.workflow_sticky_poller_info = Some(WorkerPollerInfo { @@ -1123,7 +1137,8 @@ impl WorkerHeartbeatManager { .num_pollers .sticky_wft_current_pollers .load(Ordering::Relaxed) as i32, - last_successful_poll_time: wf_sticky_last_suc_poll_time + last_successful_poll_time: heartbeat_manager_metrics + .wf_sticky_last_suc_poll_time .lock() .map(|time| time.into()), is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), @@ -1133,7 +1148,8 @@ impl WorkerHeartbeatManager { .num_pollers .activity_current_pollers .load(Ordering::Relaxed) as i32, - 
last_successful_poll_time: act_last_suc_poll_time + last_successful_poll_time: heartbeat_manager_metrics + .act_last_suc_poll_time .lock() .map(|time| time.into()), is_autoscaling: config.activity_task_poller_behavior.is_autoscaling(), @@ -1143,14 +1159,15 @@ impl WorkerHeartbeatManager { .num_pollers .nexus_current_pollers .load(Ordering::Relaxed) as i32, - last_successful_poll_time: nexus_last_suc_poll_time + last_successful_poll_time: heartbeat_manager_metrics + .nexus_last_suc_poll_time .lock() .map(|time| time.into()), is_autoscaling: config.nexus_task_poller_behavior.is_autoscaling(), }); worker_heartbeat.workflow_task_slots_info = make_slots_info( - &wft_slots, + &heartbeat_manager_metrics.wft_slots, in_mem .workflow_task_execution_latency .load(Ordering::Relaxed), @@ -1159,17 +1176,17 @@ impl WorkerHeartbeatManager { .load(Ordering::Relaxed), ); worker_heartbeat.activity_task_slots_info = make_slots_info( - &act_slots, + &heartbeat_manager_metrics.act_slots, in_mem.activity_execution_latency.load(Ordering::Relaxed), in_mem.activity_execution_failed.load(Ordering::Relaxed), ); worker_heartbeat.nexus_task_slots_info = make_slots_info( - &nexus_slots, + &heartbeat_manager_metrics.nexus_slots, in_mem.nexus_task_execution_latency.load(Ordering::Relaxed), in_mem.nexus_task_execution_failed.load(Ordering::Relaxed), ); worker_heartbeat.local_activity_slots_info = make_slots_info( - &la_slots, + &heartbeat_manager_metrics.la_slots, in_mem .local_activity_execution_latency .load(Ordering::Relaxed), @@ -1234,17 +1251,14 @@ fn make_slots_info( where SK: SlotKind + 'static, { - let avail_usize = dealer.available_permits()?; - let max_usize = dealer.max_permits()?; - - let avail = i32::try_from(avail_usize).unwrap_or(i32::MAX); - let max = i32::try_from(max_usize).unwrap_or(i32::MAX); - - let used = (max - avail).max(0); + let permits = dealer.get_extant_count_rcv(); + let avail = dealer + .available_permits() + .map_or(-1, |e| i32::try_from(e).unwrap_or(-1)); Some(WorkerSlotsInfo { current_available_slots: avail, - current_used_slots: used, + current_used_slots: *permits.borrow() as i32, slot_supplier_kind: dealer.slot_supplier_kind().to_string(), total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MAX), total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MAX), @@ -1255,19 +1269,6 @@ where }) } -fn get_host_data() -> (f32, f32) { - let mut sys = System::new_all(); - sys.refresh_all(); - std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL); - sys.refresh_cpu_usage(); - let current_host_cpu_usage: f32 = - sys.cpus().iter().map(|cpu| cpu.cpu_usage()).sum::() / sys.cpus().len() as f32; - let total_mem = sys.total_memory() as f64; - let used_mem = sys.used_memory() as f64; - let current_host_mem_usage = (used_mem / total_mem) as f32; - (current_host_cpu_usage, current_host_mem_usage) -} - #[cfg(test)] mod tests { use super::*; diff --git a/core/src/worker/tuner.rs b/core/src/worker/tuner.rs index ed592a6f7..0a7cadcc9 100644 --- a/core/src/worker/tuner.rs +++ b/core/src/worker/tuner.rs @@ -3,10 +3,12 @@ mod resource_based; pub use fixed_size::FixedSizeSlotSupplier; pub use resource_based::{ - RealSysInfo, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, + ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, }; +pub(crate) use resource_based::{RealSysInfo, SystemResourceInfo}; + use std::sync::Arc; use temporal_sdk_core_api::worker::{ ActivitySlotKind, LocalActivitySlotKind, NexusSlotKind, 
SlotKind, SlotSupplier, WorkerConfig, @@ -126,6 +128,9 @@ impl TunerHolderOptions { } None => {} } + if let Some(tuner) = rb_tuner { + builder.sys_info(tuner.sys_info()); + } Ok(builder.build()) } } @@ -187,6 +192,7 @@ pub struct TunerBuilder { local_activity_slot_supplier: Option + Send + Sync>>, nexus_slot_supplier: Option + Send + Sync>>, + sys_info: Option>, } impl TunerBuilder { @@ -243,6 +249,17 @@ impl TunerBuilder { self } + /// Sets a field that implements [SystemResourceInfo] + pub fn sys_info(&mut self, sys_info: Arc) -> &mut Self { + self.sys_info = Some(sys_info); + self + } + + /// Gets the field that implements [SystemResourceInfo] + pub fn get_sys_info(&self) -> Option> { + self.sys_info.clone() + } + /// Build a [WorkerTuner] from the configured slot suppliers pub fn build(&mut self) -> TunerHolder { TunerHolder { diff --git a/core/src/worker/tuner/resource_based.rs b/core/src/worker/tuner/resource_based.rs index 3dbed0aef..ea1786d98 100644 --- a/core/src/worker/tuner/resource_based.rs +++ b/core/src/worker/tuner/resource_based.rs @@ -4,8 +4,9 @@ use std::{ marker::PhantomData, sync::{ Arc, OnceLock, - atomic::{AtomicU64, AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, }, + thread, time::{Duration, Instant}, }; use temporal_sdk_core_api::{ @@ -31,6 +32,8 @@ pub struct ResourceBasedTuner { act_opts: Option, la_opts: Option, nexus_opts: Option, + + sys_info: Arc, } impl ResourceBasedTuner { @@ -42,25 +45,28 @@ impl ResourceBasedTuner { .target_cpu_usage(target_cpu_usage) .build() .expect("default resource based slot options can't fail to build"); - let controller = ResourceController::new_with_sysinfo(opts, RealSysInfo::new()); + let controller = ResourceController::new_with_sysinfo(opts, Arc::new(RealSysInfo::new())); Self::new_from_controller(controller) } /// Create an instance using the fully configurable set of PID controller options pub fn new_from_options(options: ResourceBasedSlotsOptions) -> Self { - let controller = ResourceController::new_with_sysinfo(options, RealSysInfo::new()); + let controller = + ResourceController::new_with_sysinfo(options, Arc::new(RealSysInfo::new())); Self::new_from_controller(controller) } } impl ResourceBasedTuner { fn new_from_controller(controller: ResourceController) -> Self { + let sys_info = controller.sys_info_supplier.clone(); Self { slots: Arc::new(controller), wf_opts: None, act_opts: None, la_opts: None, nexus_opts: None, + sys_info, } } @@ -87,6 +93,17 @@ impl ResourceBasedTuner { self.nexus_opts = Some(opts); self } + + /// Set sys info + pub fn with_sys_info(&mut self, sys_info: Arc) -> &mut Self { + self.sys_info = sys_info; + self + } + + /// Get sys info + pub fn sys_info(&self) -> Arc { + self.sys_info.clone() + } } const DEFAULT_WF_SLOT_OPTS: ResourceSlotOptions = ResourceSlotOptions { @@ -121,7 +138,7 @@ pub struct ResourceSlotOptions { struct ResourceController { options: ResourceBasedSlotsOptions, - sys_info_supplier: MI, + sys_info_supplier: Arc, metrics: OnceLock>, pids: Mutex, last_metric_vals: Arc>, @@ -425,7 +442,7 @@ impl ResourceController { Arc::new(ResourceBasedSlotsForType::new(self.clone(), opts)) } - fn new_with_sysinfo(options: ResourceBasedSlotsOptions, sys_info: MI) -> Self { + fn new_with_sysinfo(options: ResourceBasedSlotsOptions, sys_info: Arc) -> Self { Self { pids: Mutex::new(PidControllers::new(&options)), options, @@ -478,37 +495,14 @@ impl ResourceController { /// Implements [SystemResourceInfo] using the [sysinfo] crate #[derive(Debug)] -pub struct 
RealSysInfo { +struct RealSysInfoInner { sys: Mutex, total_mem: AtomicU64, cur_mem_usage: AtomicU64, cur_cpu_usage: AtomicU64, - last_refresh: AtomicCell, } -impl RealSysInfo { - fn new() -> Self { - let mut sys = sysinfo::System::new(); - sys.refresh_memory(); - let total_mem = sys.total_memory(); - let s = Self { - sys: Mutex::new(sys), - last_refresh: AtomicCell::new(Instant::now()), - cur_mem_usage: AtomicU64::new(0), - cur_cpu_usage: AtomicU64::new(0), - total_mem: AtomicU64::new(total_mem), - }; - s.refresh(); - s - } - - fn refresh_if_needed(&self) { - // This is all quite expensive and meaningfully slows everything down if it's allowed to - // happen more often. A better approach than a lock would be needed to go faster. - if (Instant::now() - self.last_refresh.load()) > Duration::from_millis(100) { - self.refresh(); - } - } +impl RealSysInfoInner { fn refresh(&self) { let mut lock = self.sys.lock(); lock.refresh_memory(); @@ -526,25 +520,74 @@ impl RealSysInfo { self.cur_mem_usage.store(mem, Ordering::Release); } self.cur_cpu_usage.store(cpu.to_bits(), Ordering::Release); - self.last_refresh.store(Instant::now()); + } +} + +/// Tracks host resource usage by refreshing metrics on a background thread. +pub struct RealSysInfo { + inner: Arc, + shutdown: Arc, + refresh_thread: Mutex>>, +} + +impl RealSysInfo { + pub(crate) fn new() -> Self { + let mut sys = sysinfo::System::new(); + sys.refresh_memory(); + let total_mem = sys.total_memory(); + let inner = Arc::new(RealSysInfoInner { + sys: Mutex::new(sys), + cur_mem_usage: AtomicU64::new(0), + cur_cpu_usage: AtomicU64::new(0), + total_mem: AtomicU64::new(total_mem), + }); + inner.refresh(); + + let shutdown = Arc::new(AtomicBool::new(false)); + let thread_inner = inner.clone(); + let thread_shutdown = shutdown.clone(); + let handle = thread::Builder::new() + .name("temporal-real-sysinfo".to_string()) + .spawn(move || { + const REFRESH_INTERVAL: Duration = Duration::from_millis(100); + loop { + if thread_shutdown.load(Ordering::Acquire) { + return; + } + thread_inner.refresh(); + thread::sleep(REFRESH_INTERVAL); + } + }) + .expect("failed to spawn RealSysInfo refresh thread"); + + Self { + inner, + shutdown, + refresh_thread: Mutex::new(Some(handle)), + } } } impl SystemResourceInfo for RealSysInfo { fn total_mem(&self) -> u64 { - self.total_mem.load(Ordering::Acquire) + self.inner.total_mem.load(Ordering::Acquire) } fn used_mem(&self) -> u64 { - // TODO: This should really happen on a background thread since it's getting called from - // the async reserve - self.refresh_if_needed(); - self.cur_mem_usage.load(Ordering::Acquire) + self.inner.cur_mem_usage.load(Ordering::Acquire) } fn used_cpu_percent(&self) -> f64 { - self.refresh_if_needed(); - f64::from_bits(self.cur_cpu_usage.load(Ordering::Acquire)) + f64::from_bits(self.inner.cur_cpu_usage.load(Ordering::Acquire)) + } +} + +impl Drop for RealSysInfo { + fn drop(&mut self) { + self.shutdown.store(true, Ordering::Release); + if let Some(handle) = self.refresh_thread.lock().take() { + let _ = handle.join(); + } } } @@ -562,9 +605,9 @@ mod tests { used: Arc, } impl FakeMIS { - fn new() -> (Self, Arc) { + fn new() -> (Arc, Arc) { let used = Arc::new(AtomicU64::new(0)); - (Self { used: used.clone() }, used) + (Arc::new(Self { used: used.clone() }), used) } } impl SystemResourceInfo for FakeMIS { diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 3c1999212..1acfaf091 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ 
b/tests/integ_tests/worker_heartbeat_tests.rs @@ -6,13 +6,15 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use temporal_client::WorkflowClientTrait; use temporal_sdk::{ActContext, ActivityOptions, WfContext}; use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; -use temporal_sdk_core::{CoreRuntime, RuntimeOptionsBuilder}; +use temporal_sdk_core::{ + CoreRuntime, ResourceBasedTuner, ResourceSlotOptions, RuntimeOptionsBuilder, +}; use temporal_sdk_core_api::telemetry::{ - OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, + OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, TelemetryOptionsBuilder, }; use temporal_sdk_core_protos::coresdk::AsJsonPayloadExt; +use temporal_sdk_core_protos::temporal::api::deployment::v1::WorkerDeploymentVersion; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; -use tokio::sync::Semaphore; use url::Url; fn within_two_minutes_ts(ts: Timestamp) -> bool { @@ -32,13 +34,7 @@ fn within_duration(dur: PbDuration, threshold: Duration) -> bool { // with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run #[rstest::rstest] #[tokio::test] -async fn docker_worker_heartbeat_basic( - #[values( - "otel", - // "prom" -)] - backing: &str, -) { +async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) { let runtimeopts = RuntimeOptionsBuilder::default() .telemetry_options(get_integ_telem_options()) .heartbeat_interval(Some(Duration::from_millis(100))) @@ -52,8 +48,6 @@ async fn docker_worker_heartbeat_basic( .unwrap(); let mut opts_build = OtelCollectorOptionsBuilder::default(); let opts = opts_build.url(url).build().unwrap(); - // If wanna add more options: https://github.com/temporalio/sdk-ruby/blob/143e421d82d16e58bd45226998363d55e4bc3bbb/temporalio/ext/src/runtime.rs#L113C21-L135C22 - rt.telemetry_mut() .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); } @@ -67,8 +61,6 @@ async fn docker_worker_heartbeat_basic( _ => unreachable!(), } let wf_name = format!("worker_heartbeat_basic_{backing}"); - static ACTS_STARTED: Semaphore = Semaphore::const_new(0); - static ACTS_DONE: Semaphore = Semaphore::const_new(0); let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); starter .worker_config @@ -80,7 +72,6 @@ async fn docker_worker_heartbeat_basic( // Run a workflow worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - println!("asdfasdfasdfasdfasdfasdf"); ctx.activity(ActivityOptions { activity_type: "pass_fail_act".to_string(), input: "pass".as_json_payload().expect("serializes fine"), @@ -91,65 +82,117 @@ async fn docker_worker_heartbeat_basic( Ok(().into()) }); worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { - println!("pass_fail_act"); - ACTS_STARTED.add_permits(1); - let _ = ACTS_DONE.acquire().await.unwrap(); Ok(i) }); starter .start_with_worker(wf_name.clone(), &mut worker) .await; + worker.run_until_done().await.unwrap(); + + let client = starter.get_client().await; + let workers_list = client + .list_workers(100, Vec::new(), String::new()) + .await + .unwrap(); + // Since list_workers finds all workers in the namespace, must find specific worker used in this + // test + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = 
worker_info.worker_heartbeat.as_ref().unwrap(); + assert!(heartbeat.task_queue.starts_with(&wf_name)); + assert_eq!(heartbeat.worker_identity, "integ_tester"); + assert_eq!(heartbeat.sdk_name, "temporal-core"); + assert_eq!(heartbeat.sdk_version, "0.1.0"); + assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); + assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); + assert!(within_duration( + heartbeat.elapsed_since_last_heartbeat.unwrap(), + Duration::from_secs(1) + )); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert!(!workflow_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + workflow_poller_info.last_successful_poll_time.unwrap() + )); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert!(!sticky_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + sticky_poller_info.last_successful_poll_time.unwrap() + )); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert!(!nexus_poller_info.is_autoscaling); + assert!(nexus_poller_info.last_successful_poll_time.is_none()); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert!(!activity_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + activity_poller_info.last_successful_poll_time.unwrap() + )); - let test_fut = async { - let _ = ACTS_STARTED.acquire().await.unwrap(); - let client = starter.get_client().await; - let workers_list = client - .list_workers(100, Vec::new(), String::new()) - .await - .unwrap(); - let worker_info = workers_list - .workers_info - .iter() - .find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }) - .unwrap(); - let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - println!("1231231231231: {heartbeat:#?}"); - let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); - assert_eq!(workflow_poller_info.current_pollers, 1); - let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); - assert_eq!(sticky_poller_info.current_pollers, 3); - let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); - assert_eq!(nexus_poller_info.current_pollers, 0); - let activity_poller_info = heartbeat.activity_poller_info.unwrap(); - assert_eq!(activity_poller_info.current_pollers, 4); - assert_eq!(heartbeat.current_sticky_cache_size, 1); - ACTS_DONE.add_permits(1); - }; + assert_eq!(heartbeat.total_sticky_cache_hit, 2); +} - // for i in 1..5 { - // worker.submit_wf( - // format!("{wf_name}-{i}"), - // wf_name, - // vec![], - // starter.workflow_options.clone(), - // ) - // .await - // .unwrap(); - // } +// Tests that rely on Prometheus running in a docker container need to start +// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run +#[tokio::test] +async fn docker_worker_heartbeat_tuner() { + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let mut rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + + let url = Some("grpc://localhost:4317") + .map(|x| x.parse::().unwrap()) + .unwrap(); + let mut opts_build = OtelCollectorOptionsBuilder::default(); + let opts = opts_build.url(url).build().unwrap(); - let runner = async move { - worker.run_until_done().await.unwrap(); - }; - tokio::join!(test_fut, 
runner); + rt.telemetry_mut() + .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); + let wf_name = "worker_heartbeat_tuner"; + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + let mut tuner = ResourceBasedTuner::new(0.0, 0.0); + tuner + .with_workflow_slots_options(ResourceSlotOptions::new(2, 10, Duration::from_millis(0))) + .with_activity_slots_options(ResourceSlotOptions::new(5, 10, Duration::from_millis(50))); + starter + .worker_config + .clear_max_outstanding_opts() + .tuner(Arc::new(tuner)); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + // Run a workflow + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter.start_with_worker(wf_name, &mut worker).await; + worker.run_until_done().await.unwrap(); - // TODO: clone_no_worker() for new worker let client = starter.get_client().await; let workers_list = client .list_workers(100, Vec::new(), String::new()) @@ -169,8 +212,7 @@ async fn docker_worker_heartbeat_basic( }) .unwrap(); let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - println!("heartbeat: {heartbeat:#?}"); - assert!(heartbeat.task_queue.starts_with(&wf_name)); + assert!(heartbeat.task_queue.starts_with(wf_name)); assert_eq!(heartbeat.worker_identity, "integ_tester"); assert_eq!(heartbeat.sdk_name, "temporal-core"); assert_eq!(heartbeat.sdk_version, "0.1.0"); @@ -182,65 +224,6 @@ async fn docker_worker_heartbeat_basic( Duration::from_secs(1) )); - // TODO: - // workflow_task_slots_info: Some( - // WorkerSlotsInfo { - // current_available_slots: 5, - // current_used_slots: 0, - // slot_supplier_kind: "Workflow", - // total_processed_tasks: 2, - // total_failed_tasks: 0, - // last_interval_processed_tasks: 0, - // last_interval_failure_tasks: 0, - // }, - // ), - // activity_task_slots_info: None, - // nexus_task_slots_info: None, - // local_activity_slots_info: None, - - // workflow_poller_info: Some( - // WorkerPollerInfo { - // current_pollers: 0, - // last_successful_poll_time: Some( - // Timestamp { - // seconds: 1759209070, - // nanos: 909377000, - // }, - // ), - // is_autoscaling: false, - // }, - // ), - // workflow_sticky_poller_info: Some( - // WorkerPollerInfo { - // current_pollers: 0, - // last_successful_poll_time: Some( - // Timestamp { - // seconds: 1759209072, - // nanos: 281956000, - // }, - // ), - // is_autoscaling: false, - // }, - // ), - // activity_poller_info: Some( - // WorkerPollerInfo { - // current_pollers: 0, - // last_successful_poll_time: Some( - // Timestamp { - // seconds: 1759209071, - // nanos: 737598000, - // }, - // ), - // is_autoscaling: false, - // }, - // ), - // nexus_poller_info: Some( - // WorkerPollerInfo { - // current_pollers: 0, - // last_successful_poll_time: None, - // is_autoscaling: false, - // }, - // ), let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); assert!(!workflow_poller_info.is_autoscaling); assert!(within_two_minutes_ts( @@ -261,7 +244,79 @@ async fn docker_worker_heartbeat_basic( )); assert_eq!(heartbeat.total_sticky_cache_hit, 2); - // total_sticky_cache_miss: 0, - // current_sticky_cache_size: 0, - // 
plugins: [], +} + +#[tokio::test] +async fn docker_worker_heartbeat_no_metrics() { + // Even if no metrics are used, we should still get in-memory metrics for worker heartbeat + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(TelemetryOptionsBuilder::default().build().unwrap()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + let wf_name = "worker_heartbeat_no_metrics"; + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + starter + .worker_config + .max_outstanding_workflow_tasks(5_usize) + .max_cached_workflows(5_usize) + .max_outstanding_activities(5_usize); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + // Run a workflow + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + starter.start_with_worker(wf_name, &mut worker).await; + worker.run_until_done().await.unwrap(); + + let client = starter.get_client().await; + let workers_list = client + .list_workers(100, Vec::new(), String::new()) + .await + .unwrap(); + // Since list_workers finds all workers in the namespace, must find specific worker used in this + // test + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert!(heartbeat.task_queue.starts_with(wf_name)); + assert_eq!(heartbeat.worker_identity, "integ_tester"); + assert_eq!(heartbeat.sdk_name, "temporal-core"); + assert_eq!(heartbeat.sdk_version, "0.1.0"); + assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + assert_eq!( + heartbeat.deployment_version, + Some(WorkerDeploymentVersion { + build_id: "test_build_id".to_owned(), + deployment_name: String::new(), + }) + ); + assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); + assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); + assert!(within_duration( + heartbeat.elapsed_since_last_heartbeat.unwrap(), + Duration::from_secs(1) + )); } From ef966161a10f901bdd67683e85015cf115eee19f Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Sat, 4 Oct 2025 19:54:17 -0700 Subject: [PATCH 06/23] Format, AtomicCell --- core-api/src/telemetry/metrics.rs | 17 +++++-------- core/src/pollers/poll_buffer.rs | 18 +++++++------- core/src/worker/activities.rs | 7 +++--- core/src/worker/mod.rs | 32 ++++++++++++------------- core/src/worker/tuner/resource_based.rs | 6 ++--- core/src/worker/workflow/wft_poller.rs | 6 ++--- 6 files changed, 41 insertions(+), 45 deletions(-) diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 30ed6a7f3..4599c162f 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -958,18 +958,13 @@ impl CoreMeter for NoOpCoreMeter { attribs: NewAttributes, ) -> MetricAttributes { let new_attrs = InMemoryMetricAttributes::from_new_attributes(attribs.attributes); - let merged = if let 
MetricAttributes::Dynamic(existing_attrs) = existing { - if let Ok(in_mem) = existing_attrs - .clone() + let merged = match existing { + MetricAttributes::Dynamic(attrs) => attrs .as_any() - .downcast::() - { - in_mem.merge(&new_attrs) - } else { - new_attrs - } - } else { - new_attrs + .downcast_ref::() + .map(|in_mem| in_mem.merge(&new_attrs)) + .unwrap_or(new_attrs), + _ => new_attrs, }; MetricAttributes::Dynamic(Arc::new(merged)) } diff --git a/core/src/pollers/poll_buffer.rs b/core/src/pollers/poll_buffer.rs index c30bf7120..786b195e4 100644 --- a/core/src/pollers/poll_buffer.rs +++ b/core/src/pollers/poll_buffer.rs @@ -6,6 +6,7 @@ use crate::{ client::{PollActivityOptions, PollOptions, PollWorkflowOptions, WorkerClient}, }, }; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{FutureExt, StreamExt, future::BoxFuture}; use governor::{Quota, RateLimiter}; use std::time::SystemTime; @@ -75,7 +76,7 @@ impl LongPollBuffer { shutdown: CancellationToken, num_pollers_handler: Option, options: WorkflowTaskOptions, - last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, ) -> Self { let is_sticky = sticky_queue.is_some(); let poll_scaler = PollScaler::new( @@ -152,7 +153,7 @@ impl LongPollBuffer { shutdown: CancellationToken, num_pollers_handler: Option, options: ActivityTaskOptions, - last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, ) -> Self { let pre_permit_delay = options .max_worker_acts_per_second @@ -218,7 +219,7 @@ impl LongPollBuffer { permit_dealer: MeteredPermitDealer, shutdown: CancellationToken, num_pollers_handler: Option, - last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, send_heartbeat: bool, ) -> Self { let no_retry = if matches!(poller_behavior, PollerBehavior::Autoscaling { .. 
}) { @@ -438,7 +439,7 @@ where behavior: PollerBehavior, num_pollers_handler: Option, shutdown: CancellationToken, - last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, ) -> Self { let (active_tx, active_rx) = watch::channel(0); let num_pollers_handler = num_pollers_handler.map(Arc::new); @@ -522,7 +523,7 @@ struct PollScalerReportHandle { ingested_this_period: AtomicUsize, ingested_last_period: AtomicUsize, scale_up_allowed: AtomicBool, - last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, } impl PollScalerReportHandle { @@ -531,8 +532,7 @@ impl PollScalerReportHandle { match res { Ok(res) => { self.last_successful_poll_time - .lock() - .replace(SystemTime::now()); + .store(Some(SystemTime::now())); if let PollerBehavior::SimpleMaximum(_) = self.behavior { // We don't do auto-scaling with the simple max return true; @@ -766,7 +766,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))), }, - Arc::new(parking_lot::Mutex::new(None)), + Arc::new(AtomicCell::new(None)), ); // Poll a bunch of times, "interrupting" it each time, we should only actually have polled @@ -822,7 +822,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))), }, - Arc::new(parking_lot::Mutex::new(None)), + Arc::new(AtomicCell::new(None)), ); // Should not see error, unwraps should get empty response diff --git a/core/src/worker/activities.rs b/core/src/worker/activities.rs index 2ded441f6..a4edb4f2c 100644 --- a/core/src/worker/activities.rs +++ b/core/src/worker/activities.rs @@ -728,6 +728,7 @@ mod tests { prost_dur, worker::client::mocks::mock_worker_client, }; + use crossbeam_utils::atomic::AtomicCell; use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; @@ -773,7 +774,7 @@ mod tests { max_worker_acts_per_second: Some(2.0), max_tps: None, }, - Arc::new(parking_lot::Mutex::new(None)), + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -865,7 +866,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, - Arc::new(parking_lot::Mutex::new(None)), + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -939,7 +940,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, - Arc::new(parking_lot::Mutex::new(None)), + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index b0774c871..5397ecee7 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -51,6 +51,7 @@ use crate::{ }; use activities::WorkerActivityTasks; use anyhow::bail; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{StreamExt, stream}; use gethostname::gethostname; use parking_lot::{Mutex, RwLock}; @@ -429,10 +430,10 @@ impl Worker { let act_permits = act_slots.get_extant_count_rcv(); let (external_wft_tx, external_wft_rx) = unbounded_channel(); - let wf_last_suc_poll_time = Arc::new(Mutex::new(None)); - let wf_sticky_last_suc_poll_time = Arc::new(Mutex::new(None)); - let act_last_suc_poll_time = Arc::new(Mutex::new(None)); - let nexus_last_suc_poll_time = Arc::new(Mutex::new(None)); + let wf_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let wf_sticky_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let act_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let nexus_last_suc_poll_time = 
Arc::new(AtomicCell::new(None)); let nexus_slots = MeteredPermitDealer::new( tuner.nexus_task_slot_supplier(), @@ -576,7 +577,7 @@ impl Worker { let sdk_name_and_ver = client.sdk_name_and_version(); let worker_heartbeat = worker_heartbeat_interval.map(|hb_interval| { - let hb_metrics = WorkerHeartbeatManagerMetrics { + let hb_metrics = HeartbeatMetrics { in_mem_metrics: metrics.in_memory_meter(), wft_slots: wft_slots.clone(), act_slots, @@ -1030,17 +1031,16 @@ impl ClientWorker for ClientWorkerRegistrator { } } -// TODO: better name? -struct WorkerHeartbeatManagerMetrics { +struct HeartbeatMetrics { in_mem_metrics: Option>, wft_slots: MeteredPermitDealer, act_slots: MeteredPermitDealer, nexus_slots: MeteredPermitDealer, la_slots: MeteredPermitDealer, - wf_last_suc_poll_time: Arc>>, - wf_sticky_last_suc_poll_time: Arc>>, - act_last_suc_poll_time: Arc>>, - nexus_last_suc_poll_time: Arc>>, + wf_last_suc_poll_time: Arc>>, + wf_sticky_last_suc_poll_time: Arc>>, + act_last_suc_poll_time: Arc>>, + nexus_last_suc_poll_time: Arc>>, status: Arc>, sys_info: Arc, } @@ -1060,7 +1060,7 @@ impl WorkerHeartbeatManager { worker_instance_key: Uuid, heartbeat_interval: Duration, telemetry_instance: Option, - heartbeat_manager_metrics: WorkerHeartbeatManagerMetrics, + heartbeat_manager_metrics: HeartbeatMetrics, ) -> Self { let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { let deployment_version = config.computed_deployment_version().map(|dv| { @@ -1128,7 +1128,7 @@ impl WorkerHeartbeatManager { .load(Ordering::Relaxed) as i32, last_successful_poll_time: heartbeat_manager_metrics .wf_last_suc_poll_time - .lock() + .load() .map(|time| time.into()), is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), }); @@ -1139,7 +1139,7 @@ impl WorkerHeartbeatManager { .load(Ordering::Relaxed) as i32, last_successful_poll_time: heartbeat_manager_metrics .wf_sticky_last_suc_poll_time - .lock() + .load() .map(|time| time.into()), is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), }); @@ -1150,7 +1150,7 @@ impl WorkerHeartbeatManager { .load(Ordering::Relaxed) as i32, last_successful_poll_time: heartbeat_manager_metrics .act_last_suc_poll_time - .lock() + .load() .map(|time| time.into()), is_autoscaling: config.activity_task_poller_behavior.is_autoscaling(), }); @@ -1161,7 +1161,7 @@ impl WorkerHeartbeatManager { .load(Ordering::Relaxed) as i32, last_successful_poll_time: heartbeat_manager_metrics .nexus_last_suc_poll_time - .lock() + .load() .map(|time| time.into()), is_autoscaling: config.nexus_task_poller_behavior.is_autoscaling(), }); diff --git a/core/src/worker/tuner/resource_based.rs b/core/src/worker/tuner/resource_based.rs index ea1786d98..b321f2f9c 100644 --- a/core/src/worker/tuner/resource_based.rs +++ b/core/src/worker/tuner/resource_based.rs @@ -527,7 +527,7 @@ impl RealSysInfoInner { pub struct RealSysInfo { inner: Arc, shutdown: Arc, - refresh_thread: Mutex>>, + shutdown_handle: Mutex>>, } impl RealSysInfo { @@ -563,7 +563,7 @@ impl RealSysInfo { Self { inner, shutdown, - refresh_thread: Mutex::new(Some(handle)), + shutdown_handle: Mutex::new(Some(handle)), } } } @@ -585,7 +585,7 @@ impl SystemResourceInfo for RealSysInfo { impl Drop for RealSysInfo { fn drop(&mut self) { self.shutdown.store(true, Ordering::Release); - if let Some(handle) = self.refresh_thread.lock().take() { + if let Some(handle) = self.shutdown_handle.lock().take() { let _ = handle.join(); } } diff --git a/core/src/worker/workflow/wft_poller.rs b/core/src/worker/workflow/wft_poller.rs index 
5d7faf2dd..0a00ad179 100644 --- a/core/src/worker/workflow/wft_poller.rs +++ b/core/src/worker/workflow/wft_poller.rs @@ -6,8 +6,8 @@ use crate::{ telemetry::metrics::{workflow_poller, workflow_sticky_poller}, worker::{client::WorkerClient, wft_poller_behavior}, }; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{Stream, stream}; -use parking_lot::Mutex; use std::sync::{Arc, OnceLock}; use std::time::SystemTime; use temporal_sdk_core_api::worker::{WorkerConfig, WorkflowSlotKind}; @@ -23,8 +23,8 @@ pub(crate) fn make_wft_poller( metrics: &MetricsContext, shutdown_token: &CancellationToken, wft_slots: &MeteredPermitDealer, - last_successful_poll_time: Arc>>, - sticky_last_successful_poll_time: Arc>>, + last_successful_poll_time: Arc>>, + sticky_last_successful_poll_time: Arc>>, ) -> impl Stream< Item = Result< ( From 0350f91ec7009f21680e8661d5c3428919dc5a92 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Sat, 4 Oct 2025 20:33:51 -0700 Subject: [PATCH 07/23] Fix unit test --- core/src/worker/client/mocks.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index b8ed1de7b..b86334e4b 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -36,6 +36,12 @@ pub fn mock_worker_client() -> MockWorkerClient { r.expect_identity() .returning(|| "test-identity".to_string()); r.expect_worker_grouping_key().returning(Uuid::new_v4); + r.expect_set_heartbeat_client_fields().returning(|hb| { + hb.sdk_name = "test-core".to_string(); + hb.sdk_version = "0.0.0".to_string(); + hb.worker_identity = "test-identity".to_string(); + hb.heartbeat_time = Some(SystemTime::now().into()); + }); r } From fe3956ce174453d34c3def537940ea9321db0896 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 7 Oct 2025 14:03:08 -0700 Subject: [PATCH 08/23] Set dynamic config for WorkerHeartbeatsEnabled and ListWorkersEnabled, remove stale metric previously added --- core/src/telemetry/metrics.rs | 8 -------- tests/runner.rs | 4 ++++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index e3527c7c2..0fdcf11d8 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -65,7 +65,6 @@ struct Instruments { sticky_cache_miss: Counter, sticky_cache_size: Gauge, sticky_cache_forced_evictions: Counter, - last_successful_poll_time: HistogramDuration, } impl MetricsContext { @@ -517,11 +516,6 @@ impl Instruments { description: "Count of evictions of cached workflows".into(), unit: "".into(), }), - last_successful_poll_time: meter.histogram_duration(MetricParameters { - name: "last_successful_poll_time".into(), - unit: "duration".into(), - description: "Timestamp of the last successful poll time".into(), - }), } } @@ -594,8 +588,6 @@ impl Instruments { .update_attributes(new_attributes.clone()); self.sticky_cache_forced_evictions .update_attributes(new_attributes.clone()); - self.last_successful_poll_time - .update_attributes(new_attributes.clone()); } } diff --git a/tests/runner.rs b/tests/runner.rs index f2d843968..af763cb4a 100644 --- a/tests/runner.rs +++ b/tests/runner.rs @@ -121,6 +121,10 @@ async fn main() -> Result<(), anyhow::Error> { "system.enableDeploymentVersions=true".to_owned(), "--dynamic-config-value".to_owned(), "component.nexusoperations.recordCancelRequestCompletionEvents=true".to_owned(), + "--dynamic-config-value".to_owned(), + "frontend.WorkerHeartbeatsEnabled=true".to_owned(), + 
"--dynamic-config-value".to_owned(), + "frontend.ListWorkersEnabled=true".to_owned(), "--http-port".to_string(), "7243".to_string(), "--search-attribute".to_string(), From d9567841fac346fcefccf82358ce700fa3e2ac9b Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 7 Oct 2025 14:41:55 -0700 Subject: [PATCH 09/23] Should not expect heartbeat nexus worker in metrics for non-heartbeating integ test --- tests/integ_tests/metrics_tests.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/integ_tests/metrics_tests.rs b/tests/integ_tests/metrics_tests.rs index dc7caa812..901e7c178 100644 --- a/tests/integ_tests/metrics_tests.rs +++ b/tests/integ_tests/metrics_tests.rs @@ -762,15 +762,8 @@ async fn docker_metrics_with_prometheus( assert!(!data.is_empty(), "No metrics found for query: {test_uid}"); assert_eq!(data[0]["metric"]["exported_job"], "temporal-core-sdk"); assert_eq!(data[0]["metric"]["job"], "otel-collector"); - // Worker heartbeating nexus worker assert!( data[0]["metric"]["task_queue"] - .as_str() - .unwrap() - .starts_with("temporal-sys/worker-commands/default/") - ); - assert!( - data[1]["metric"]["task_queue"] .as_str() .unwrap() .starts_with(test_name) From 98e778dd3dfe92dfa76a969b8bc4153c30386d51 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 8 Oct 2025 12:36:45 -0700 Subject: [PATCH 10/23] recv_timeout instead of thread::sleep, use WorkflowService::list_workers directly, WithLabel API improvement --- client/src/lib.rs | 23 --------- core-api/src/telemetry/metrics.rs | 37 ++++++++------ core-api/src/worker.rs | 4 +- core/src/worker/mod.rs | 12 ++--- core/src/worker/tuner/resource_based.rs | 26 ++++------ tests/integ_tests/worker_heartbeat_tests.rs | 54 ++++++++++++++++----- 6 files changed, 80 insertions(+), 76 deletions(-) diff --git a/client/src/lib.rs b/client/src/lib.rs index 4c824a75f..4d6088f87 100644 --- a/client/src/lib.rs +++ b/client/src/lib.rs @@ -1227,14 +1227,6 @@ pub trait WorkflowClientTrait: NamespacedClient { query: String, ) -> Result; - /// List workers registered with server through worker heartbeats - async fn list_workers( - &self, - page_size: i32, - next_page_token: Vec, - query: String, - ) -> Result; - /// Get Cluster Search Attributes async fn get_search_attributes(&self) -> Result; @@ -1801,21 +1793,6 @@ where .into_inner()) } - async fn list_workers( - &self, - page_size: i32, - next_page_token: Vec, - query: String, - ) -> Result { - Ok(WorkflowService::list_workers(&mut self.clone(), - ListWorkersRequest { - namespace: self.namespace().to_owned(), - page_size, - next_page_token, - query, - }).await?.into_inner()) - } - async fn get_search_attributes(&self) -> Result { Ok(WorkflowService::get_search_attributes(&mut self.clone(), GetSearchAttributesRequest {}, diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 4599c162f..9bbe4ba2f 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -28,7 +28,7 @@ pub trait CoreMeter: Send + Sync + Debug { ) -> MetricAttributes; fn counter(&self, params: MetricParameters) -> Counter; - /// Create a counter with in-memory tracking for dual metrics reporting + /// Create a counter with in-memory tracking for worker heartbeating reporting fn counter_with_in_memory( &self, params: MetricParameters, @@ -47,19 +47,19 @@ pub trait CoreMeter: Send + Sync + Debug { /// accordingly. 
fn histogram_duration(&self, params: MetricParameters) -> HistogramDuration; - /// Create a histogram duration with in-memory tracking for dual metrics reporting + /// Create a histogram duration with in-memory tracking for worker heartbeating reporting fn histogram_duration_with_in_memory( &self, params: MetricParameters, in_memory_hist: HeartbeatMetricType, ) -> HistogramDuration { - let primary_hist = self.histogram_duration(params.clone()); + let primary_hist = self.histogram_duration(params); HistogramDuration::new_with_in_memory(primary_hist.primary.metric.clone(), in_memory_hist) } fn gauge(&self, params: MetricParameters) -> Gauge; - /// Create a gauge with in-memory tracking for dual metrics reporting + /// Create a gauge with in-memory tracking for worker heartbeating reporting fn gauge_with_in_memory( &self, params: MetricParameters, @@ -78,7 +78,10 @@ pub trait CoreMeter: Send + Sync + Debug { #[derive(Clone, Debug)] pub enum HeartbeatMetricType { Individual(Arc), - WithLabel(HashMap>), + WithLabel { + label_key: String, + metrics: HashMap>, + }, } impl HeartbeatMetricType { @@ -87,7 +90,7 @@ impl HeartbeatMetricType { HeartbeatMetricType::Individual(metric) => { metric.fetch_add(delta, Ordering::Relaxed); } - HeartbeatMetricType::WithLabel(_) => { + HeartbeatMetricType::WithLabel { .. } => { dbg_panic!("Counter does not support in-memory metric with labels"); } } @@ -98,7 +101,7 @@ impl HeartbeatMetricType { HeartbeatMetricType::Individual(metric) => { metric.fetch_add(1, Ordering::Relaxed); } - HeartbeatMetricType::WithLabel(_) => { + HeartbeatMetricType::WithLabel { .. } => { dbg_panic!("Histogram does not support in-memory metric with labels"); } } @@ -109,11 +112,11 @@ impl HeartbeatMetricType { HeartbeatMetricType::Individual(metric) => { metric.store(value, Ordering::Relaxed); } - HeartbeatMetricType::WithLabel(metrics) => { - if let Some(label_value) = label_value_from_attributes(attributes, "poller_type") { - if let Some(metric) = metrics.get(label_value.as_str()) { - metric.store(value, Ordering::Relaxed); - } + HeartbeatMetricType::WithLabel { label_key, metrics } => { + if let Some(metric) = label_value_from_attributes(attributes, label_key.as_str()) + .and_then(|label_value| metrics.get(label_value.as_str())) + { + metric.store(value, Ordering::Relaxed) } } } @@ -192,7 +195,10 @@ impl WorkerHeartbeatMetrics { "sticky_cache_miss" => Some(HeartbeatMetricType::Individual( self.total_sticky_cache_miss.clone(), )), - "num_pollers" => Some(HeartbeatMetricType::WithLabel(self.num_pollers.as_map())), + "num_pollers" => Some(HeartbeatMetricType::WithLabel { + label_key: "poller_type".to_string(), + metrics: self.num_pollers.as_map(), + }), "workflow_task_execution_failed" => Some(HeartbeatMetricType::Individual( self.workflow_task_execution_failed.clone(), )), @@ -1085,7 +1091,10 @@ mod tests { let value = Arc::new(AtomicU64::new(0)); let mut metrics = HashMap::new(); metrics.insert("workflow_task".to_string(), value.clone()); - let heartbeat_metric = HeartbeatMetricType::WithLabel(metrics); + let heartbeat_metric = HeartbeatMetricType::WithLabel { + label_key: "poller_type".to_string(), + metrics, + }; heartbeat_metric.record_gauge(3, &attrs); diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index 817ab1e90..8fffb3eb0 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -1,6 +1,6 @@ use crate::{errors::WorkflowErrorType, telemetry::metrics::TemporalMeter}; use std::{ - any::{Any, type_name}, + any::Any, collections::{HashMap, HashSet}, 
str::FromStr, sync::Arc, @@ -366,7 +366,7 @@ pub trait SlotSupplier { /// Returns a human-friendly identifier describing this supplier implementation for /// diagnostics and telemetry. fn slot_supplier_kind(&self) -> String { - type_name::().to_string() + "Custom".to_string() } } diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 5397ecee7..3b0e362cf 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -1103,14 +1103,10 @@ impl WorkerHeartbeatManager { total_sticky_cache_miss: 0, current_sticky_cache_size: 0, - // sdk_name, sdk_version, and worker_identity must be set by + // Some fields like sdk_name, sdk_version, and worker_identity, must be set by // SharedNamespaceWorker because they rely on the client, and // need to be pulled from the current client used by SharedNamespaceWorker - worker_identity: String::new(), - heartbeat_time: None, - elapsed_since_last_heartbeat: None, - sdk_name: String::new(), - sdk_version: String::new(), + ..Default::default() }; if let Some(in_mem) = heartbeat_manager_metrics.in_mem_metrics.as_ref() { @@ -1260,8 +1256,8 @@ where current_available_slots: avail, current_used_slots: *permits.borrow() as i32, slot_supplier_kind: dealer.slot_supplier_kind().to_string(), - total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MAX), - total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MAX), + total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MIN), + total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MIN), // Filled in by heartbeat later last_interval_processed_tasks: 0, diff --git a/core/src/worker/tuner/resource_based.rs b/core/src/worker/tuner/resource_based.rs index b321f2f9c..88606add3 100644 --- a/core/src/worker/tuner/resource_based.rs +++ b/core/src/worker/tuner/resource_based.rs @@ -1,10 +1,11 @@ use crossbeam_utils::atomic::AtomicCell; use parking_lot::Mutex; +use std::sync::mpsc; use std::{ marker::PhantomData, sync::{ Arc, OnceLock, - atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, + atomic::{AtomicU64, AtomicUsize, Ordering}, }, thread, time::{Duration, Instant}, @@ -94,12 +95,6 @@ impl ResourceBasedTuner { self } - /// Set sys info - pub fn with_sys_info(&mut self, sys_info: Arc) -> &mut Self { - self.sys_info = sys_info; - self - } - /// Get sys info pub fn sys_info(&self) -> Arc { self.sys_info.clone() @@ -526,7 +521,7 @@ impl RealSysInfoInner { /// Tracks host resource usage by refreshing metrics on a background thread. 
pub struct RealSysInfo { inner: Arc, - shutdown: Arc, + shutdown_tx: mpsc::Sender<()>, shutdown_handle: Mutex>>, } @@ -543,26 +538,25 @@ impl RealSysInfo { }); inner.refresh(); - let shutdown = Arc::new(AtomicBool::new(false)); - let thread_inner = inner.clone(); - let thread_shutdown = shutdown.clone(); + let thread_clone = inner.clone(); + let (tx, rx) = mpsc::channel::<()>(); let handle = thread::Builder::new() .name("temporal-real-sysinfo".to_string()) .spawn(move || { const REFRESH_INTERVAL: Duration = Duration::from_millis(100); loop { - if thread_shutdown.load(Ordering::Acquire) { + thread_clone.refresh(); + let r = rx.recv_timeout(REFRESH_INTERVAL); + if matches!(r, Err(mpsc::RecvTimeoutError::Disconnected)) || r.is_ok() { return; } - thread_inner.refresh(); - thread::sleep(REFRESH_INTERVAL); } }) .expect("failed to spawn RealSysInfo refresh thread"); Self { inner, - shutdown, + shutdown_tx: tx, shutdown_handle: Mutex::new(Some(handle)), } } @@ -584,7 +578,7 @@ impl SystemResourceInfo for RealSysInfo { impl Drop for RealSysInfo { fn drop(&mut self) { - self.shutdown.store(true, Ordering::Release); + let _res = self.shutdown_tx.send(()); if let Some(handle) = self.shutdown_handle.lock().take() { let _ = handle.join(); } diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 1acfaf091..6f76e40fd 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -3,7 +3,7 @@ use prost_types::Duration as PbDuration; use prost_types::Timestamp; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use temporal_client::WorkflowClientTrait; +use temporal_client::{NamespacedClient, WorkflowService}; use temporal_sdk::{ActContext, ActivityOptions, WfContext}; use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; use temporal_sdk_core::{ @@ -15,6 +15,7 @@ use temporal_sdk_core_api::telemetry::{ use temporal_sdk_core_protos::coresdk::AsJsonPayloadExt; use temporal_sdk_core_protos::temporal::api::deployment::v1::WorkerDeploymentVersion; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::ListWorkersRequest; use url::Url; fn within_two_minutes_ts(ts: Timestamp) -> bool { @@ -91,10 +92,19 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) worker.run_until_done().await.unwrap(); let client = starter.get_client().await; - let workers_list = client - .list_workers(100, Vec::new(), String::new()) - .await - .unwrap(); + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); // Since list_workers finds all workers in the namespace, must find specific worker used in this // test let worker_info = workers_list @@ -194,10 +204,19 @@ async fn docker_worker_heartbeat_tuner() { worker.run_until_done().await.unwrap(); let client = starter.get_client().await; - let workers_list = client - .list_workers(100, Vec::new(), String::new()) - .await - .unwrap(); + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: 
String::new(), + }, + ) + .await + .unwrap() + .into_inner(); // Since list_workers finds all workers in the namespace, must find specific worker used in this // test let worker_info = workers_list @@ -283,10 +302,19 @@ async fn docker_worker_heartbeat_no_metrics() { worker.run_until_done().await.unwrap(); let client = starter.get_client().await; - let workers_list = client - .list_workers(100, Vec::new(), String::new()) - .await - .unwrap(); + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); // Since list_workers finds all workers in the namespace, must find specific worker used in this // test let worker_info = workers_list From 96e0186664d7398a5f46bd1869ee64a373a056fd Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 8 Oct 2025 16:19:57 -0700 Subject: [PATCH 11/23] MetricAttributes::NoOp, add mechanism to ignore dupe workers for testing, more tests --- client/src/lib.rs | 2 +- client/src/raw.rs | 9 + client/src/worker_registry/mod.rs | 26 +- core-api/src/lib.rs | 2 +- core-api/src/telemetry/metrics.rs | 108 ++-- core-c-bridge/src/metric.rs | 23 +- core/src/telemetry/metrics.rs | 8 +- core/src/worker/client/mocks.rs | 2 +- core/src/worker/mod.rs | 74 ++- .../api_upstream/openapi/openapiv2.json | 259 ++++++++- .../api_upstream/openapi/openapiv3.yaml | 239 ++++++++- .../temporal/api/common/v1/message.proto | 2 +- .../temporal/api/deployment/v1/message.proto | 6 + .../temporal/api/namespace/v1/message.proto | 8 +- .../workflowservice/v1/request_response.proto | 62 ++- .../api/workflowservice/v1/service.proto | 36 +- tests/integ_tests/worker_heartbeat_tests.rs | 495 ++++++++++++++---- 17 files changed, 1102 insertions(+), 259 deletions(-) diff --git a/client/src/lib.rs b/client/src/lib.rs index 4d6088f87..8b54aa1a6 100644 --- a/client/src/lib.rs +++ b/client/src/lib.rs @@ -591,7 +591,7 @@ impl ClientOptions { client: TemporalServiceClient::new(svc), options: Arc::new(self.clone()), capabilities: None, - workers: Arc::new(ClientWorkerSet::new()), + workers: Arc::new(ClientWorkerSet::new(false)), }; if !self.skip_get_system_info { match client diff --git a/client/src/raw.rs b/client/src/raw.rs index 92e6a7956..f461de84d 100644 --- a/client/src/raw.rs +++ b/client/src/raw.rs @@ -1345,6 +1345,15 @@ proxier! { r.extensions_mut().insert(labels); } ); + ( + describe_worker, + DescribeWorkerRequest, + DescribeWorkerResponse, + |r| { + let labels = namespaced_request!(r); + r.extensions_mut().insert(labels); + } + ); ( record_worker_heartbeat, RecordWorkerHeartbeatRequest, diff --git a/client/src/worker_registry/mod.rs b/client/src/worker_registry/mod.rs index 83f5e270c..f97adab1a 100644 --- a/client/src/worker_registry/mod.rs +++ b/client/src/worker_registry/mod.rs @@ -46,15 +46,19 @@ struct ClientWorkerSetImpl { all_workers: HashMap>, /// Maps namespace to shared worker for worker heartbeating shared_worker: HashMap>, + /// Disables erroring when multiple workers on the same namespace+task queue are registered. + /// This is used with testing, where multiple tests run in parallel on the same client + disable_dupe_check: bool, } impl ClientWorkerSetImpl { /// Factory method. 
- fn new() -> Self { + fn new(disable_dupe_check: bool) -> Self { Self { slot_providers: Default::default(), all_workers: Default::default(), shared_worker: Default::default(), + disable_dupe_check, } } @@ -81,7 +85,7 @@ impl ClientWorkerSetImpl { worker.namespace().to_string(), worker.task_queue().to_string(), ); - if self.slot_providers.contains_key(&slot_key) { + if self.slot_providers.contains_key(&slot_key) && !self.disable_dupe_check { bail!( "Registration of multiple workers on the same namespace and task queue for the same client not allowed: {slot_key:?}, worker_instance_key: {:?}.", worker.worker_instance_key() @@ -182,16 +186,16 @@ pub struct ClientWorkerSet { impl Default for ClientWorkerSet { fn default() -> Self { - Self::new() + Self::new(false) } } impl ClientWorkerSet { /// Factory method. - pub fn new() -> Self { + pub fn new(disable_dupe_check: bool) -> Self { Self { worker_grouping_key: Uuid::new_v4(), - worker_manager: RwLock::new(ClientWorkerSetImpl::new()), + worker_manager: RwLock::new(ClientWorkerSetImpl::new(disable_dupe_check)), } } @@ -270,7 +274,7 @@ pub trait ClientWorker: Send + Sync { fn try_reserve_wft_slot(&self) -> Option>; /// Unique identifier for this worker instance. - /// This must be stable across the worker's lifetime but unique per instance. + /// This must be stable across the worker's lifetime and unique per instance. fn worker_instance_key(&self) -> Uuid; /// Indicates if worker heartbeating is enabled for this client worker. @@ -331,7 +335,7 @@ mod tests { #[test] fn registry_keeps_one_provider_per_namespace() { - let manager = ClientWorkerSet::new(); + let manager = ClientWorkerSet::new(false); let mut worker_keys = vec![]; let mut successful_registrations = 0; @@ -461,7 +465,7 @@ mod tests { #[test] fn duplicate_namespace_task_queue_registration_fails() { - let manager = ClientWorkerSet::new(); + let manager = ClientWorkerSet::new(false); let worker1 = new_mock_provider_with_heartbeat( "test_namespace".to_string(), @@ -500,7 +504,7 @@ mod tests { #[test] fn multiple_workers_same_namespace_share_heartbeat_manager() { - let manager = ClientWorkerSet::new(); + let manager = ClientWorkerSet::new(false); let worker1 = new_mock_provider_with_heartbeat( "shared_namespace".to_string(), @@ -533,7 +537,7 @@ mod tests { #[test] fn different_namespaces_get_separate_heartbeat_managers() { - let manager = ClientWorkerSet::new(); + let manager = ClientWorkerSet::new(false); let worker1 = new_mock_provider_with_heartbeat( "namespace1".to_string(), "queue1".to_string(), @@ -561,7 +565,7 @@ mod tests { #[test] fn unregister_heartbeat_workers_cleans_up_shared_worker_when_last_removed() { - let manager = ClientWorkerSet::new(); + let manager = ClientWorkerSet::new(false); // Create two workers with same namespace but different task queues let worker1 = new_mock_provider_with_heartbeat( diff --git a/core-api/src/lib.rs b/core-api/src/lib.rs index f4811f2ff..511c9383f 100644 --- a/core-api/src/lib.rs +++ b/core-api/src/lib.rs @@ -141,7 +141,7 @@ pub trait Worker: Send + Sync { async fn finalize_shutdown(self); /// Unique identifier for this worker instance. - /// This must be stable across the worker's lifetime but unique per instance. + /// This must be stable across the worker's lifetime and unique per instance. 
fn worker_instance_key(&self) -> Uuid; } diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index 9bbe4ba2f..bb276f9bd 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -131,10 +131,7 @@ fn label_value_from_attributes(attributes: &MetricAttributes, key: &str) -> Opti .iter() .find(|kv| kv.key.as_str() == key) .map(|kv| kv.value.to_string()), - MetricAttributes::Buffer(buffer_attrs) => buffer_attrs.get().label_value(key), - MetricAttributes::Dynamic(custom_metrics_attribute) => { - custom_metrics_attribute.label_value(key) - } + MetricAttributes::NoOp(labels) => labels.get(key).cloned(), _ => None, } } @@ -167,12 +164,36 @@ impl NumPollersMetric { } } +#[derive(Default, Debug)] +pub struct SlotMetrics { + pub workflow_worker: Arc, + pub activity_worker: Arc, + pub nexus_worker: Arc, + pub local_activity_worker: Arc, +} + +impl SlotMetrics { + pub fn as_map(&self) -> HashMap> { + HashMap::from([ + ("WorkflowWorker".to_string(), self.workflow_worker.clone()), + ("ActivityWorker".to_string(), self.activity_worker.clone()), + ("NexusWorker".to_string(), self.nexus_worker.clone()), + ( + "LocalActivityWorker".to_string(), + self.local_activity_worker.clone(), + ), + ]) + } +} + #[derive(Default, Debug)] pub struct WorkerHeartbeatMetrics { pub sticky_cache_size: Arc, pub total_sticky_cache_hit: Arc, pub total_sticky_cache_miss: Arc, pub num_pollers: NumPollersMetric, + pub worker_task_slots_used: SlotMetrics, + pub worker_task_slots_available: SlotMetrics, pub workflow_task_execution_failed: Arc, pub activity_execution_failed: Arc, pub nexus_task_execution_failed: Arc, @@ -199,6 +220,14 @@ impl WorkerHeartbeatMetrics { label_key: "poller_type".to_string(), metrics: self.num_pollers.as_map(), }), + "worker_task_slots_used" => Some(HeartbeatMetricType::WithLabel { + label_key: "worker_type".to_string(), + metrics: self.worker_task_slots_used.as_map(), + }), + "worker_task_slots_available" => Some(HeartbeatMetricType::WithLabel { + label_key: "worker_type".to_string(), + metrics: self.worker_task_slots_available.as_map(), + }), "workflow_task_execution_failed" => Some(HeartbeatMetricType::Individual( self.workflow_task_execution_failed.clone(), )), @@ -315,6 +344,7 @@ pub enum MetricAttributes { }, Buffer(BufferAttributes), Dynamic(Arc), + NoOp(Arc>), Empty, } @@ -323,11 +353,6 @@ pub trait CustomMetricAttributes: Debug + Send + Sync { /// Must be implemented to work around existing type system restrictions, see /// [here](https://internals.rust-lang.org/t/downcast-not-from-any-but-from-any-trait/16736/12) fn as_any(self: Arc) -> Arc; - - /// Return the stringified value for a label key, if available. 
- fn label_value(&self, _key: &str) -> Option { - None - } } /// Options that are attached to metrics on a per-call basis @@ -351,6 +376,16 @@ where } } +impl From for HashMap { + fn from(value: NewAttributes) -> Self { + value + .attributes + .into_iter() + .map(|kv| (kv.key, kv.value.to_string())) + .collect() + } +} + /// A K/V pair that can be used to label a specific recording of a metric #[derive(Clone, Debug, PartialEq)] pub struct MetricKeyValue { @@ -953,9 +988,7 @@ impl LazyRef { pub struct NoOpCoreMeter; impl CoreMeter for NoOpCoreMeter { fn new_attributes(&self, attribs: NewAttributes) -> MetricAttributes { - MetricAttributes::Dynamic(Arc::new(InMemoryMetricAttributes::from_new_attributes( - attribs.attributes, - ))) + MetricAttributes::NoOp(Arc::new(attribs.into())) } fn extend_attributes( @@ -963,16 +996,14 @@ impl CoreMeter for NoOpCoreMeter { existing: MetricAttributes, attribs: NewAttributes, ) -> MetricAttributes { - let new_attrs = InMemoryMetricAttributes::from_new_attributes(attribs.attributes); - let merged = match existing { - MetricAttributes::Dynamic(attrs) => attrs - .as_any() - .downcast_ref::() - .map(|in_mem| in_mem.merge(&new_attrs)) - .unwrap_or(new_attrs), - _ => new_attrs, - }; - MetricAttributes::Dynamic(Arc::new(merged)) + if let MetricAttributes::NoOp(labels) = existing { + let mut labels = (*labels).clone(); + labels.extend::>(attribs.into()); + MetricAttributes::NoOp(Arc::new(labels)) + } else { + dbg_panic!("Must use NoOp attributes with a NoOp metric implementation"); + existing + } } fn counter(&self, _: MetricParameters) -> Counter { @@ -1035,39 +1066,6 @@ impl_no_op!(HistogramDurationBase, Duration); impl_no_op!(GaugeBase, u64); impl_no_op!(GaugeF64Base, f64); -#[derive(Debug, Clone, Default)] -struct InMemoryMetricAttributes { - labels: HashMap, -} - -impl InMemoryMetricAttributes { - fn from_new_attributes(attributes: Vec) -> Self { - let mut labels = HashMap::new(); - for kv in attributes { - labels.insert(kv.key, kv.value.to_string()); - } - Self { labels } - } - - fn merge(&self, other: &InMemoryMetricAttributes) -> Self { - let mut labels = self.labels.clone(); - for (key, value) in &other.labels { - labels.insert(key.clone(), value.clone()); - } - Self { labels } - } -} - -impl CustomMetricAttributes for InMemoryMetricAttributes { - fn as_any(self: Arc) -> Arc { - self as Arc - } - - fn label_value(&self, key: &str) -> Option { - self.labels.get(key).cloned() - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/core-c-bridge/src/metric.rs b/core-c-bridge/src/metric.rs index 9d06aab00..92e46b423 100644 --- a/core-c-bridge/src/metric.rs +++ b/core-c-bridge/src/metric.rs @@ -1,5 +1,5 @@ use crate::{ByteArrayRef, runtime::Runtime}; -use std::{any::Any, collections::HashMap, error::Error, sync::Arc, time::Duration}; +use std::{any::Any, error::Error, sync::Arc, time::Duration}; use temporal_sdk_core_api::telemetry::metrics; pub struct MetricMeter { @@ -381,21 +381,16 @@ impl CustomMetricMeterRef { ) -> metrics::MetricAttributes { unsafe { let meter = &*(self.meter_impl.0); - let (append_from, mut label_cache) = match append_from { + let append_from = match append_from { Some(metrics::MetricAttributes::Dynamic(v)) => { - let existing = v - .clone() + v.clone() .as_any() .downcast::() - .expect("Attributes not CustomMetricAttributes as expected"); - (existing.attributes, existing.labels.as_ref().clone()) + .expect("Attributes not CustomMetricAttributes as expected") + .attributes } - _ => (std::ptr::null(), HashMap::new()), + _ => 
std::ptr::null(), }; - for kv in &attribs.attributes { - label_cache.insert(kv.key.clone(), kv.value.to_string()); - } - let label_cache = Arc::new(label_cache); // Build a set of CustomMetricAttributes with _references_ to the // pieces in attribs. We count on both this vec and the attribs vec // living beyond the callback invocation. @@ -440,7 +435,6 @@ impl CustomMetricMeterRef { metrics::MetricAttributes::Dynamic(Arc::new(CustomMetricAttributes { meter_impl: self.meter_impl.clone(), attributes: raw_attrs, - labels: label_cache, })) } } @@ -485,7 +479,6 @@ impl Drop for CustomMetricMeterImpl { struct CustomMetricAttributes { meter_impl: Arc, attributes: *const libc::c_void, - labels: Arc>, } unsafe impl Send for CustomMetricAttributes {} @@ -495,10 +488,6 @@ impl metrics::CustomMetricAttributes for CustomMetricAttributes { fn as_any(self: Arc) -> Arc { self as Arc } - - fn label_value(&self, key: &str) -> Option { - self.labels.get(key).cloned() - } } impl Drop for CustomMetricAttributes { diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index 0fdcf11d8..d39bf02b0 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -484,12 +484,12 @@ impl Instruments { description: "Current number of active pollers per queue type".into(), unit: "".into(), }), - task_slots_available: meter.gauge(MetricParameters { + task_slots_available: gauge_with_in_mem(MetricParameters { name: TASK_SLOTS_AVAILABLE_NAME.into(), description: "Current number of available slots per task type".into(), unit: "".into(), }), - task_slots_used: meter.gauge(MetricParameters { + task_slots_used: gauge_with_in_mem(MetricParameters { name: TASK_SLOTS_USED_NAME.into(), description: "Current number of used slots per task type".into(), unit: "".into(), @@ -1121,10 +1121,6 @@ mod tests { fn as_any(self: Arc) -> Arc { self as Arc } - - fn label_value(&self, _key: &str) -> Option { - None - } } impl DummyCustomAttrs { fn as_id(ba: &BufferAttributes) -> usize { diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index b86334e4b..addd09708 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, LazyLock}; use temporal_client::ClientWorkerSet; pub(crate) static DEFAULT_WORKERS_REGISTRY: LazyLock> = - LazyLock::new(|| Arc::new(ClientWorkerSet::new())); + LazyLock::new(|| Arc::new(ClientWorkerSet::new(true))); pub(crate) static DEFAULT_TEST_CAPABILITIES: &Capabilities = &Capabilities { signal_and_query_header: true, diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 3b0e362cf..a5037cb38 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -56,6 +56,7 @@ use futures_util::{StreamExt, stream}; use gethostname::gethostname; use parking_lot::{Mutex, RwLock}; use slot_provider::SlotProvider; +use std::sync::atomic::AtomicU64; use std::time::SystemTime; use std::{ convert::TryInto, @@ -1062,6 +1063,7 @@ impl WorkerHeartbeatManager { telemetry_instance: Option, heartbeat_manager_metrics: HeartbeatMetrics, ) -> Self { + let start_time = Some(SystemTime::now().into()); let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { let deployment_version = config.computed_deployment_version().map(|dv| { deployment::v1::WorkerDeploymentVersion { @@ -1087,23 +1089,11 @@ impl WorkerHeartbeatManager { deployment_version, status: (*heartbeat_manager_metrics.status.lock()) as i32, - start_time: Some(SystemTime::now().into()), + start_time, plugins: config.plugins.clone(), - // 
Metrics dependent, set below - workflow_task_slots_info: None, - activity_task_slots_info: None, - nexus_task_slots_info: None, - local_activity_slots_info: None, - workflow_poller_info: None, - workflow_sticky_poller_info: None, - activity_poller_info: None, - nexus_poller_info: None, - total_sticky_cache_hit: 0, - total_sticky_cache_miss: 0, - current_sticky_cache_size: 0, - - // Some fields like sdk_name, sdk_version, and worker_identity, must be set by + // Some Metrics dependent fields are set below, and + // some fields like sdk_name, sdk_version, and worker_identity, must be set by // SharedNamespaceWorker because they rely on the client, and // need to be pulled from the current client used by SharedNamespaceWorker ..Default::default() @@ -1164,31 +1154,34 @@ impl WorkerHeartbeatManager { worker_heartbeat.workflow_task_slots_info = make_slots_info( &heartbeat_manager_metrics.wft_slots, - in_mem - .workflow_task_execution_latency - .load(Ordering::Relaxed), - in_mem - .workflow_task_execution_failed - .load(Ordering::Relaxed), + in_mem.worker_task_slots_available.workflow_worker.clone(), + in_mem.worker_task_slots_used.workflow_worker.clone(), + in_mem.workflow_task_execution_latency.clone(), + in_mem.workflow_task_execution_failed.clone(), ); worker_heartbeat.activity_task_slots_info = make_slots_info( &heartbeat_manager_metrics.act_slots, - in_mem.activity_execution_latency.load(Ordering::Relaxed), - in_mem.activity_execution_failed.load(Ordering::Relaxed), + in_mem.worker_task_slots_available.activity_worker.clone(), + in_mem.worker_task_slots_used.activity_worker.clone(), + in_mem.activity_execution_latency.clone(), + in_mem.activity_execution_failed.clone(), ); worker_heartbeat.nexus_task_slots_info = make_slots_info( &heartbeat_manager_metrics.nexus_slots, - in_mem.nexus_task_execution_latency.load(Ordering::Relaxed), - in_mem.nexus_task_execution_failed.load(Ordering::Relaxed), + in_mem.worker_task_slots_available.nexus_worker.clone(), + in_mem.worker_task_slots_used.nexus_worker.clone(), + in_mem.nexus_task_execution_latency.clone(), + in_mem.nexus_task_execution_failed.clone(), ); worker_heartbeat.local_activity_slots_info = make_slots_info( &heartbeat_manager_metrics.la_slots, in_mem - .local_activity_execution_latency - .load(Ordering::Relaxed), - in_mem - .local_activity_execution_failed - .load(Ordering::Relaxed), + .worker_task_slots_available + .local_activity_worker + .clone(), + in_mem.worker_task_slots_used.local_activity_worker.clone(), + in_mem.local_activity_execution_latency.clone(), + in_mem.local_activity_execution_failed.clone(), ); } worker_heartbeat @@ -1241,23 +1234,22 @@ fn wft_poller_behavior(config: &WorkerConfig, is_sticky: bool) -> PollerBehavior fn make_slots_info( dealer: &MeteredPermitDealer, - total_processed: u64, - total_failed: u64, + slots_available: Arc, + slots_used: Arc, + total_processed: Arc, + total_failed: Arc, ) -> Option where SK: SlotKind + 'static, { - let permits = dealer.get_extant_count_rcv(); - let avail = dealer - .available_permits() - .map_or(-1, |e| i32::try_from(e).unwrap_or(-1)); - Some(WorkerSlotsInfo { - current_available_slots: avail, - current_used_slots: *permits.borrow() as i32, + current_available_slots: i32::try_from(slots_available.load(Ordering::Relaxed)) + .unwrap_or(-1), + current_used_slots: i32::try_from(slots_used.load(Ordering::Relaxed)).unwrap_or(-1), slot_supplier_kind: dealer.slot_supplier_kind().to_string(), - total_processed_tasks: i32::try_from(total_processed).unwrap_or(i32::MIN), - 
total_failed_tasks: i32::try_from(total_failed).unwrap_or(i32::MIN), + total_processed_tasks: i32::try_from(total_processed.load(Ordering::Relaxed)) + .unwrap_or(i32::MIN), + total_failed_tasks: i32::try_from(total_failed.load(Ordering::Relaxed)).unwrap_or(i32::MIN), // Filled in by heartbeat later last_interval_processed_tasks: 0, diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json index 8591cb0be..cfed16ffd 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json @@ -2130,6 +2130,51 @@ ] } }, + "/api/v1/namespaces/{namespace}/worker-deployments/{deploymentName}/set-manager": { + "post": { + "summary": "Set/unset the ManagerIdentity of a Worker Deployment.\nExperimental. This API might significantly change or be removed in a future release.", + "operationId": "SetWorkerDeploymentManager2", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1SetWorkerDeploymentManagerResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "deploymentName", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "body", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/WorkflowServiceSetWorkerDeploymentManagerBody" + } + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/api/v1/namespaces/{namespace}/worker-deployments/{deploymentName}/set-ramping-version": { "post": { "summary": "Set/unset the Ramping Version of a Worker Deployment and its ramp percentage. Can be used for\ngradual ramp to unversioned workers too.\nExperimental. This API might significantly change or be removed in a future release.", @@ -2296,6 +2341,45 @@ ] } }, + "/api/v1/namespaces/{namespace}/workers/describe/{workerInstanceKey}": { + "get": { + "summary": "DescribeWorker returns information about the specified worker.", + "operationId": "DescribeWorker2", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1DescribeWorkerResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "description": "Namespace this worker belongs to.", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "workerInstanceKey", + "description": "Worker instance key to describe.", + "in": "path", + "required": true, + "type": "string" + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/api/v1/namespaces/{namespace}/workers/fetch-config": { "post": { "summary": "FetchWorkerConfig returns the worker configuration for a specific worker.", @@ -2822,7 +2906,7 @@ }, "/api/v1/namespaces/{namespace}/workflows/{execution.workflowId}/history-reverse": { "get": { - "summary": "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse \norder (starting from last event). Fails with`NotFound` if the specified workflow execution is \nunknown to the service.", + "summary": "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse\norder (starting from last event). 
Fails with`NotFound` if the specified workflow execution is\nunknown to the service.", "operationId": "GetWorkflowExecutionHistoryReverse2", "responses": { "200": { @@ -5871,6 +5955,51 @@ ] } }, + "/namespaces/{namespace}/worker-deployments/{deploymentName}/set-manager": { + "post": { + "summary": "Set/unset the ManagerIdentity of a Worker Deployment.\nExperimental. This API might significantly change or be removed in a future release.", + "operationId": "SetWorkerDeploymentManager", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1SetWorkerDeploymentManagerResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "deploymentName", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "body", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/WorkflowServiceSetWorkerDeploymentManagerBody" + } + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/namespaces/{namespace}/worker-deployments/{deploymentName}/set-ramping-version": { "post": { "summary": "Set/unset the Ramping Version of a Worker Deployment and its ramp percentage. Can be used for\ngradual ramp to unversioned workers too.\nExperimental. This API might significantly change or be removed in a future release.", @@ -6037,6 +6166,45 @@ ] } }, + "/namespaces/{namespace}/workers/describe/{workerInstanceKey}": { + "get": { + "summary": "DescribeWorker returns information about the specified worker.", + "operationId": "DescribeWorker", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1DescribeWorkerResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "description": "Namespace this worker belongs to.", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "workerInstanceKey", + "description": "Worker instance key to describe.", + "in": "path", + "required": true, + "type": "string" + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/namespaces/{namespace}/workers/fetch-config": { "post": { "summary": "FetchWorkerConfig returns the worker configuration for a specific worker.", @@ -6563,7 +6731,7 @@ }, "/namespaces/{namespace}/workflows/{execution.workflowId}/history-reverse": { "get": { - "summary": "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse \norder (starting from last event). Fails with`NotFound` if the specified workflow execution is \nunknown to the service.", + "summary": "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse\norder (starting from last event). Fails with`NotFound` if the specified workflow execution is\nunknown to the service.", "operationId": "GetWorkflowExecutionHistoryReverse", "responses": { "200": { @@ -7861,7 +8029,7 @@ }, "type": { "type": "string", - "description": "Pause all running activities of this type." + "description": "Pause all running activities of this type.\nNote: Experimental - the behavior of pause by activity type might change in a future release." 
}, "reason": { "type": "string", @@ -8314,10 +8482,37 @@ "ignoreMissingTaskQueues": { "type": "boolean", "description": "Optional. By default this request would be rejected if not all the expected Task Queues are\nbeing polled by the new Version, to protect against accidental removal of Task Queues, or\nworker health issues. Pass `true` here to bypass this protection.\nThe set of expected Task Queues is the set of all the Task Queues that were ever poller by\nthe existing Current Version of the Deployment, with the following exclusions:\n - Task Queues that are not used anymore (inferred by having empty backlog and a task\n add_rate of 0.)\n - Task Queues that are moved to another Worker Deployment (inferred by the Task Queue\n having a different Current Version than the Current Version of this deployment.)\nWARNING: Do not set this flag unless you are sure that the missing task queue pollers are not\nneeded. If the request is unexpectedly rejected due to missing pollers, then that means the\npollers have not reached to the server yet. Only set this if you expect those pollers to\nnever arrive." + }, + "allowNoPollers": { + "type": "boolean", + "description": "Optional. By default this request will be rejected if no pollers have been seen for the proposed\nCurrent Version, in order to protect users from routing tasks to pollers that do not exist, leading\nto possible timeouts. Pass `true` here to bypass this protection." } }, "description": "Set/unset the Current Version of a Worker Deployment." }, + "WorkflowServiceSetWorkerDeploymentManagerBody": { + "type": "object", + "properties": { + "managerIdentity": { + "type": "string", + "description": "Arbitrary value for `manager_identity`.\nEmpty will unset the field." + }, + "self": { + "type": "boolean", + "description": "True will set `manager_identity` to `identity`." + }, + "conflictToken": { + "type": "string", + "format": "byte", + "description": "Optional. This can be the value of conflict_token from a Describe, or another Worker\nDeployment API. Passing a non-nil conflict token will cause this request to fail if the\nDeployment's configuration has been modified between the API call that generated the\ntoken and this one." + }, + "identity": { + "type": "string", + "description": "Required. The identity of the client who initiated this request." + } + }, + "description": "Update the ManagerIdentity of a Worker Deployment." + }, "WorkflowServiceSetWorkerDeploymentRampingVersionBody": { "type": "object", "properties": { @@ -8346,6 +8541,10 @@ "ignoreMissingTaskQueues": { "type": "boolean", "description": "Optional. By default this request would be rejected if not all the expected Task Queues are\nbeing polled by the new Version, to protect against accidental removal of Task Queues, or\nworker health issues. Pass `true` here to bypass this protection.\nThe set of expected Task Queues equals to all the Task Queues ever polled from the existing\nCurrent Version of the Deployment, with the following exclusions:\n - Task Queues that are not used anymore (inferred by having empty backlog and a task\n add_rate of 0.)\n - Task Queues that are moved to another Worker Deployment (inferred by the Task Queue\n having a different Current Version than the Current Version of this deployment.)\nWARNING: Do not set this flag unless you are sure that the missing task queue poller are not\nneeded. If the request is unexpectedly rejected due to missing pollers, then that means the\npollers have not reached to the server yet. 
Only set this if you expect those pollers to\nnever arrive.\nNote: this check only happens when the ramping version is about to change, not every time\nthat the percentage changes. Also note that the check is against the deployment's Current\nVersion, not the previous Ramping Version." + }, + "allowNoPollers": { + "type": "boolean", + "description": "Optional. By default this request will be rejected if no pollers have been seen for the proposed\nCurrent Version, in order to protect users from routing tasks to pollers that do not exist, leading\nto possible timeouts. Pass `true` here to bypass this protection." } }, "description": "Set/unset the Ramping Version of a Worker Deployment and its ramp percentage." @@ -8643,6 +8842,10 @@ "priority": { "$ref": "#/definitions/v1Priority", "title": "Priority metadata" + }, + "eagerWorkerDeploymentOptions": { + "$ref": "#/definitions/v1WorkerDeploymentOptions", + "description": "Deployment Options of the worker who will process the eager task. Passed when `request_eager_execution=true`." } } }, @@ -10875,6 +11078,14 @@ } } }, + "v1DescribeWorkerResponse": { + "type": "object", + "properties": { + "workerInfo": { + "$ref": "#/definitions/v1WorkerInfo" + } + } + }, "v1DescribeWorkflowExecutionResponse": { "type": "object", "properties": { @@ -11198,6 +11409,14 @@ }, "visibilityStore": { "type": "string" + }, + "initialFailoverVersion": { + "type": "string", + "format": "int64" + }, + "failoverVersionIncrement": { + "type": "string", + "format": "int64" } }, "description": "GetClusterInfoResponse contains information about Temporal cluster." @@ -12167,6 +12386,14 @@ "asyncUpdate": { "type": "boolean", "title": "True if the namespace supports async update" + }, + "workerHeartbeats": { + "type": "boolean", + "title": "True if the namespace supports worker heartbeats" + }, + "reportedProblemsSearchAttribute": { + "type": "boolean", + "title": "True if the namespace supports reported problems search attribute" } }, "description": "Namespace capability details. Should contain what features are enabled in a namespace." @@ -13154,7 +13381,7 @@ "priorityKey": { "type": "integer", "format": "int32", - "description": "Priority key is a positive integer from 1 to n, where smaller integers\ncorrespond to higher priorities (tasks run sooner). In general, tasks in\na queue should be processed in close to priority order, although small\ndeviations are possible.\n\nThe maximum priority value (minimum priority) is determined by server\nconfiguration, and defaults to 5.\n\nIf priority is not present (or zero), then the effective priority will be\nthe default priority, which is is calculated by (min+max)/2. With the\ndefault max of 5, and min of 1, that comes out to 3." + "description": "Priority key is a positive integer from 1 to n, where smaller integers\ncorrespond to higher priorities (tasks run sooner). In general, tasks in\na queue should be processed in close to priority order, although small\ndeviations are possible.\n\nThe maximum priority value (minimum priority) is determined by server\nconfiguration, and defaults to 5.\n\nIf priority is not present (or zero), then the effective priority will be\nthe default priority, which is calculated by (min+max)/2. With the\ndefault max of 5, and min of 1, that comes out to 3." 
}, "fairnessKey": { "type": "string", @@ -14343,6 +14570,20 @@ } } }, + "v1SetWorkerDeploymentManagerResponse": { + "type": "object", + "properties": { + "conflictToken": { + "type": "string", + "format": "byte", + "description": "This value is returned so that it can be optionally passed to APIs\nthat write to the Worker Deployment state to ensure that the state\ndid not change between this API call and a future write." + }, + "previousManagerIdentity": { + "type": "string", + "description": "What the `manager_identity` field was before this change." + } + } + }, "v1SetWorkerDeploymentRampingVersionResponse": { "type": "object", "properties": { @@ -14891,6 +15132,10 @@ "priority": { "$ref": "#/definitions/v1Priority", "title": "Priority metadata" + }, + "eagerWorkerDeploymentOptions": { + "$ref": "#/definitions/v1WorkerDeploymentOptions", + "description": "Deployment Options of the worker who will process the eager task. Passed when `request_eager_execution=true`." } } }, @@ -15453,7 +15698,7 @@ "additionalProperties": { "type": "string" }, - "description": "A key-value map for any customized purpose.\nIf data already exists on the namespace, \nthis will merge with the existing key values." + "description": "A key-value map for any customized purpose.\nIf data already exists on the namespace,\nthis will merge with the existing key values." }, "state": { "$ref": "#/definitions/v1NamespaceState", @@ -15812,6 +16057,10 @@ "lastModifierIdentity": { "type": "string", "description": "Identity of the last client who modified the configuration of this Deployment. Set to the\n`identity` value sent by APIs such as `SetWorkerDeploymentCurrentVersion` and\n`SetWorkerDeploymentRampingVersion`." + }, + "managerIdentity": { + "type": "string", + "description": "Identity of the client that has the exclusive right to make changes to this Worker Deployment.\nEmpty by default.\nIf this is set, clients whose identity does not match `manager_identity` will not be able to make changes\nto this Worker Deployment. They can either set their own identity as the manager or unset the field to proceed." } }, "description": "A Worker Deployment (Deployment, for short) represents all workers serving \na shared set of Task Queues. Typically, a Deployment represents one service or \napplication.\nA Deployment contains multiple Deployment Versions, each representing a different \nversion of workers. (see documentation of WorkerDeploymentVersionInfo)\nDeployment records are created in Temporal server automatically when their\nfirst poller arrives to the server.\nExperimental. Worker Deployments are experimental and might significantly change in the future." diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml index 7d587b366..88f8737d0 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml @@ -1914,6 +1914,44 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /api/v1/namespaces/{namespace}/worker-deployments/{deploymentName}/set-manager: + post: + tags: + - WorkflowService + description: |- + Set/unset the ManagerIdentity of a Worker Deployment. + Experimental. This API might significantly change or be removed in a future release. 
+ operationId: SetWorkerDeploymentManager + parameters: + - name: namespace + in: path + required: true + schema: + type: string + - name: deploymentName + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/SetWorkerDeploymentManagerRequest' + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/SetWorkerDeploymentManagerResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /api/v1/namespaces/{namespace}/worker-deployments/{deploymentName}/set-ramping-version: post: tags: @@ -2087,6 +2125,38 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /api/v1/namespaces/{namespace}/workers/describe/{workerInstanceKey}: + get: + tags: + - WorkflowService + description: DescribeWorker returns information about the specified worker. + operationId: DescribeWorker + parameters: + - name: namespace + in: path + description: Namespace this worker belongs to. + required: true + schema: + type: string + - name: workerInstanceKey + in: path + description: Worker instance key to describe. + required: true + schema: + type: string + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/DescribeWorkerResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /api/v1/namespaces/{namespace}/workers/fetch-config: post: tags: @@ -2540,7 +2610,10 @@ paths: get: tags: - WorkflowService - description: "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse \n order (starting from last event). Fails with`NotFound` if the specified workflow execution is \n unknown to the service." + description: |- + GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse + order (starting from last event). Fails with`NotFound` if the specified workflow execution is + unknown to the service. operationId: GetWorkflowExecutionHistoryReverse parameters: - name: namespace @@ -5265,6 +5338,44 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /namespaces/{namespace}/worker-deployments/{deploymentName}/set-manager: + post: + tags: + - WorkflowService + description: |- + Set/unset the ManagerIdentity of a Worker Deployment. + Experimental. This API might significantly change or be removed in a future release. 
+ operationId: SetWorkerDeploymentManager + parameters: + - name: namespace + in: path + required: true + schema: + type: string + - name: deploymentName + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/SetWorkerDeploymentManagerRequest' + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/SetWorkerDeploymentManagerResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /namespaces/{namespace}/worker-deployments/{deploymentName}/set-ramping-version: post: tags: @@ -5438,6 +5549,38 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /namespaces/{namespace}/workers/describe/{workerInstanceKey}: + get: + tags: + - WorkflowService + description: DescribeWorker returns information about the specified worker. + operationId: DescribeWorker + parameters: + - name: namespace + in: path + description: Namespace this worker belongs to. + required: true + schema: + type: string + - name: workerInstanceKey + in: path + description: Worker instance key to describe. + required: true + schema: + type: string + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/DescribeWorkerResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /namespaces/{namespace}/workers/fetch-config: post: tags: @@ -5891,7 +6034,10 @@ paths: get: tags: - WorkflowService - description: "GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse \n order (starting from last event). Fails with`NotFound` if the specified workflow execution is \n unknown to the service." + description: |- + GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse + order (starting from last event). Fails with`NotFound` if the specified workflow execution is + unknown to the service. operationId: GetWorkflowExecutionHistoryReverse parameters: - name: namespace @@ -7921,6 +8067,11 @@ components: Only set if `report_task_queue_stats` is set to true in the request. (-- api-linter: core::0140::prepositions=disabled aip.dev/not-precedent: "by" is used to clarify the key. --) + DescribeWorkerResponse: + type: object + properties: + workerInfo: + $ref: '#/components/schemas/WorkerInfo' DescribeWorkflowExecutionResponse: type: object properties: @@ -8242,6 +8393,10 @@ components: type: string visibilityStore: type: string + initialFailoverVersion: + type: string + failoverVersionIncrement: + type: string description: GetClusterInfoResponse contains information about Temporal cluster. GetCurrentDeploymentResponse: type: object @@ -9055,6 +9210,12 @@ components: asyncUpdate: type: boolean description: True if the namespace supports async update + workerHeartbeats: + type: boolean + description: True if the namespace supports worker heartbeats + reportedProblemsSearchAttribute: + type: boolean + description: True if the namespace supports reported problems search attribute description: Namespace capability details. Should contain what features are enabled in a namespace. NamespaceReplicationConfig: type: object @@ -9461,7 +9622,9 @@ components: description: Only the activity with this ID will be paused. type: type: string - description: Pause all running activities of this type. 
+ description: |- + Pause all running activities of this type. + Note: Experimental - the behavior of pause by activity type might change in a future release. reason: type: string description: Reason to pause the activity. @@ -9939,7 +10102,7 @@ components: configuration, and defaults to 5. If priority is not present (or zero), then the effective priority will be - the default priority, which is is calculated by (min+max)/2. With the + the default priority, which is calculated by (min+max)/2. With the default max of 5, and min of 1, that comes out to 3. format: int32 fairnessKey: @@ -11347,6 +11510,12 @@ components: needed. If the request is unexpectedly rejected due to missing pollers, then that means the pollers have not reached to the server yet. Only set this if you expect those pollers to never arrive. + allowNoPollers: + type: boolean + description: |- + Optional. By default this request will be rejected if no pollers have been seen for the proposed + Current Version, in order to protect users from routing tasks to pollers that do not exist, leading + to possible timeouts. Pass `true` here to bypass this protection. description: Set/unset the Current Version of a Worker Deployment. SetWorkerDeploymentCurrentVersionResponse: type: object @@ -11365,6 +11534,46 @@ components: allOf: - $ref: '#/components/schemas/WorkerDeploymentVersion' description: The version that was current before executing this operation. + SetWorkerDeploymentManagerRequest: + type: object + properties: + namespace: + type: string + deploymentName: + type: string + managerIdentity: + type: string + description: |- + Arbitrary value for `manager_identity`. + Empty will unset the field. + self: + type: boolean + description: True will set `manager_identity` to `identity`. + conflictToken: + type: string + description: |- + Optional. This can be the value of conflict_token from a Describe, or another Worker + Deployment API. Passing a non-nil conflict token will cause this request to fail if the + Deployment's configuration has been modified between the API call that generated the + token and this one. + format: bytes + identity: + type: string + description: Required. The identity of the client who initiated this request. + description: Update the ManagerIdentity of a Worker Deployment. + SetWorkerDeploymentManagerResponse: + type: object + properties: + conflictToken: + type: string + description: |- + This value is returned so that it can be optionally passed to APIs + that write to the Worker Deployment state to ensure that the state + did not change between this API call and a future write. + format: bytes + previousManagerIdentity: + type: string + description: What the `manager_identity` field was before this change. SetWorkerDeploymentRampingVersionRequest: type: object properties: @@ -11415,6 +11624,12 @@ components: Note: this check only happens when the ramping version is about to change, not every time that the percentage changes. Also note that the check is against the deployment's Current Version, not the previous Ramping Version. + allowNoPollers: + type: boolean + description: |- + Optional. By default this request will be rejected if no pollers have been seen for the proposed + Current Version, in order to protect users from routing tasks to pollers that do not exist, leading + to possible timeouts. Pass `true` here to bypass this protection. description: Set/unset the Ramping Version of a Worker Deployment and its ramp percentage. 
SetWorkerDeploymentRampingVersionResponse: type: object @@ -11960,6 +12175,10 @@ components: allOf: - $ref: '#/components/schemas/Priority' description: Priority metadata + eagerWorkerDeploymentOptions: + allOf: + - $ref: '#/components/schemas/WorkerDeploymentOptions' + description: Deployment Options of the worker who will process the eager task. Passed when `request_eager_execution=true`. StartWorkflowExecutionResponse: type: object properties: @@ -12551,7 +12770,10 @@ components: type: object additionalProperties: type: string - description: "A key-value map for any customized purpose.\n If data already exists on the namespace, \n this will merge with the existing key values." + description: |- + A key-value map for any customized purpose. + If data already exists on the namespace, + this will merge with the existing key values. state: enum: - NAMESPACE_STATE_UNSPECIFIED @@ -13093,6 +13315,13 @@ components: Identity of the last client who modified the configuration of this Deployment. Set to the `identity` value sent by APIs such as `SetWorkerDeploymentCurrentVersion` and `SetWorkerDeploymentRampingVersion`. + managerIdentity: + type: string + description: |- + Identity of the client that has the exclusive right to make changes to this Worker Deployment. + Empty by default. + If this is set, clients whose identity does not match `manager_identity` will not be able to make changes + to this Worker Deployment. They can either set their own identity as the manager or unset the field to proceed. description: "A Worker Deployment (Deployment, for short) represents all workers serving \n a shared set of Task Queues. Typically, a Deployment represents one service or \n application.\n A Deployment contains multiple Deployment Versions, each representing a different \n version of workers. (see documentation of WorkerDeploymentVersionInfo)\n Deployment records are created in Temporal server automatically when their\n first poller arrives to the server.\n Experimental. Worker Deployments are experimental and might significantly change in the future." WorkerDeploymentInfo_WorkerDeploymentVersionSummary: type: object diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto index 51acfaa2e..838f5fefc 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto @@ -280,7 +280,7 @@ message Priority { // configuration, and defaults to 5. // // If priority is not present (or zero), then the effective priority will be - // the default priority, which is is calculated by (min+max)/2. With the + // the default priority, which is calculated by (min+max)/2. With the // default max of 5, and min of 1, that comes out to 3. int32 priority_key = 1; diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto index 14b4205c5..8f6685a5d 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto @@ -195,6 +195,12 @@ message WorkerDeploymentInfo { // `SetWorkerDeploymentRampingVersion`. string last_modifier_identity = 5; + // Identity of the client that has the exclusive right to make changes to this Worker Deployment. + // Empty by default. 
+ // If this is set, clients whose identity does not match `manager_identity` will not be able to make changes + // to this Worker Deployment. They can either set their own identity as the manager or unset the field to proceed. + string manager_identity = 6; + message WorkerDeploymentVersionSummary { // Deprecated. Use `deployment_version`. string version = 1 [deprecated = true]; diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/namespace/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/namespace/v1/message.proto index 405cd53c9..79c44cb05 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/namespace/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/namespace/v1/message.proto @@ -34,6 +34,10 @@ message NamespaceInfo { bool sync_update = 2; // True if the namespace supports async update bool async_update = 3; + // True if the namespace supports worker heartbeats + bool worker_heartbeats = 4; + // True if the namespace supports reported problems search attribute + bool reported_problems_search_attribute = 5; } // Whether scheduled workflows are supported on this namespace. This is only needed @@ -68,8 +72,8 @@ message UpdateNamespaceInfo { string description = 1; string owner_email = 2; // A key-value map for any customized purpose. - // If data already exists on the namespace, - // this will merge with the existing key values. + // If data already exists on the namespace, + // this will merge with the existing key values. map data = 3; // New namespace state, server will reject if transition is not allowed. // Allowed transitions are: diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto index 5059575dc..37ad083c4 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto @@ -194,6 +194,8 @@ message StartWorkflowExecutionRequest { temporal.api.workflow.v1.OnConflictOptions on_conflict_options = 26; // Priority metadata temporal.api.common.v1.Priority priority = 27; + // Deployment Options of the worker who will process the eager task. Passed when `request_eager_execution=true`. + temporal.api.deployment.v1.WorkerDeploymentOptions eager_worker_deployment_options = 28; } message StartWorkflowExecutionResponse { @@ -1157,6 +1159,8 @@ message GetClusterInfoResponse { int32 history_shard_count = 6; string persistence_store = 7; string visibility_store = 8; + int64 initial_failover_version = 9; + int64 failover_version_increment = 10; } message GetSystemInfoRequest { @@ -1938,6 +1942,7 @@ message PauseActivityRequest { // Only the activity with this ID will be paused. string id = 4; // Pause all running activities of this type. + // Note: Experimental - the behavior of pause by activity type might change in a future release. string type = 5; } @@ -2163,6 +2168,10 @@ message SetWorkerDeploymentCurrentVersionRequest { // pollers have not reached to the server yet. Only set this if you expect those pollers to // never arrive. bool ignore_missing_task_queues = 6; + // Optional. By default this request will be rejected if no pollers have been seen for the proposed + // Current Version, in order to protect users from routing tasks to pollers that do not exist, leading + // to possible timeouts. Pass `true` here to bypass this protection. 
+ bool allow_no_pollers = 9; } message SetWorkerDeploymentCurrentVersionResponse { @@ -2215,6 +2224,10 @@ message SetWorkerDeploymentRampingVersionRequest { // that the percentage changes. Also note that the check is against the deployment's Current // Version, not the previous Ramping Version. bool ignore_missing_task_queues = 7; + // Optional. By default this request will be rejected if no pollers have been seen for the proposed + // Current Version, in order to protect users from routing tasks to pollers that do not exist, leading + // to possible timeouts. Pass `true` here to bypass this protection. + bool allow_no_pollers = 10; } message SetWorkerDeploymentRampingVersionResponse { @@ -2248,8 +2261,8 @@ message ListWorkerDeploymentsResponse { google.protobuf.Timestamp create_time = 2; temporal.api.deployment.v1.RoutingConfig routing_config = 3; // Summary of the version that was added most recently in the Worker Deployment. - temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; - // Summary of the current version of the Worker Deployment. + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; + // Summary of the current version of the Worker Deployment. temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; // Summary of the ramping version of the Worker Deployment. temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; @@ -2309,6 +2322,39 @@ message UpdateWorkerDeploymentVersionMetadataResponse { temporal.api.deployment.v1.VersionMetadata metadata = 1; } +// Update the ManagerIdentity of a Worker Deployment. +message SetWorkerDeploymentManagerRequest { + string namespace = 1; + string deployment_name = 2; + + oneof new_manager_identity { + // Arbitrary value for `manager_identity`. + // Empty will unset the field. + string manager_identity = 3; + + // True will set `manager_identity` to `identity`. + bool self = 4; + } + + // Optional. This can be the value of conflict_token from a Describe, or another Worker + // Deployment API. Passing a non-nil conflict token will cause this request to fail if the + // Deployment's configuration has been modified between the API call that generated the + // token and this one. + bytes conflict_token = 5; + + // Required. The identity of the client who initiated this request. + string identity = 6; +} + +message SetWorkerDeploymentManagerResponse { + // This value is returned so that it can be optionally passed to APIs + // that write to the Worker Deployment state to ensure that the state + // did not change between this API call and a future write. + bytes conflict_token = 1; + + // What the `manager_identity` field was before this change. + string previous_manager_identity = 2; +} // Returns the Current Deployment of a deployment series. // [cleanup-wv-pre-release] Pre-release deployment APIs, clean up later @@ -2537,3 +2583,15 @@ message UpdateWorkerConfigResponse { // Once we support sending update to a multiple workers - it will be converted into a batch job, and job id will be returned. } } + +message DescribeWorkerRequest { + // Namespace this worker belongs to. + string namespace = 1; + + // Worker instance key to describe. 
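+    // Note: per the integration tests in this change, this key matches the
+    // worker_instance_key reported in the worker's heartbeat (as returned by
+    // ListWorkers), so it can be obtained from a prior ListWorkers call.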
+ string worker_instance_key = 2; +} + +message DescribeWorkerResponse { + temporal.api.worker.v1.WorkerInfo worker_info = 1; +} diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto index cc74230af..dc33b84ef 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto @@ -133,9 +133,9 @@ service WorkflowService { } }; } - - // GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse - // order (starting from last event). Fails with`NotFound` if the specified workflow execution is + + // GetWorkflowExecutionHistoryReverse returns the history of specified workflow execution in reverse + // order (starting from last event). Fails with`NotFound` if the specified workflow execution is // unknown to the service. rpc GetWorkflowExecutionHistoryReverse (GetWorkflowExecutionHistoryReverseRequest) returns (GetWorkflowExecutionHistoryReverseResponse) { option (google.api.http) = { @@ -458,7 +458,8 @@ service WorkflowService { }; } - // ScanWorkflowExecutions is a visibility API to list large amount of workflow executions in a specific namespace without order. + // ScanWorkflowExecutions _was_ a visibility API to list large amount of workflow executions in a specific namespace without order. + // It has since been deprecated in favor of `ListWorkflowExecutions` and rewritten to use `ListWorkflowExecutions` internally. // // Deprecated: Replaced with `ListWorkflowExecutions`. // (-- api-linter: core::0127::http-annotation=disabled @@ -669,8 +670,8 @@ service WorkflowService { // members are compatible with one another. // // A single build id may be mapped to multiple task queues using this API for cases where a single process hosts - // multiple workers. - // + // multiple workers. + // // To query which workers can be retired, use the `GetWorkerTaskReachability` API. // // NOTE: The number of task queues mapped to a single build id is limited by the `limit.taskQueuesPerBuildId` @@ -923,6 +924,19 @@ service WorkflowService { }; } + // Set/unset the ManagerIdentity of a Worker Deployment. + // Experimental. This API might significantly change or be removed in a future release. + rpc SetWorkerDeploymentManager (SetWorkerDeploymentManagerRequest) returns (SetWorkerDeploymentManagerResponse) { + option (google.api.http) = { + post: "/namespaces/{namespace}/worker-deployments/{deployment_name}/set-manager" + body: "*" + additional_bindings { + post: "/api/v1/namespaces/{namespace}/worker-deployments/{deployment_name}/set-manager" + body: "*" + } + }; + } + // Invokes the specified Update function on user Workflow code. rpc UpdateWorkflowExecution(UpdateWorkflowExecutionRequest) returns (UpdateWorkflowExecutionResponse) { option (google.api.http) = { @@ -1235,4 +1249,14 @@ service WorkflowService { } }; } + + // DescribeWorker returns information about the specified worker. 
+ rpc DescribeWorker (DescribeWorkerRequest) returns (DescribeWorkerResponse) { + option (google.api.http) = { + get: "/namespaces/{namespace}/workers/describe/{worker_instance_key}" + additional_bindings { + get: "/api/v1/namespaces/{namespace}/workers/describe/{worker_instance_key}" + } + }; + } } diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 6f76e40fd..bdd962ce6 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -1,9 +1,13 @@ use crate::common::{ANY_PORT, CoreWfStarter, get_integ_telem_options}; +use anyhow::anyhow; +use crossbeam_utils::atomic::AtomicCell; use prost_types::Duration as PbDuration; use prost_types::Timestamp; +use std::collections::HashSet; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use temporal_client::{NamespacedClient, WorkflowService}; +use temporal_client::{Client, NamespacedClient, RetryClient, WorkflowService}; use temporal_sdk::{ActContext, ActivityOptions, WfContext}; use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; use temporal_sdk_core::{ @@ -12,10 +16,15 @@ use temporal_sdk_core::{ use temporal_sdk_core_api::telemetry::{ OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, TelemetryOptionsBuilder, }; +use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::coresdk::AsJsonPayloadExt; -use temporal_sdk_core_protos::temporal::api::deployment::v1::WorkerDeploymentVersion; +use temporal_sdk_core_protos::temporal::api::common::v1::RetryPolicy; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::DescribeWorkerRequest; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::ListWorkersRequest; +use tokio::sync::Semaphore; +use tokio::time::sleep; use url::Url; fn within_two_minutes_ts(ts: Timestamp) -> bool { @@ -31,13 +40,50 @@ fn within_duration(dur: PbDuration, threshold: Duration) -> bool { std_dur <= threshold } +fn new_no_metrics_starter(wf_name: &str) -> CoreWfStarter { + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(TelemetryOptionsBuilder::default().build().unwrap()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + CoreWfStarter::new_with_runtime(wf_name, CoreRuntime::new_assume_tokio(runtimeopts).unwrap()) +} + +async fn list_worker_heartbeats( + client: &Arc>, + query: impl Into, +) -> Vec { + let mut raw_client = client.as_ref().clone(); + WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 200, + next_page_token: Vec::new(), + query: query.into(), + }, + ) + .await + .unwrap() + .into_inner() + .workers_info + .into_iter() + .filter_map(|info| info.worker_heartbeat) + .collect() +} + // Tests that rely on Prometheus running in a docker container need to start // with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run #[rstest::rstest] #[tokio::test] -async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) { +async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] backing: &str) { + let telemopts = if backing == "no_metrics" { + TelemetryOptionsBuilder::default().build().unwrap() + } else { + 
get_integ_telem_options() + }; let runtimeopts = RuntimeOptionsBuilder::default() - .telemetry_options(get_integ_telem_options()) + .telemetry_options(telemopts) .heartbeat_interval(Some(Duration::from_millis(100))) .build() .unwrap(); @@ -59,6 +105,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) rt.telemetry_mut() .attach_late_init_metrics(start_prometheus_metric_exporter(opts).unwrap().meter); } + "no_metrics" => {} _ => unreachable!(), } let wf_name = format!("worker_heartbeat_basic_{backing}"); @@ -82,14 +129,60 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) .await; Ok(().into()) }); + + static ACTS_STARTED: Semaphore = Semaphore::const_new(0); + static ACTS_DONE: Semaphore = Semaphore::const_new(0); worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + ACTS_STARTED.add_permits(1); + let _ = ACTS_DONE.acquire().await.unwrap(); Ok(i) }); starter .start_with_worker(wf_name.clone(), &mut worker) .await; - worker.run_until_done().await.unwrap(); + + let start_time = AtomicCell::new(None); + let heartbeat_time = AtomicCell::new(None); + + let test_fut = async { + // Give enough time to ensure heartbeat interval has been hit + tokio::time::sleep(Duration::from_millis(150)).await; + let _ = ACTS_STARTED.acquire().await.unwrap(); + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + in_activity_checks(heartbeat, &start_time, &heartbeat_time); + ACTS_DONE.add_permits(1); + }; + + let runner = async move { + worker.run_until_done().await.unwrap(); + }; + tokio::join!(test_fut, runner); let client = starter.get_client().await; let mut raw_client = (*client).clone(); @@ -119,38 +212,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom")] backing: &str) }) .unwrap(); let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - assert!(heartbeat.task_queue.starts_with(&wf_name)); - assert_eq!(heartbeat.worker_identity, "integ_tester"); - assert_eq!(heartbeat.sdk_name, "temporal-core"); - assert_eq!(heartbeat.sdk_version, "0.1.0"); - assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); - assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); - assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); - assert!(within_duration( - heartbeat.elapsed_since_last_heartbeat.unwrap(), - Duration::from_secs(1) - )); - - let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); - assert!(!workflow_poller_info.is_autoscaling); - assert!(within_two_minutes_ts( - workflow_poller_info.last_successful_poll_time.unwrap() - )); - let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); - assert!(!sticky_poller_info.is_autoscaling); - assert!(within_two_minutes_ts( - sticky_poller_info.last_successful_poll_time.unwrap() - )); - let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); - assert!(!nexus_poller_info.is_autoscaling); - 
assert!(nexus_poller_info.last_successful_poll_time.is_none()); - let activity_poller_info = heartbeat.activity_poller_info.unwrap(); - assert!(!activity_poller_info.is_autoscaling); - assert!(within_two_minutes_ts( - activity_poller_info.last_successful_poll_time.unwrap() - )); - - assert_eq!(heartbeat.total_sticky_cache_hit, 2); + after_shutdown_checks(heartbeat, &wf_name, &start_time, &heartbeat_time); } // Tests that rely on Prometheus running in a docker container need to start @@ -180,6 +242,16 @@ async fn docker_worker_heartbeat_tuner() { .with_activity_slots_options(ResourceSlotOptions::new(5, 10, Duration::from_millis(50))); starter .worker_config + .workflow_task_poller_behavior(PollerBehavior::Autoscaling { + minimum: 1, + maximum: 200, + initial: 5, + }) + .nexus_task_poller_behavior(PollerBehavior::Autoscaling { + minimum: 1, + maximum: 200, + initial: 5, + }) .clear_max_outstanding_opts() .tuner(Arc::new(tuner)); let mut worker = starter.worker().await; @@ -232,16 +304,151 @@ async fn docker_worker_heartbeat_tuner() { .unwrap(); let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); assert!(heartbeat.task_queue.starts_with(wf_name)); + + assert_eq!( + heartbeat + .workflow_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .activity_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .nexus_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .local_activity_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert!(workflow_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + workflow_poller_info.last_successful_poll_time.unwrap() + )); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert!(sticky_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + sticky_poller_info.last_successful_poll_time.unwrap() + )); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert!(nexus_poller_info.is_autoscaling); + assert!(nexus_poller_info.last_successful_poll_time.is_none()); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert!(!activity_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + activity_poller_info.last_successful_poll_time.unwrap() + )); +} + +fn in_activity_checks( + heartbeat: &WorkerHeartbeat, + start_time: &AtomicCell>, + heartbeat_time: &AtomicCell>, +) { + assert_eq!(heartbeat.status, WorkerStatus::Running as i32); + + let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert_eq!(workflow_task_slots.total_processed_tasks, 1); + assert_eq!(workflow_task_slots.current_available_slots, 5); + assert_eq!(workflow_task_slots.current_used_slots, 0); + assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed"); + let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + assert_eq!(activity_task_slots.current_available_slots, 4); + assert_eq!(activity_task_slots.current_used_slots, 1); + assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed"); + let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap(); + assert_eq!(nexus_task_slots.current_available_slots, 0); + assert_eq!(nexus_task_slots.current_used_slots, 0); + assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed"); + let local_activity_task_slots = 
heartbeat.local_activity_slots_info.clone().unwrap(); + assert_eq!(local_activity_task_slots.current_available_slots, 100); + assert_eq!(local_activity_task_slots.current_used_slots, 0); + assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed"); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert_eq!(workflow_poller_info.current_pollers, 1); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert_ne!(sticky_poller_info.current_pollers, 0); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert_eq!(nexus_poller_info.current_pollers, 0); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert_ne!(activity_poller_info.current_pollers, 0); + assert_ne!(heartbeat.current_sticky_cache_size, 0); + start_time.store(Some(heartbeat.start_time.unwrap())); + heartbeat_time.store(Some(heartbeat.heartbeat_time.unwrap())); +} + +fn after_shutdown_checks( + heartbeat: &WorkerHeartbeat, + wf_name: &str, + start_time: &AtomicCell>, + heartbeat_time: &AtomicCell>, +) { assert_eq!(heartbeat.worker_identity, "integ_tester"); + let host_info = heartbeat.host_info.clone().unwrap(); + assert!(!host_info.host_name.is_empty()); + assert!(!host_info.process_key.is_empty()); + assert!(!host_info.process_id.is_empty()); + assert_ne!(host_info.current_host_cpu_usage, 0.0); + assert_ne!(host_info.current_host_mem_usage, 0.0); + assert!(heartbeat.task_queue.starts_with(wf_name)); + assert_eq!( + heartbeat.deployment_version.clone().unwrap().build_id, + "test_build_id" + ); assert_eq!(heartbeat.sdk_name, "temporal-core"); assert_eq!(heartbeat.sdk_version, "0.1.0"); assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + assert_eq!(start_time.load().unwrap(), heartbeat.start_time.unwrap()); + assert_ne!( + heartbeat_time.load().unwrap(), + heartbeat.heartbeat_time.unwrap() + ); + // TODO: heartbeat.heartbeat_time comes after heartbeat_time assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); assert!(within_duration( heartbeat.elapsed_since_last_heartbeat.unwrap(), - Duration::from_secs(1) + Duration::from_secs(200) )); + let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert_eq!(workflow_task_slots.current_available_slots, 5); + // TODO: Could be a bug here with "+ extra" from when the metric is recorded in MeteredPermitDealer.build_owned() + assert_eq!(workflow_task_slots.current_used_slots, 1); + assert_eq!(workflow_task_slots.total_processed_tasks, 2); + assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed"); + let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + assert_eq!(activity_task_slots.current_available_slots, 5); + // TODO: Could be a bug here with "+ extra" from when the metric is recorded in MeteredPermitDealer.build_owned() + assert_eq!(workflow_task_slots.current_used_slots, 1); + assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed"); + assert_eq!(activity_task_slots.last_interval_processed_tasks, 1); + let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap(); + assert_eq!(nexus_task_slots.current_available_slots, 0); + assert_eq!(nexus_task_slots.current_used_slots, 0); + assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed"); + let local_activity_task_slots = heartbeat.local_activity_slots_info.clone().unwrap(); + assert_eq!(local_activity_task_slots.current_available_slots, 100); + 
assert_eq!(local_activity_task_slots.current_used_slots, 0); + assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed"); let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); assert!(!workflow_poller_info.is_autoscaling); @@ -263,88 +470,166 @@ async fn docker_worker_heartbeat_tuner() { )); assert_eq!(heartbeat.total_sticky_cache_hit, 2); + // TODO: total_sticky_cache_miss + assert_eq!(heartbeat.current_sticky_cache_size, 0); + // TODO: plugin } #[tokio::test] -async fn docker_worker_heartbeat_no_metrics() { - // Even if no metrics are used, we should still get in-memory metrics for worker heartbeat - let runtimeopts = RuntimeOptionsBuilder::default() - .telemetry_options(TelemetryOptionsBuilder::default().build().unwrap()) - .heartbeat_interval(Some(Duration::from_millis(100))) - .build() - .unwrap(); - let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); - let wf_name = "worker_heartbeat_no_metrics"; - let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); +async fn docker_worker_heartbeat_multiple_workers() { + let wf_name = "worker_heartbeat_multi_workers"; + let mut starter = new_no_metrics_starter(wf_name); starter .worker_config .max_outstanding_workflow_tasks(5_usize) - .max_cached_workflows(5_usize) - .max_outstanding_activities(5_usize); - let mut worker = starter.worker().await; - let worker_instance_key = worker.worker_instance_key(); + .max_cached_workflows(5_usize); - // Run a workflow - worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - ctx.activity(ActivityOptions { - activity_type: "pass_fail_act".to_string(), - input: "pass".as_json_payload().expect("serializes fine"), - start_to_close_timeout: Some(Duration::from_secs(1)), - ..Default::default() - }) - .await; + let client = starter.get_client().await; + let starting_hb_len = list_worker_heartbeats(&client, String::new()).await.len(); + + let mut worker_a = starter.worker().await; + worker_a.register_wf(wf_name.to_string(), |_ctx: WfContext| async move { Ok(().into()) }); - worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { - Ok(i) + worker_a.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + Ok(()) }); - starter.start_with_worker(wf_name, &mut worker).await; - worker.run_until_done().await.unwrap(); - let client = starter.get_client().await; - let mut raw_client = (*client).clone(); - let workers_list = WorkflowService::list_workers( + let mut starter_b = starter.clone_no_worker(); + let mut worker_b = starter_b.worker().await; + worker_b.register_wf(wf_name.to_string(), |_ctx: WfContext| async move { + Ok(().into()) + }); + worker_b.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + Ok(()) + }); + + let worker_a_key = worker_a.worker_instance_key().to_string(); + let worker_b_key = worker_b.worker_instance_key().to_string(); + let _ = starter.start_with_worker(wf_name, &mut worker_a).await; + worker_a.run_until_done().await.unwrap(); + + let _ = starter_b.start_with_worker(wf_name, &mut worker_b).await; + worker_b.run_until_done().await.unwrap(); + + sleep(Duration::from_millis(200)).await; + + let all = list_worker_heartbeats(&client, String::new()).await; + let keys: HashSet<_> = all + .iter() + .map(|hb| hb.worker_instance_key.clone()) + .collect(); + assert!(keys.contains(&worker_a_key)); + assert!(keys.contains(&worker_b_key)); + + // Verify both heartbeats contain the same shared process_key + let process_keys: HashSet<_> = all + .iter() + .filter_map(|hb| 
hb.host_info.as_ref().map(|info| info.process_key.clone())) + .collect(); + assert!(process_keys.len() > starting_hb_len); + + let filtered = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_a_key}\"")).await; + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].worker_instance_key, worker_a_key); + + // Verify describe worker gives the same heartbeat as listworker + let mut raw_client = client.as_ref().clone(); + let describe_worker_a = WorkflowService::describe_worker( &mut raw_client, - ListWorkersRequest { + DescribeWorkerRequest { namespace: client.namespace().to_owned(), - page_size: 100, - next_page_token: Vec::new(), - query: String::new(), + worker_instance_key: worker_a_key.to_string(), }, ) .await .unwrap() - .into_inner(); - // Since list_workers finds all workers in the namespace, must find specific worker used in this - // test - let worker_info = workers_list - .workers_info - .iter() - .find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }) - .unwrap(); - let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - assert!(heartbeat.task_queue.starts_with(wf_name)); - assert_eq!(heartbeat.worker_identity, "integ_tester"); - assert_eq!(heartbeat.sdk_name, "temporal-core"); - assert_eq!(heartbeat.sdk_version, "0.1.0"); - assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); - assert_eq!( - heartbeat.deployment_version, - Some(WorkerDeploymentVersion { - build_id: "test_build_id".to_owned(), - deployment_name: String::new(), - }) - ); - assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); - assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); - assert!(within_duration( - heartbeat.elapsed_since_last_heartbeat.unwrap(), - Duration::from_secs(1) - )); + .into_inner() + .worker_info + .unwrap() + .worker_heartbeat + .unwrap(); + assert_eq!(describe_worker_a, filtered[0]); + + let filtered_b = + list_worker_heartbeats(&client, format!("WorkerInstanceKey = \"{worker_b_key}\"")).await; + assert_eq!(filtered_b.len(), 1); + assert_eq!(filtered_b[0].worker_instance_key, worker_b_key); + let describe_worker_b = WorkflowService::describe_worker( + &mut raw_client, + DescribeWorkerRequest { + namespace: client.namespace().to_owned(), + worker_instance_key: worker_b_key.to_string(), + }, + ) + .await + .unwrap() + .into_inner() + .worker_info + .unwrap() + .worker_heartbeat + .unwrap(); + assert_eq!(describe_worker_b, filtered_b[0]); +} + +#[tokio::test] +async fn docker_worker_heartbeat_failure_metrics() { + let wf_name = "worker_heartbeat_failure_metrics"; + let mut starter = new_no_metrics_starter(wf_name); + starter.worker_config.max_outstanding_activities(5_usize); + + let mut worker = starter.worker().await; + static COUNT: AtomicU64 = AtomicU64::new(0); + + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + println!("[WF] starting"); + COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); + let _asdf = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), // TODO: use retry policy instead + retry_policy: Some(RetryPolicy { + maximum_attempts: 3, + ..Default::default() + }), + ..Default::default() + }) + .await; + if COUNT.load(Ordering::Relaxed) == 1 { + println!("[WF] returning error"); + panic!("expected WF panic"); + } + Ok(().into()) + }); 
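+    // This activity deliberately fails until COUNT reaches 3, so the worker
+    // heartbeat accumulates activity task failures that are asserted at the end of this test.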
+ worker.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + if COUNT.load(Ordering::Relaxed) >= 3 { + return Ok(()); + } + Err(anyhow!("Expected error").into()) + }); + + let worker_key = worker.worker_instance_key().to_string(); + starter.workflow_options.retry_policy = Some(RetryPolicy { + maximum_attempts: 2, + ..Default::default() + }); + let _ = starter.start_with_worker(wf_name, &mut worker).await; + + worker.run_until_done().await.unwrap(); + + sleep(Duration::from_millis(150)).await; + let client = starter.get_client().await; + let mut heartbeats = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; + assert_eq!(heartbeats.len(), 1); + let heartbeat = heartbeats.pop().unwrap(); + + let activity_slots = heartbeat.activity_task_slots_info.unwrap(); + assert_eq!(activity_slots.total_failed_tasks, 3); + assert!(activity_slots.last_interval_failure_tasks >= 1); + + let workflow_slots = heartbeat.workflow_task_slots_info.unwrap(); + assert_eq!(workflow_slots.total_failed_tasks, 1); } From e1ad0ae1dcf712687c3d1b4905ead225fa528c86 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Mon, 13 Oct 2025 10:54:22 -0700 Subject: [PATCH 12/23] More tests, sticky cache miss, plugins --- client/src/lib.rs | 2 +- client/src/worker_registry/mod.rs | 26 +-- core-api/src/worker.rs | 12 +- core/src/worker/client/mocks.rs | 2 +- core/src/worker/mod.rs | 24 +- tests/integ_tests/worker_heartbeat_tests.rs | 245 ++++++++++++++++++-- 6 files changed, 268 insertions(+), 43 deletions(-) diff --git a/client/src/lib.rs b/client/src/lib.rs index 8b54aa1a6..4d6088f87 100644 --- a/client/src/lib.rs +++ b/client/src/lib.rs @@ -591,7 +591,7 @@ impl ClientOptions { client: TemporalServiceClient::new(svc), options: Arc::new(self.clone()), capabilities: None, - workers: Arc::new(ClientWorkerSet::new(false)), + workers: Arc::new(ClientWorkerSet::new()), }; if !self.skip_get_system_info { match client diff --git a/client/src/worker_registry/mod.rs b/client/src/worker_registry/mod.rs index f97adab1a..b2ca6ad7b 100644 --- a/client/src/worker_registry/mod.rs +++ b/client/src/worker_registry/mod.rs @@ -46,19 +46,15 @@ struct ClientWorkerSetImpl { all_workers: HashMap>, /// Maps namespace to shared worker for worker heartbeating shared_worker: HashMap>, - /// Disables erroring when multiple workers on the same namespace+task queue are registered. - /// This is used with testing, where multiple tests run in parallel on the same client - disable_dupe_check: bool, } impl ClientWorkerSetImpl { /// Factory method. - fn new(disable_dupe_check: bool) -> Self { + fn new() -> Self { Self { slot_providers: Default::default(), all_workers: Default::default(), shared_worker: Default::default(), - disable_dupe_check, } } @@ -85,7 +81,7 @@ impl ClientWorkerSetImpl { worker.namespace().to_string(), worker.task_queue().to_string(), ); - if self.slot_providers.contains_key(&slot_key) && !self.disable_dupe_check { + if self.slot_providers.contains_key(&slot_key) { bail!( "Registration of multiple workers on the same namespace and task queue for the same client not allowed: {slot_key:?}, worker_instance_key: {:?}.", worker.worker_instance_key() @@ -186,16 +182,16 @@ pub struct ClientWorkerSet { impl Default for ClientWorkerSet { fn default() -> Self { - Self::new(false) + Self::new() } } impl ClientWorkerSet { /// Factory method. 
- pub fn new(disable_dupe_check: bool) -> Self { + pub fn new() -> Self { Self { worker_grouping_key: Uuid::new_v4(), - worker_manager: RwLock::new(ClientWorkerSetImpl::new(disable_dupe_check)), + worker_manager: RwLock::new(ClientWorkerSetImpl::new()), } } @@ -335,7 +331,7 @@ mod tests { #[test] fn registry_keeps_one_provider_per_namespace() { - let manager = ClientWorkerSet::new(false); + let manager = ClientWorkerSet::new(); let mut worker_keys = vec![]; let mut successful_registrations = 0; @@ -465,7 +461,7 @@ mod tests { #[test] fn duplicate_namespace_task_queue_registration_fails() { - let manager = ClientWorkerSet::new(false); + let manager = ClientWorkerSet::new(); let worker1 = new_mock_provider_with_heartbeat( "test_namespace".to_string(), @@ -504,7 +500,7 @@ mod tests { #[test] fn multiple_workers_same_namespace_share_heartbeat_manager() { - let manager = ClientWorkerSet::new(false); + let manager = ClientWorkerSet::new(); let worker1 = new_mock_provider_with_heartbeat( "shared_namespace".to_string(), @@ -537,7 +533,7 @@ mod tests { #[test] fn different_namespaces_get_separate_heartbeat_managers() { - let manager = ClientWorkerSet::new(false); + let manager = ClientWorkerSet::new(); let worker1 = new_mock_provider_with_heartbeat( "namespace1".to_string(), "queue1".to_string(), @@ -565,7 +561,7 @@ mod tests { #[test] fn unregister_heartbeat_workers_cleans_up_shared_worker_when_last_removed() { - let manager = ClientWorkerSet::new(false); + let manager = ClientWorkerSet::new(); // Create two workers with same namespace but different task queues let worker1 = new_mock_provider_with_heartbeat( @@ -583,6 +579,8 @@ mod tests { let worker_instance_key1 = worker1.worker_instance_key(); let worker_instance_key2 = worker2.worker_instance_key(); + assert_ne!(worker_instance_key1, worker_instance_key2); + manager.register_worker(Arc::new(worker1)).unwrap(); manager.register_worker(Arc::new(worker2)).unwrap(); diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index 8fffb3eb0..1a96a6645 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -142,19 +142,19 @@ pub struct WorkerConfig { /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_workflow_tasks: Option, - /// The maximum number of activity tasks that will ever be given to this worker concurrently + /// The maximum number of activity tasks that will ever be given to this worker concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_activities: Option, /// The maximum number of local activity tasks that will ever be given to this worker - /// concurrently + /// concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_local_activities: Option, /// The maximum number of nexus tasks that will ever be given to this worker - /// concurrently + /// concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] @@ -163,9 +163,13 @@ pub struct WorkerConfig { /// A versioning strategy for this worker. pub versioning_strategy: WorkerVersioningStrategy, - /// List of plugins used by lang + /// List of plugins used by lang. 
#[builder(default)] pub plugins: Vec, + + /// Skips the single worker+client+namespace+task_queue check + #[builder(default = "false")] + pub skip_client_worker_set_check: bool, } impl WorkerConfig { diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index addd09708..b86334e4b 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -4,7 +4,7 @@ use std::sync::{Arc, LazyLock}; use temporal_client::ClientWorkerSet; pub(crate) static DEFAULT_WORKERS_REGISTRY: LazyLock> = - LazyLock::new(|| Arc::new(ClientWorkerSet::new(true))); + LazyLock::new(|| Arc::new(ClientWorkerSet::new())); pub(crate) static DEFAULT_TEST_CAPABILITIES: &Capabilities = &Capabilities { signal_and_query_header: true, diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index a5037cb38..f82bb48db 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -267,7 +267,9 @@ impl WorkerTrait for Worker { *self.status.lock() = WorkerStatus::ShuttingDown; } // First, unregister worker from the client - if !self.client_worker_registrator.shared_namespace_worker { + if !self.client_worker_registrator.shared_namespace_worker + && !self.config.skip_client_worker_set_check + { let _res = self .client .workers() @@ -350,10 +352,16 @@ impl Worker { new_client: ConfiguredClient, ) -> Result<(), anyhow::Error> { // Unregister worker from current client, register in new client at the end - let client_worker = self - .client - .workers() - .unregister_worker(self.worker_instance_key)?; + let client_worker = if !self.config.skip_client_worker_set_check { + Some( + self.client + .workers() + .unregister_worker(self.worker_instance_key)?, + ) + } else { + None + }; + let new_worker_client = super::init_worker_client( self.config.namespace.clone(), self.config.client_identity_override.clone(), @@ -362,7 +370,9 @@ impl Worker { self.client.replace_client(new_worker_client); *self.client_worker_registrator.client.write() = self.client.clone(); - self.client.workers().register_worker(client_worker) + client_worker.map_or(Ok(()), |worker| { + self.client.workers().register_worker(worker) + }) } #[cfg(test)] @@ -608,7 +618,7 @@ impl Worker { shared_namespace_worker, }); - if !shared_namespace_worker { + if !shared_namespace_worker && !config.skip_client_worker_set_check { client .workers() .register_worker(client_worker_registrator.clone())?; diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index bdd962ce6..2de815142 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -7,7 +7,9 @@ use std::collections::HashSet; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use temporal_client::{Client, NamespacedClient, RetryClient, WorkflowService}; +use temporal_client::{ + Client, NamespacedClient, RetryClient, WfClientExt, WorkflowClientTrait, WorkflowService, +}; use temporal_sdk::{ActContext, ActivityOptions, WfContext}; use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; use temporal_sdk_core::{ @@ -17,10 +19,10 @@ use temporal_sdk_core_api::telemetry::{ OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, TelemetryOptionsBuilder, }; use temporal_sdk_core_api::worker::PollerBehavior; -use temporal_sdk_core_protos::coresdk::AsJsonPayloadExt; +use temporal_sdk_core_protos::coresdk::{AsJsonPayloadExt, FromJsonPayloadExt}; use 
temporal_sdk_core_protos::temporal::api::common::v1::RetryPolicy; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; -use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; +use temporal_sdk_core_protos::temporal::api::worker::v1::{PluginInfo, WorkerHeartbeat}; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::DescribeWorkerRequest; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::ListWorkersRequest; use tokio::sync::Semaphore; @@ -49,6 +51,10 @@ fn new_no_metrics_starter(wf_name: &str) -> CoreWfStarter { CoreWfStarter::new_with_runtime(wf_name, CoreRuntime::new_assume_tokio(runtimeopts).unwrap()) } +fn to_system_time(ts: Timestamp) -> SystemTime { + UNIX_EPOCH + Duration::new(ts.seconds as u64, ts.nanos as u32) +} + async fn list_worker_heartbeats( client: &Arc>, query: impl Into, @@ -114,11 +120,20 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b .worker_config .max_outstanding_workflow_tasks(5_usize) .max_cached_workflows(5_usize) - .max_outstanding_activities(5_usize); + .max_outstanding_activities(5_usize) + .plugins(vec![ + PluginInfo { + name: "plugin1".to_string(), + version: "1".to_string(), + }, + PluginInfo { + name: "plugin2".to_string(), + version: "2".to_string(), + }, + ]); let mut worker = starter.worker().await; let worker_instance_key = worker.worker_instance_key(); - // Run a workflow worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { ctx.activity(ActivityOptions { activity_type: "pass_fail_act".to_string(), @@ -409,6 +424,7 @@ fn after_shutdown_checks( assert!(!host_info.process_id.is_empty()); assert_ne!(host_info.current_host_cpu_usage, 0.0); assert_ne!(host_info.current_host_mem_usage, 0.0); + assert!(heartbeat.task_queue.starts_with(wf_name)); assert_eq!( heartbeat.deployment_version.clone().unwrap().build_id, @@ -417,27 +433,30 @@ fn after_shutdown_checks( assert_eq!(heartbeat.sdk_name, "temporal-core"); assert_eq!(heartbeat.sdk_version, "0.1.0"); assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + assert_eq!(start_time.load().unwrap(), heartbeat.start_time.unwrap()); assert_ne!( heartbeat_time.load().unwrap(), heartbeat.heartbeat_time.unwrap() ); - // TODO: heartbeat.heartbeat_time comes after heartbeat_time assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); + assert!( + to_system_time(heartbeat_time.load().unwrap()) + < to_system_time(heartbeat.heartbeat_time.unwrap()) + ); assert!(within_duration( heartbeat.elapsed_since_last_heartbeat.unwrap(), - Duration::from_secs(200) + Duration::from_millis(200) )); + let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); assert_eq!(workflow_task_slots.current_available_slots, 5); - // TODO: Could be a bug here with "+ extra" from when the metric is recorded in MeteredPermitDealer.build_owned() assert_eq!(workflow_task_slots.current_used_slots, 1); assert_eq!(workflow_task_slots.total_processed_tasks, 2); assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed"); let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap(); assert_eq!(activity_task_slots.current_available_slots, 5); - // TODO: Could be a bug here with "+ extra" from when the metric is recorded in MeteredPermitDealer.build_owned() assert_eq!(workflow_task_slots.current_used_slots, 1); assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed"); assert_eq!(activity_task_slots.last_interval_processed_tasks, 
1); @@ -470,13 +489,150 @@ fn after_shutdown_checks( )); assert_eq!(heartbeat.total_sticky_cache_hit, 2); - // TODO: total_sticky_cache_miss assert_eq!(heartbeat.current_sticky_cache_size, 0); - // TODO: plugin + assert_eq!( + heartbeat.plugins, + vec![ + PluginInfo { + name: "plugin1".to_string(), + version: "1".to_string() + }, + PluginInfo { + name: "plugin2".to_string(), + version: "2".to_string() + } + ] + ); } #[tokio::test] -async fn docker_worker_heartbeat_multiple_workers() { +async fn worker_heartbeat_sticky_cache_miss() { + let wf_name = "worker_heartbeat_cache_miss"; + let mut starter = new_no_metrics_starter(wf_name); + starter.worker_config.max_cached_workflows(1_usize); + + let mut worker = starter.worker().await; + worker.fetch_results = false; + let worker_key = worker.worker_instance_key().to_string(); + let worker_core = worker.core_worker.clone(); + let submitter = worker.get_submitter_handle(); + let wf_opts = starter.workflow_options.clone(); + let client = starter.get_client().await; + let client_for_orchestrator = client.clone(); + + static HISTORY_WF1_ACTIVITY_STARTED: Semaphore = Semaphore::const_new(0); + static HISTORY_WF1_ACTIVITY_FINISH: Semaphore = Semaphore::const_new(0); + static HISTORY_WF2_ACTIVITY_STARTED: Semaphore = Semaphore::const_new(0); + static HISTORY_WF2_ACTIVITY_FINISH: Semaphore = Semaphore::const_new(0); + + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + let wf_marker = ctx + .get_args() + .first() + .and_then(|p| String::from_json_payload(p).ok()) + .unwrap_or_else(|| "wf1".to_string()); + + ctx.activity(ActivityOptions { + activity_type: "sticky_cache_history_act".to_string(), + input: wf_marker.clone().as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(5)), + ..Default::default() + }) + .await; + + Ok(().into()) + }); + worker.register_activity( + "sticky_cache_history_act", + |_ctx: ActContext, marker: String| async move { + match marker.as_str() { + "wf1" => { + HISTORY_WF1_ACTIVITY_STARTED.add_permits(1); + let permit = HISTORY_WF1_ACTIVITY_FINISH.acquire().await.unwrap(); + permit.forget(); + } + "wf2" => { + HISTORY_WF2_ACTIVITY_STARTED.add_permits(1); + let permit = HISTORY_WF2_ACTIVITY_FINISH.acquire().await.unwrap(); + permit.forget(); + } + _ => {} + } + Ok(marker) + }, + ); + + let wf1_id = format!("{wf_name}_wf1"); + let wf2_id = format!("{wf_name}_wf2"); + + let orchestrator = async move { + let wf1_run = submitter + .submit_wf( + wf1_id.clone(), + wf_name.to_string(), + vec!["wf1".to_string().as_json_payload().unwrap()], + wf_opts.clone(), + ) + .await + .unwrap(); + + let permit = HISTORY_WF1_ACTIVITY_STARTED.acquire().await.unwrap(); + permit.forget(); + + client_for_orchestrator + .get_workflow_execution_history(wf1_id.clone(), Some(wf1_run.clone()), vec![]) + .await + .unwrap(); + + let wf2_run = submitter + .submit_wf( + wf2_id.clone(), + wf_name.to_string(), + vec!["wf2".to_string().as_json_payload().unwrap()], + wf_opts, + ) + .await + .unwrap(); + + let permit = HISTORY_WF2_ACTIVITY_STARTED.acquire().await.unwrap(); + permit.forget(); + + HISTORY_WF1_ACTIVITY_FINISH.add_permits(1); + let handle1 = client_for_orchestrator.get_untyped_workflow_handle(wf1_id, wf1_run); + handle1 + .get_workflow_result(Default::default()) + .await + .expect("wf1 result"); + + HISTORY_WF2_ACTIVITY_FINISH.add_permits(1); + let handle2 = client_for_orchestrator.get_untyped_workflow_handle(wf2_id, wf2_run); + handle2 + .get_workflow_result(Default::default()) + .await + 
.expect("wf2 result"); + + worker_core.initiate_shutdown(); + }; + + let mut worker_runner = worker; + let runner = async move { + worker_runner.run_until_done().await.unwrap(); + }; + + tokio::join!(orchestrator, runner); + + sleep(Duration::from_millis(200)).await; + let mut heartbeats = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; + assert_eq!(heartbeats.len(), 1); + let heartbeat = heartbeats.pop().unwrap(); + + assert!(heartbeat.total_sticky_cache_miss >= 1); + assert_eq!(heartbeat.worker_instance_key, worker_key); +} + +#[tokio::test] +async fn worker_heartbeat_multiple_workers() { let wf_name = "worker_heartbeat_multi_workers"; let mut starter = new_no_metrics_starter(wf_name); starter @@ -574,7 +730,7 @@ async fn docker_worker_heartbeat_multiple_workers() { } #[tokio::test] -async fn docker_worker_heartbeat_failure_metrics() { +async fn worker_heartbeat_failure_metrics() { let wf_name = "worker_heartbeat_failure_metrics"; let mut starter = new_no_metrics_starter(wf_name); starter.worker_config.max_outstanding_activities(5_usize); @@ -583,13 +739,12 @@ async fn docker_worker_heartbeat_failure_metrics() { static COUNT: AtomicU64 = AtomicU64::new(0); worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - println!("[WF] starting"); COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); let _asdf = ctx .activity(ActivityOptions { activity_type: "failing_act".to_string(), input: "boom".as_json_payload().expect("serialize"), - start_to_close_timeout: Some(Duration::from_secs(1)), // TODO: use retry policy instead + start_to_close_timeout: Some(Duration::from_secs(1)), retry_policy: Some(RetryPolicy { maximum_attempts: 3, ..Default::default() @@ -598,7 +753,6 @@ async fn docker_worker_heartbeat_failure_metrics() { }) .await; if COUNT.load(Ordering::Relaxed) == 1 { - println!("[WF] returning error"); panic!("expected WF panic"); } Ok(().into()) @@ -633,3 +787,62 @@ async fn docker_worker_heartbeat_failure_metrics() { let workflow_slots = heartbeat.workflow_task_slots_info.unwrap(); assert_eq!(workflow_slots.total_failed_tasks, 1); } + +#[tokio::test] +async fn worker_heartbeat_no_runtime_heartbeat() { + let wf_name = "worker_heartbeat_no_runtime_heartbeat"; + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(None) // Turn heartbeating off + .build() + .unwrap(); + let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + worker.register_wf(wf_name.to_owned(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter + .start_with_worker(wf_name.to_owned(), &mut worker) + .await; + + worker.run_until_done().await.unwrap(); + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() 
+ .into_inner(); + + // Ensure worker has not ever heartbeated + let heartbeat = workers_list.workers_info.iter().find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }); + assert!(heartbeat.is_none()); +} From 8f423874929b5660a9a807795fa2c0470855e19c Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 15:04:26 -0700 Subject: [PATCH 13/23] Formatting, fix skip_client_worker_set_check --- client/src/raw.rs | 9 +++ client/src/worker_registry/mod.rs | 26 ++++---- core/src/telemetry/otel.rs | 1 - core/src/worker/client.rs | 17 ----- core/src/worker/mod.rs | 28 ++++----- tests/integ_tests/worker_heartbeat_tests.rs | 69 ++++++++++++++++++++- 6 files changed, 104 insertions(+), 46 deletions(-) diff --git a/client/src/raw.rs b/client/src/raw.rs index f461de84d..26d08203d 100644 --- a/client/src/raw.rs +++ b/client/src/raw.rs @@ -1391,6 +1391,15 @@ proxier! { r.extensions_mut().insert(labels); } ); + ( + set_worker_deployment_manager, + SetWorkerDeploymentManagerRequest, + SetWorkerDeploymentManagerResponse, + |r| { + let labels = namespaced_request!(r); + r.extensions_mut().insert(labels); + } + ); } proxier! { diff --git a/client/src/worker_registry/mod.rs b/client/src/worker_registry/mod.rs index b2ca6ad7b..6112eb11a 100644 --- a/client/src/worker_registry/mod.rs +++ b/client/src/worker_registry/mod.rs @@ -76,12 +76,13 @@ impl ClientWorkerSetImpl { fn register( &mut self, worker: Arc, + skip_client_worker_set_check: bool, ) -> Result<(), anyhow::Error> { let slot_key = SlotKey::new( worker.namespace().to_string(), worker.task_queue().to_string(), ); - if self.slot_providers.contains_key(&slot_key) { + if self.slot_providers.contains_key(&slot_key) && !skip_client_worker_set_check { bail!( "Registration of multiple workers on the same namespace and task queue for the same client not allowed: {slot_key:?}, worker_instance_key: {:?}.", worker.worker_instance_key() @@ -210,8 +211,11 @@ impl ClientWorkerSet { pub fn register_worker( &self, worker: Arc, + skip_client_worker_set_check: bool, ) -> Result<(), anyhow::Error> { - self.worker_manager.write().register(worker) + self.worker_manager + .write() + .register(worker, skip_client_worker_set_check) } /// Unregisters a local worker, typically when that worker starts shutdown. 
@@ -341,7 +345,7 @@ mod tests { new_mock_provider(namespace, "bar_q".to_string(), false, false, false); let worker_instance_key = mock_provider.worker_instance_key(); - let result = manager.register_worker(Arc::new(mock_provider)); + let result = manager.register_worker(Arc::new(mock_provider), false); if result.is_ok() { successful_registrations += 1; worker_keys.push(worker_instance_key); @@ -478,10 +482,10 @@ mod tests { Uuid::new_v4(), ); - manager.register_worker(Arc::new(worker1)).unwrap(); + manager.register_worker(Arc::new(worker1), false).unwrap(); // second worker register should fail due to duplicate namespace+task_queue - let result = manager.register_worker(Arc::new(worker2)); + let result = manager.register_worker(Arc::new(worker2), false); assert!(result.is_err()); assert!( result @@ -517,8 +521,8 @@ mod tests { Uuid::new_v4(), ); - manager.register_worker(Arc::new(worker1)).unwrap(); - manager.register_worker(Arc::new(worker2)).unwrap(); + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); assert_eq!(2, manager.num_providers()); assert_eq!(manager.num_heartbeat_workers(), 2); @@ -547,8 +551,8 @@ mod tests { Uuid::new_v4(), ); - manager.register_worker(Arc::new(worker1)).unwrap(); - manager.register_worker(Arc::new(worker2)).unwrap(); + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); assert_eq!(2, manager.num_providers()); assert_eq!(manager.num_heartbeat_workers(), 2); @@ -581,8 +585,8 @@ mod tests { assert_ne!(worker_instance_key1, worker_instance_key2); - manager.register_worker(Arc::new(worker1)).unwrap(); - manager.register_worker(Arc::new(worker2)).unwrap(); + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); // Verify initial state: 2 slot providers, 2 heartbeat workers, 1 shared worker assert_eq!(2, manager.num_providers()); diff --git a/core/src/telemetry/otel.rs b/core/src/telemetry/otel.rs index 50ecddd39..410e63a83 100644 --- a/core/src/telemetry/otel.rs +++ b/core/src/telemetry/otel.rs @@ -156,7 +156,6 @@ pub fn build_otlp_metric_exporter( opts.histogram_bucket_overrides, )? 
.build(); - Ok::<_, anyhow::Error>(CoreOtelMeter { meter: mp.meter(TELEM_SERVICE_NAME), use_seconds_for_durations: opts.use_seconds_for_durations, diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 65250fc58..578976d0e 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -758,23 +758,6 @@ impl WorkerClient for WorkerClientBag { heartbeat.elapsed_since_last_heartbeat = elapsed_since_last_heartbeat; client_heartbeat_data.last_heartbeat_time = Some(now); - if let Some(wft_slot_info) = heartbeat.workflow_task_slots_info.as_mut() { - wft_slot_info.last_interval_processed_tasks = wft_slot_info.total_processed_tasks - - client_heartbeat_data - .workflow_task_slots_info - .total_processed_tasks; - wft_slot_info.last_interval_failure_tasks = wft_slot_info.total_failed_tasks - - client_heartbeat_data - .workflow_task_slots_info - .total_failed_tasks; - - client_heartbeat_data - .workflow_task_slots_info - .total_processed_tasks = wft_slot_info.total_processed_tasks; - client_heartbeat_data - .workflow_task_slots_info - .total_failed_tasks = wft_slot_info.total_failed_tasks; - } update_slots( &mut heartbeat.workflow_task_slots_info, &mut client_heartbeat_data.workflow_task_slots_info, diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index f82bb48db..94974067a 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -352,15 +352,10 @@ impl Worker { new_client: ConfiguredClient, ) -> Result<(), anyhow::Error> { // Unregister worker from current client, register in new client at the end - let client_worker = if !self.config.skip_client_worker_set_check { - Some( - self.client - .workers() - .unregister_worker(self.worker_instance_key)?, - ) - } else { - None - }; + let client_worker = self + .client + .workers() + .unregister_worker(self.worker_instance_key)?; let new_worker_client = super::init_worker_client( self.config.namespace.clone(), @@ -370,9 +365,9 @@ impl Worker { self.client.replace_client(new_worker_client); *self.client_worker_registrator.client.write() = self.client.clone(); - client_worker.map_or(Ok(()), |worker| { - self.client.workers().register_worker(worker) - }) + self.client + .workers() + .register_worker(client_worker, self.config.skip_client_worker_set_check) } #[cfg(test)] @@ -618,10 +613,11 @@ impl Worker { shared_namespace_worker, }); - if !shared_namespace_worker && !config.skip_client_worker_set_check { - client - .workers() - .register_worker(client_worker_registrator.clone())?; + if !shared_namespace_worker { + client.workers().register_worker( + client_worker_registrator.clone(), + config.skip_client_worker_set_check, + )?; } Ok(Self { diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 2de815142..e3dc0e03d 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -4,6 +4,7 @@ use crossbeam_utils::atomic::AtomicCell; use prost_types::Duration as PbDuration; use prost_types::Timestamp; use std::collections::HashSet; +use std::env; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -83,6 +84,9 @@ async fn list_worker_heartbeats( #[rstest::rstest] #[tokio::test] async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] backing: &str) { + if env::var("DOCKER_PROMETHEUS_RUNNING").is_err() { + return; + } let telemopts = if backing == "no_metrics" { TelemetryOptionsBuilder::default().build().unwrap() } else { @@ 
-234,6 +238,9 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b // with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run #[tokio::test] async fn docker_worker_heartbeat_tuner() { + if env::var("DOCKER_PROMETHEUS_RUNNING").is_err() { + return; + } let runtimeopts = RuntimeOptionsBuilder::default() .telemetry_options(get_integ_telem_options()) .heartbeat_interval(Some(Duration::from_millis(100))) @@ -797,7 +804,7 @@ async fn worker_heartbeat_no_runtime_heartbeat() { .build() .unwrap(); let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); - let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); let mut worker = starter.worker().await; let worker_instance_key = worker.worker_instance_key(); @@ -846,3 +853,63 @@ async fn worker_heartbeat_no_runtime_heartbeat() { }); assert!(heartbeat.is_none()); } + +#[tokio::test] +async fn worker_heartbeat_skip_client_worker_set_check() { + let wf_name = "worker_heartbeat_skip_client_worker_set_check"; + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + starter.worker_config.skip_client_worker_set_check(true); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + worker.register_wf(wf_name.to_owned(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter + .start_with_worker(wf_name.to_owned(), &mut worker) + .await; + + worker.run_until_done().await.unwrap(); + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + + // Ensure worker still heartbeats + let heartbeat = workers_list.workers_info.iter().find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }); + assert!(heartbeat.is_some()); +} From d54a3eba6914926efdde62537f091951b418b56c Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 15:27:28 -0700 Subject: [PATCH 14/23] Cursor found a bug --- .cargo/config.toml | 2 +- core-c-bridge/src/client.rs | 4 ++++ core/src/worker/mod.rs | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 4d1a8e5e1..17cfb5135 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,6 +1,6 @@ [env] # This temporarily overrides the version of the CLI used for integration tests, locally and in CI -CLI_VERSION_OVERRIDE = "v1.4.1-cloud-v1-29-0-139-2.0" +#CLI_VERSION_OVERRIDE = "v1.4.1-cloud-v1-29-0-139-2.0" [alias] # Not sure why --all-features doesn't work diff --git a/core-c-bridge/src/client.rs b/core-c-bridge/src/client.rs 
index ccdd660fd..28ba7799d 100644 --- a/core-c-bridge/src/client.rs +++ b/core-c-bridge/src/client.rs @@ -571,6 +571,7 @@ async fn call_workflow_service( "DescribeNamespace" => rpc_call!(client, call, describe_namespace), "DescribeSchedule" => rpc_call!(client, call, describe_schedule), "DescribeTaskQueue" => rpc_call!(client, call, describe_task_queue), + "DescribeWorker" => rpc_call!(client, call, describe_worker), "DescribeWorkerDeployment" => rpc_call!(client, call, describe_worker_deployment), "DescribeWorkerDeploymentVersion" => { rpc_call!(client, call, describe_worker_deployment_version) @@ -651,6 +652,9 @@ async fn call_workflow_service( "SetWorkerDeploymentCurrentVersion" => { rpc_call!(client, call, set_worker_deployment_current_version) } + "SetWorkerDeploymentManager" => { + rpc_call!(client, call, set_worker_deployment_manager) + } "SetWorkerDeploymentRampingVersion" => { rpc_call!(client, call, set_worker_deployment_ramping_version) } diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 94974067a..a8c575a0b 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -268,7 +268,6 @@ impl WorkerTrait for Worker { } // First, unregister worker from the client if !self.client_worker_registrator.shared_namespace_worker - && !self.config.skip_client_worker_set_check { let _res = self .client From 13d981c119e7a7997c24b04cb9436e1003f618eb Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 16:12:08 -0700 Subject: [PATCH 15/23] Lower sleep time, add print for debugging --- core/src/worker/mod.rs | 3 +-- tests/integ_tests/worker_heartbeat_tests.rs | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index a8c575a0b..f3dbf276a 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -267,8 +267,7 @@ impl WorkerTrait for Worker { *self.status.lock() = WorkerStatus::ShuttingDown; } // First, unregister worker from the client - if !self.client_worker_registrator.shared_namespace_worker - { + if !self.client_worker_registrator.shared_namespace_worker { let _res = self .client .workers() diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index e3dc0e03d..686d4477f 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -137,8 +137,10 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b ]); let mut worker = starter.worker().await; let worker_instance_key = worker.worker_instance_key(); + println!("worker_instance_key: {worker_instance_key:?}"); worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + println!("wf start"); ctx.activity(ActivityOptions { activity_type: "pass_fail_act".to_string(), input: "pass".as_json_payload().expect("serializes fine"), @@ -146,14 +148,17 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b ..Default::default() }) .await; + println!("wf done"); Ok(().into()) }); static ACTS_STARTED: Semaphore = Semaphore::const_new(0); static ACTS_DONE: Semaphore = Semaphore::const_new(0); worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + println!("act start"); ACTS_STARTED.add_permits(1); let _ = ACTS_DONE.acquire().await.unwrap(); + println!("act done"); Ok(i) }); @@ -166,7 +171,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b let test_fut = async { // Give enough time to ensure heartbeat 
interval has been hit - tokio::time::sleep(Duration::from_millis(150)).await; + tokio::time::sleep(Duration::from_millis(110)).await; let _ = ACTS_STARTED.acquire().await.unwrap(); let client = starter.get_client().await; let mut raw_client = (*client).clone(); @@ -194,7 +199,13 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b }) .unwrap(); let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert_eq!( + heartbeat.worker_instance_key, + worker_instance_key.to_string() + ); + println!("in_activity_checks STARTED"); in_activity_checks(heartbeat, &start_time, &heartbeat_time); + println!("in_activity_checks DONE"); ACTS_DONE.add_permits(1); }; @@ -385,6 +396,7 @@ fn in_activity_checks( start_time: &AtomicCell>, heartbeat_time: &AtomicCell>, ) { + println!("in_activity_checks heartbeat: {heartbeat:#?}"); assert_eq!(heartbeat.status, WorkerStatus::Running as i32); let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); @@ -780,7 +792,7 @@ async fn worker_heartbeat_failure_metrics() { worker.run_until_done().await.unwrap(); - sleep(Duration::from_millis(150)).await; + sleep(Duration::from_millis(110)).await; let client = starter.get_client().await; let mut heartbeats = list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; From f1a363435716745ed6425977717086f69b461f3f Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 16:21:03 -0700 Subject: [PATCH 16/23] more prints --- core/src/worker/client.rs | 2 ++ tests/integ_tests/worker_heartbeat_tests.rs | 29 ++++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 578976d0e..31902970b 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -664,6 +664,7 @@ impl WorkerClient for WorkerClientBag { w.status = WorkerStatus::Shutdown.into(); self.set_heartbeat_client_fields(w); } + println!("Sending final heartbeat: {:#?}", final_heartbeat.clone()); let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), identity: self.identity.clone(), @@ -684,6 +685,7 @@ impl WorkerClient for WorkerClientBag { namespace: String, worker_heartbeat: Vec, ) -> Result { + println!("Sending heartbeat: {:#?}", worker_heartbeat.clone()); let request = RecordWorkerHeartbeatRequest { namespace, identity: self.identity.clone(), diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 686d4477f..ece076983 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -152,14 +152,22 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b Ok(().into()) }); - static ACTS_STARTED: Semaphore = Semaphore::const_new(0); - static ACTS_DONE: Semaphore = Semaphore::const_new(0); - worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { - println!("act start"); - ACTS_STARTED.add_permits(1); - let _ = ACTS_DONE.acquire().await.unwrap(); - println!("act done"); - Ok(i) + let acts_started = Arc::new(Semaphore::const_new(0)); + let acts_done = Arc::new(Semaphore::const_new(0)); + + let acts_started_act = acts_started.clone(); + let acts_done_act = acts_done.clone(); + worker.register_activity("pass_fail_act", move |_ctx: ActContext, i: String| { + let acts_started = acts_started_act.clone(); + let acts_done = acts_done_act.clone(); + async move { + println!("act start"); + acts_started.add_permits(1); + 
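+                // Signal the test that the activity has started, then block until the
+                // in-activity heartbeat checks release it via `acts_done`.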
let permit = acts_done.acquire().await.unwrap(); + permit.forget(); + println!("act done"); + Ok(i) + } }); starter @@ -172,7 +180,8 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b let test_fut = async { // Give enough time to ensure heartbeat interval has been hit tokio::time::sleep(Duration::from_millis(110)).await; - let _ = ACTS_STARTED.acquire().await.unwrap(); + let permit = acts_started.acquire().await.unwrap(); + permit.forget(); let client = starter.get_client().await; let mut raw_client = (*client).clone(); let workers_list = WorkflowService::list_workers( @@ -206,7 +215,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b println!("in_activity_checks STARTED"); in_activity_checks(heartbeat, &start_time, &heartbeat_time); println!("in_activity_checks DONE"); - ACTS_DONE.add_permits(1); + acts_done.add_permits(1); }; let runner = async move { From 6d3a9091b91f2665e5baa70d72966e65aaf417e7 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 17:19:33 -0700 Subject: [PATCH 17/23] use semaphores for worker_heartbeat_failure_metrics --- tests/integ_tests/worker_heartbeat_tests.rs | 137 ++++++++++++++++---- 1 file changed, 113 insertions(+), 24 deletions(-) diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index ece076983..44820ec06 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -21,6 +21,7 @@ use temporal_sdk_core_api::telemetry::{ }; use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::coresdk::{AsJsonPayloadExt, FromJsonPayloadExt}; +use temporal_sdk_core_protos::prost_dur; use temporal_sdk_core_protos::temporal::api::common::v1::RetryPolicy; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; use temporal_sdk_core_protos::temporal::api::worker::v1::{PluginInfo, WorkerHeartbeat}; @@ -764,44 +765,133 @@ async fn worker_heartbeat_failure_metrics() { starter.worker_config.max_outstanding_activities(5_usize); let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); static COUNT: AtomicU64 = AtomicU64::new(0); - worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); - let _asdf = ctx - .activity(ActivityOptions { - activity_type: "failing_act".to_string(), - input: "boom".as_json_payload().expect("serialize"), - start_to_close_timeout: Some(Duration::from_secs(1)), - retry_policy: Some(RetryPolicy { - maximum_attempts: 3, + let activity_fail = Arc::new(Semaphore::const_new(0)); + let workflow_fail = Arc::new(Semaphore::const_new(0)); + + let activity_fail_clone = activity_fail.clone(); + let workflow_fail_clone = workflow_fail.clone(); + worker.register_wf(wf_name.to_string(), move |ctx: WfContext| { + let workflow_fail = workflow_fail_clone.clone(); + async move { + COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); + let _asdf = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), + retry_policy: Some(RetryPolicy { + initial_interval: Some(prost_dur!(from_millis(10))), + backoff_coefficient: 1.0, + maximum_attempts: 3, + ..Default::default() + }), ..Default::default() - }), - ..Default::default() - }) - .await; - if COUNT.load(Ordering::Relaxed) == 1 { - panic!("expected WF 
panic"); + }) + .await; + if COUNT.load(Ordering::Relaxed) == 1 { + workflow_fail.add_permits(1); + panic!("expected WF panic"); + } + Ok(().into()) } - Ok(().into()) }); - worker.register_activity("failing_act", |_ctx: ActContext, _: String| async move { - if COUNT.load(Ordering::Relaxed) >= 3 { - return Ok(()); + worker.register_activity("failing_act", move |_ctx: ActContext, _: String| { + let activity_fail = activity_fail_clone.clone(); + async move { + if COUNT.load(Ordering::Relaxed) >= 3 { + return Ok(()); + } + activity_fail.add_permits(1); + Err(anyhow!("Expected error").into()) } - Err(anyhow!("Expected error").into()) }); - let worker_key = worker.worker_instance_key().to_string(); + let worker_key = worker_instance_key.to_string(); starter.workflow_options.retry_policy = Some(RetryPolicy { maximum_attempts: 2, ..Default::default() }); let _ = starter.start_with_worker(wf_name, &mut worker).await; - worker.run_until_done().await.unwrap(); + let test_fut = async { + let permit = activity_fail.acquire().await.unwrap(); + permit.forget(); + // Gives time for heartbeat to be sent after activity failure + tokio::time::sleep(Duration::from_millis(100)).await; + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert_eq!( + heartbeat.worker_instance_key, + worker_instance_key.to_string() + ); + let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + assert!(activity_slots.last_interval_failure_tasks >= 1); + + let permit = workflow_fail.acquire().await.unwrap(); + permit.forget(); + + // Gives time for heartbeat to be sent after activity failure + tokio::time::sleep(Duration::from_millis(100)).await; + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert!(workflow_slots.last_interval_failure_tasks >= 1); + }; + + let runner = async move { + worker.run_until_done().await.unwrap(); + }; + tokio::join!(test_fut, runner); - sleep(Duration::from_millis(110)).await; let client = starter.get_client().await; let mut heartbeats = list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; @@ -810,7 +900,6 @@ async fn worker_heartbeat_failure_metrics() { let activity_slots = heartbeat.activity_task_slots_info.unwrap(); assert_eq!(activity_slots.total_failed_tasks, 3); - assert!(activity_slots.last_interval_failure_tasks >= 1); let workflow_slots = 
heartbeat.workflow_task_slots_info.unwrap(); assert_eq!(workflow_slots.total_failed_tasks, 1); From a29d2df6f35fe7fce139d3dfb177490690d38d39 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 18:01:19 -0700 Subject: [PATCH 18/23] skip_client_worker_set_check for all integ workers --- tests/common/mod.rs | 3 ++- tests/integ_tests/worker_heartbeat_tests.rs | 8 -------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 859629512..94785b9e6 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -101,7 +101,8 @@ pub(crate) fn integ_worker_config(tq: &str) -> WorkerConfigBuilder { .max_outstanding_workflow_tasks(100_usize) .versioning_strategy(WorkerVersioningStrategy::None { build_id: "test_build_id".to_owned(), - }); + }) + .skip_client_worker_set_check(true); b } diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 44820ec06..578c09e0d 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -138,10 +138,8 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b ]); let mut worker = starter.worker().await; let worker_instance_key = worker.worker_instance_key(); - println!("worker_instance_key: {worker_instance_key:?}"); worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { - println!("wf start"); ctx.activity(ActivityOptions { activity_type: "pass_fail_act".to_string(), input: "pass".as_json_payload().expect("serializes fine"), @@ -149,7 +147,6 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b ..Default::default() }) .await; - println!("wf done"); Ok(().into()) }); @@ -162,11 +159,9 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b let acts_started = acts_started_act.clone(); let acts_done = acts_done_act.clone(); async move { - println!("act start"); acts_started.add_permits(1); let permit = acts_done.acquire().await.unwrap(); permit.forget(); - println!("act done"); Ok(i) } }); @@ -213,9 +208,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b heartbeat.worker_instance_key, worker_instance_key.to_string() ); - println!("in_activity_checks STARTED"); in_activity_checks(heartbeat, &start_time, &heartbeat_time); - println!("in_activity_checks DONE"); acts_done.add_permits(1); }; @@ -406,7 +399,6 @@ fn in_activity_checks( start_time: &AtomicCell>, heartbeat_time: &AtomicCell>, ) { - println!("in_activity_checks heartbeat: {heartbeat:#?}"); assert_eq!(heartbeat.status, WorkerStatus::Running as i32); let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); From fc7f83972a82a09219188a9a83e9f90f2256eca5 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 14 Oct 2025 23:58:22 -0700 Subject: [PATCH 19/23] Can't use tokio semaphore in workflow code --- core/src/worker/client.rs | 2 - tests/integ_tests/worker_heartbeat_tests.rs | 74 +++++---------------- 2 files changed, 18 insertions(+), 58 deletions(-) diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 31902970b..578976d0e 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -664,7 +664,6 @@ impl WorkerClient for WorkerClientBag { w.status = WorkerStatus::Shutdown.into(); self.set_heartbeat_client_fields(w); } - println!("Sending final heartbeat: {:#?}", final_heartbeat.clone()); let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), 
identity: self.identity.clone(), @@ -685,7 +684,6 @@ impl WorkerClient for WorkerClientBag { namespace: String, worker_heartbeat: Vec, ) -> Result { - println!("Sending heartbeat: {:#?}", worker_heartbeat.clone()); let request = RecordWorkerHeartbeatRequest { namespace, identity: self.identity.clone(), diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 578c09e0d..f526a92e4 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -761,34 +761,28 @@ async fn worker_heartbeat_failure_metrics() { static COUNT: AtomicU64 = AtomicU64::new(0); let activity_fail = Arc::new(Semaphore::const_new(0)); - let workflow_fail = Arc::new(Semaphore::const_new(0)); let activity_fail_clone = activity_fail.clone(); - let workflow_fail_clone = workflow_fail.clone(); - worker.register_wf(wf_name.to_string(), move |ctx: WfContext| { - let workflow_fail = workflow_fail_clone.clone(); - async move { - COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); - let _asdf = ctx - .activity(ActivityOptions { - activity_type: "failing_act".to_string(), - input: "boom".as_json_payload().expect("serialize"), - start_to_close_timeout: Some(Duration::from_secs(1)), - retry_policy: Some(RetryPolicy { - initial_interval: Some(prost_dur!(from_millis(10))), - backoff_coefficient: 1.0, - maximum_attempts: 3, - ..Default::default() - }), + worker.register_wf(wf_name.to_string(), move |ctx: WfContext| async move { + COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); + let _asdf = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), + retry_policy: Some(RetryPolicy { + initial_interval: Some(prost_dur!(from_millis(10))), + backoff_coefficient: 1.0, + maximum_attempts: 3, ..Default::default() - }) - .await; - if COUNT.load(Ordering::Relaxed) == 1 { - workflow_fail.add_permits(1); - panic!("expected WF panic"); - } - Ok(().into()) + }), + ..Default::default() + }) + .await; + if COUNT.load(Ordering::Relaxed) == 1 { + panic!("expected WF panic"); } + Ok(().into()) }); worker.register_activity("failing_act", move |_ctx: ActContext, _: String| { let activity_fail = activity_fail_clone.clone(); @@ -845,38 +839,6 @@ async fn worker_heartbeat_failure_metrics() { ); let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); assert!(activity_slots.last_interval_failure_tasks >= 1); - - let permit = workflow_fail.acquire().await.unwrap(); - permit.forget(); - - // Gives time for heartbeat to be sent after activity failure - tokio::time::sleep(Duration::from_millis(100)).await; - let workers_list = WorkflowService::list_workers( - &mut raw_client, - ListWorkersRequest { - namespace: client.namespace().to_owned(), - page_size: 100, - next_page_token: Vec::new(), - query: String::new(), - }, - ) - .await - .unwrap() - .into_inner(); - let worker_info = workers_list - .workers_info - .iter() - .find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }) - .unwrap(); - let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); - assert!(workflow_slots.last_interval_failure_tasks >= 1); }; let runner = async move { From c73d7b8e02d47e325df4b2fa16fc05d583515603 Mon Sep 17 00:00:00 
2001 From: Andrew Yuan Date: Thu, 16 Oct 2025 12:05:49 -0700 Subject: [PATCH 20/23] use signal to test workflow_slots.last_interval_failure_tasks --- tests/integ_tests/worker_heartbeat_tests.rs | 92 +++++++++++++++++---- 1 file changed, 74 insertions(+), 18 deletions(-) diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index f526a92e4..89ea25989 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -1,6 +1,7 @@ use crate::common::{ANY_PORT, CoreWfStarter, get_integ_telem_options}; use anyhow::anyhow; use crossbeam_utils::atomic::AtomicCell; +use futures_util::StreamExt; use prost_types::Duration as PbDuration; use prost_types::Timestamp; use std::collections::HashSet; @@ -752,6 +753,8 @@ async fn worker_heartbeat_multiple_workers() { #[tokio::test] async fn worker_heartbeat_failure_metrics() { + const WORKFLOW_CONTINUE_SIGNAL: &str = "workflow-continue"; + let wf_name = "worker_heartbeat_failure_metrics"; let mut starter = new_no_metrics_starter(wf_name); starter.worker_config.max_outstanding_activities(5_usize); @@ -761,28 +764,38 @@ async fn worker_heartbeat_failure_metrics() { static COUNT: AtomicU64 = AtomicU64::new(0); let activity_fail = Arc::new(Semaphore::const_new(0)); + let workflow_fail = Arc::new(Semaphore::const_new(0)); let activity_fail_clone = activity_fail.clone(); - worker.register_wf(wf_name.to_string(), move |ctx: WfContext| async move { - COUNT.store(COUNT.load(Ordering::Relaxed) + 1, Ordering::Relaxed); - let _asdf = ctx - .activity(ActivityOptions { - activity_type: "failing_act".to_string(), - input: "boom".as_json_payload().expect("serialize"), - start_to_close_timeout: Some(Duration::from_secs(1)), - retry_policy: Some(RetryPolicy { - initial_interval: Some(prost_dur!(from_millis(10))), - backoff_coefficient: 1.0, - maximum_attempts: 3, + let workflow_fail_clone = workflow_fail.clone(); + worker.register_wf(wf_name.to_string(), move |ctx: WfContext| { + let workflow_fail_clone = workflow_fail_clone.clone(); + COUNT.fetch_add(1, Ordering::Relaxed); + async move { + let mut proceed_signal = ctx.make_signal_channel(WORKFLOW_CONTINUE_SIGNAL); + let _ = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), + retry_policy: Some(RetryPolicy { + initial_interval: Some(prost_dur!(from_millis(10))), + backoff_coefficient: 1.0, + maximum_attempts: 3, + ..Default::default() + }), ..Default::default() - }), - ..Default::default() - }) - .await; - if COUNT.load(Ordering::Relaxed) == 1 { - panic!("expected WF panic"); + }) + .await; + if COUNT.load(Ordering::Relaxed) == 1 { + workflow_fail_clone.add_permits(1); + panic!("expected WF panic"); + } + // Signal here to avoid workflow from completing and shutdown heartbeat from sending + // before we check workflow_slots.last_interval_failure_tasks + proceed_signal.next().await.unwrap(); + Ok(().into()) } - Ok(().into()) }); worker.register_activity("failing_act", move |_ctx: ActContext, _: String| { let activity_fail = activity_fail_clone.clone(); @@ -839,6 +852,49 @@ async fn worker_heartbeat_failure_metrics() { ); let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); assert!(activity_slots.last_interval_failure_tasks >= 1); + + let wf_fail_permit = workflow_fail.acquire().await.unwrap(); + wf_fail_permit.forget(); + // Gives time for heartbeat to be sent after workflow 
failure + tokio::time::sleep(Duration::from_millis(100)).await; + + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert!(workflow_slots.last_interval_failure_tasks >= 1); + + client + .signal_workflow_execution( + starter.get_wf_id().to_string(), + String::new(), + WORKFLOW_CONTINUE_SIGNAL.to_string(), + None, + None, + ) + .await + .unwrap(); }; let runner = async move { From f52345ea97345ae07d6f3634d143711c83dfb97b Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 16 Oct 2025 16:14:56 -0700 Subject: [PATCH 21/23] Use Notify instead of semaphores, fix test flake --- tests/integ_tests/worker_heartbeat_tests.rs | 137 ++++++++++---------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 89ea25989..13db003fe 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -28,7 +28,7 @@ use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; use temporal_sdk_core_protos::temporal::api::worker::v1::{PluginInfo, WorkerHeartbeat}; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::DescribeWorkerRequest; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::ListWorkersRequest; -use tokio::sync::Semaphore; +use tokio::sync::Notify; use tokio::time::sleep; use url::Url; @@ -151,8 +151,8 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b Ok(().into()) }); - let acts_started = Arc::new(Semaphore::const_new(0)); - let acts_done = Arc::new(Semaphore::const_new(0)); + let acts_started = Arc::new(Notify::new()); + let acts_done = Arc::new(Notify::new()); let acts_started_act = acts_started.clone(); let acts_done_act = acts_done.clone(); @@ -160,9 +160,8 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b let acts_started = acts_started_act.clone(); let acts_done = acts_done_act.clone(); async move { - acts_started.add_permits(1); - let permit = acts_done.acquire().await.unwrap(); - permit.forget(); + acts_started.notify_one(); + acts_done.notified().await; Ok(i) } }); @@ -177,8 +176,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b let test_fut = async { // Give enough time to ensure heartbeat interval has been hit tokio::time::sleep(Duration::from_millis(110)).await; - let permit = acts_started.acquire().await.unwrap(); - permit.forget(); + acts_started.notified().await; let client = starter.get_client().await; let mut raw_client = (*client).clone(); let workers_list = WorkflowService::list_workers( @@ -210,7 +208,7 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b worker_instance_key.to_string() ); in_activity_checks(heartbeat, &start_time, &heartbeat_time); - acts_done.add_permits(1); + acts_done.notify_one(); }; let runner = async move { @@ -542,10 +540,10 @@ async fn 
worker_heartbeat_sticky_cache_miss() { let client = starter.get_client().await; let client_for_orchestrator = client.clone(); - static HISTORY_WF1_ACTIVITY_STARTED: Semaphore = Semaphore::const_new(0); - static HISTORY_WF1_ACTIVITY_FINISH: Semaphore = Semaphore::const_new(0); - static HISTORY_WF2_ACTIVITY_STARTED: Semaphore = Semaphore::const_new(0); - static HISTORY_WF2_ACTIVITY_FINISH: Semaphore = Semaphore::const_new(0); + static HISTORY_WF1_ACTIVITY_STARTED: Notify = Notify::const_new(); + static HISTORY_WF1_ACTIVITY_FINISH: Notify = Notify::const_new(); + static HISTORY_WF2_ACTIVITY_STARTED: Notify = Notify::const_new(); + static HISTORY_WF2_ACTIVITY_FINISH: Notify = Notify::const_new(); worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { let wf_marker = ctx @@ -569,14 +567,12 @@ async fn worker_heartbeat_sticky_cache_miss() { |_ctx: ActContext, marker: String| async move { match marker.as_str() { "wf1" => { - HISTORY_WF1_ACTIVITY_STARTED.add_permits(1); - let permit = HISTORY_WF1_ACTIVITY_FINISH.acquire().await.unwrap(); - permit.forget(); + HISTORY_WF1_ACTIVITY_STARTED.notify_one(); + HISTORY_WF1_ACTIVITY_FINISH.notified().await; } "wf2" => { - HISTORY_WF2_ACTIVITY_STARTED.add_permits(1); - let permit = HISTORY_WF2_ACTIVITY_FINISH.acquire().await.unwrap(); - permit.forget(); + HISTORY_WF2_ACTIVITY_STARTED.notify_one(); + HISTORY_WF2_ACTIVITY_FINISH.notified().await; } _ => {} } @@ -598,8 +594,7 @@ async fn worker_heartbeat_sticky_cache_miss() { .await .unwrap(); - let permit = HISTORY_WF1_ACTIVITY_STARTED.acquire().await.unwrap(); - permit.forget(); + HISTORY_WF1_ACTIVITY_STARTED.notified().await; client_for_orchestrator .get_workflow_execution_history(wf1_id.clone(), Some(wf1_run.clone()), vec![]) @@ -616,17 +611,16 @@ async fn worker_heartbeat_sticky_cache_miss() { .await .unwrap(); - let permit = HISTORY_WF2_ACTIVITY_STARTED.acquire().await.unwrap(); - permit.forget(); + HISTORY_WF2_ACTIVITY_STARTED.notified().await; - HISTORY_WF1_ACTIVITY_FINISH.add_permits(1); + HISTORY_WF1_ACTIVITY_FINISH.notify_one(); let handle1 = client_for_orchestrator.get_untyped_workflow_handle(wf1_id, wf1_run); handle1 .get_workflow_result(Default::default()) .await .expect("wf1 result"); - HISTORY_WF2_ACTIVITY_FINISH.add_permits(1); + HISTORY_WF2_ACTIVITY_FINISH.notify_one(); let handle2 = client_for_orchestrator.get_untyped_workflow_handle(wf2_id, wf2_run); handle2 .get_workflow_result(Default::default()) @@ -761,51 +755,46 @@ async fn worker_heartbeat_failure_metrics() { let mut worker = starter.worker().await; let worker_instance_key = worker.worker_instance_key(); - static COUNT: AtomicU64 = AtomicU64::new(0); - - let activity_fail = Arc::new(Semaphore::const_new(0)); - let workflow_fail = Arc::new(Semaphore::const_new(0)); - - let activity_fail_clone = activity_fail.clone(); - let workflow_fail_clone = workflow_fail.clone(); - worker.register_wf(wf_name.to_string(), move |ctx: WfContext| { - let workflow_fail_clone = workflow_fail_clone.clone(); - COUNT.fetch_add(1, Ordering::Relaxed); - async move { - let mut proceed_signal = ctx.make_signal_channel(WORKFLOW_CONTINUE_SIGNAL); - let _ = ctx - .activity(ActivityOptions { - activity_type: "failing_act".to_string(), - input: "boom".as_json_payload().expect("serialize"), - start_to_close_timeout: Some(Duration::from_secs(1)), - retry_policy: Some(RetryPolicy { - initial_interval: Some(prost_dur!(from_millis(10))), - backoff_coefficient: 1.0, - maximum_attempts: 3, - ..Default::default() - }), + static ACT_COUNT: AtomicU64 = 
AtomicU64::new(0); + static WF_COUNT: AtomicU64 = AtomicU64::new(0); + static ACT_FAIL: Notify = Notify::const_new(); + static WF_FAIL: Notify = Notify::const_new(); + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + let _ = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), + retry_policy: Some(RetryPolicy { + initial_interval: Some(prost_dur!(from_millis(10))), + backoff_coefficient: 1.0, + maximum_attempts: 4, ..Default::default() - }) - .await; - if COUNT.load(Ordering::Relaxed) == 1 { - workflow_fail_clone.add_permits(1); - panic!("expected WF panic"); - } - // Signal here to avoid workflow from completing and shutdown heartbeat from sending - // before we check workflow_slots.last_interval_failure_tasks - proceed_signal.next().await.unwrap(); - Ok(().into()) + }), + ..Default::default() + }) + .await; + + if WF_COUNT.load(Ordering::Relaxed) == 0 { + WF_COUNT.fetch_add(1, Ordering::Relaxed); + WF_FAIL.notify_one(); + panic!("expected WF panic"); } + + // Signal here to avoid workflow from completing and shutdown heartbeat from sending + // before we check workflow_slots.last_interval_failure_tasks + let mut proceed_signal = ctx.make_signal_channel(WORKFLOW_CONTINUE_SIGNAL); + proceed_signal.next().await.unwrap(); + Ok(().into()) }); - worker.register_activity("failing_act", move |_ctx: ActContext, _: String| { - let activity_fail = activity_fail_clone.clone(); - async move { - if COUNT.load(Ordering::Relaxed) >= 3 { - return Ok(()); - } - activity_fail.add_permits(1); - Err(anyhow!("Expected error").into()) + + worker.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + if ACT_COUNT.load(Ordering::Relaxed) == 3 { + return Ok(()); } + ACT_COUNT.fetch_add(1, Ordering::Relaxed); + ACT_FAIL.notify_one(); + Err(anyhow!("Expected error").into()) }); let worker_key = worker_instance_key.to_string(); @@ -813,15 +802,18 @@ async fn worker_heartbeat_failure_metrics() { maximum_attempts: 2, ..Default::default() }); + let _ = starter.start_with_worker(wf_name, &mut worker).await; let test_fut = async { - let permit = activity_fail.acquire().await.unwrap(); - permit.forget(); - // Gives time for heartbeat to be sent after activity failure + ACT_FAIL.notified().await; + + // Give time for heartbeat to reflect activity failure tokio::time::sleep(Duration::from_millis(100)).await; + let client = starter.get_client().await; let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( &mut raw_client, ListWorkersRequest { @@ -853,9 +845,9 @@ async fn worker_heartbeat_failure_metrics() { let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); assert!(activity_slots.last_interval_failure_tasks >= 1); - let wf_fail_permit = workflow_fail.acquire().await.unwrap(); - wf_fail_permit.forget(); - // Gives time for heartbeat to be sent after workflow failure + WF_FAIL.notified().await; + + // Give time for heartbeat to reflect workflow failure tokio::time::sleep(Duration::from_millis(100)).await; let workers_list = WorkflowService::list_workers( @@ -881,6 +873,7 @@ async fn worker_heartbeat_failure_metrics() { } }) .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); assert!(workflow_slots.last_interval_failure_tasks >= 1); From 1601f0e86282e6fcf3a48fdfa5fab92723f3c6bb Mon Sep 17 
00:00:00 2001 From: Andrew Yuan Date: Thu, 16 Oct 2025 17:27:22 -0700 Subject: [PATCH 22/23] Use eventually() instead of a manual sleep --- tests/integ_tests/worker_heartbeat_tests.rs | 137 +++++++++++--------- 1 file changed, 75 insertions(+), 62 deletions(-) diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index 13db003fe..f3769d87c 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -1,4 +1,4 @@ -use crate::common::{ANY_PORT, CoreWfStarter, get_integ_telem_options}; +use crate::common::{ANY_PORT, CoreWfStarter, eventually, get_integ_telem_options}; use anyhow::anyhow; use crossbeam_utils::atomic::AtomicCell; use futures_util::StreamExt; @@ -807,77 +807,90 @@ async fn worker_heartbeat_failure_metrics() { let test_fut = async { ACT_FAIL.notified().await; - - // Give time for heartbeat to reflect activity failure - tokio::time::sleep(Duration::from_millis(100)).await; - let client = starter.get_client().await; - let mut raw_client = (*client).clone(); - - let workers_list = WorkflowService::list_workers( - &mut raw_client, - ListWorkersRequest { - namespace: client.namespace().to_owned(), - page_size: 100, - next_page_token: Vec::new(), - query: String::new(), + eventually( + || async { + let mut raw_client = (*client).clone(); + + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert_eq!( + heartbeat.worker_instance_key, + worker_instance_key.to_string() + ); + let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + if activity_slots.last_interval_failure_tasks >= 1 { + return Ok(()); + } + Err("activity_slots.last_interval_failure_tasks still 0, retrying") }, + Duration::from_millis(150), ) .await - .unwrap() - .into_inner(); - let worker_info = workers_list - .workers_info - .iter() - .find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }) - .unwrap(); - let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - assert_eq!( - heartbeat.worker_instance_key, - worker_instance_key.to_string() - ); - let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); - assert!(activity_slots.last_interval_failure_tasks >= 1); + .unwrap(); WF_FAIL.notified().await; - // Give time for heartbeat to reflect workflow failure - tokio::time::sleep(Duration::from_millis(100)).await; - - let workers_list = WorkflowService::list_workers( - &mut raw_client, - ListWorkersRequest { - namespace: client.namespace().to_owned(), - page_size: 100, - next_page_token: Vec::new(), - query: String::new(), + eventually( + || async { + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + }, + ) + .await + .unwrap() + .into_inner(); + let worker_info = 
workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + if workflow_slots.last_interval_failure_tasks >= 1 { + return Ok(()); + } + Err("workflow_slots.last_interval_failure_tasks still 0, retrying") }, + Duration::from_millis(150), ) .await - .unwrap() - .into_inner(); - let worker_info = workers_list - .workers_info - .iter() - .find(|worker_info| { - if let Some(hb) = worker_info.worker_heartbeat.as_ref() { - hb.worker_instance_key == worker_instance_key.to_string() - } else { - false - } - }) - .unwrap(); - - let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); - let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); - assert!(workflow_slots.last_interval_failure_tasks >= 1); - + .unwrap(); client .signal_workflow_execution( starter.get_wf_id().to_string(), From 8f1ef5a0f1ac830fd805c7baa371e120d40671a5 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 16 Oct 2025 20:28:09 -0700 Subject: [PATCH 23/23] max_outstanding_workflow_tasks 2 --- tests/integ_tests/worker_heartbeat_tests.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs index f3769d87c..1184cc505 100644 --- a/tests/integ_tests/worker_heartbeat_tests.rs +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -529,7 +529,10 @@ fn after_shutdown_checks( async fn worker_heartbeat_sticky_cache_miss() { let wf_name = "worker_heartbeat_cache_miss"; let mut starter = new_no_metrics_starter(wf_name); - starter.worker_config.max_cached_workflows(1_usize); + starter + .worker_config + .max_cached_workflows(1_usize) + .max_outstanding_workflow_tasks(2_usize); let mut worker = starter.worker().await; worker.fetch_results = false; @@ -596,11 +599,6 @@ async fn worker_heartbeat_sticky_cache_miss() { HISTORY_WF1_ACTIVITY_STARTED.notified().await; - client_for_orchestrator - .get_workflow_execution_history(wf1_id.clone(), Some(wf1_run.clone()), vec![]) - .await - .unwrap(); - let wf2_run = submitter .submit_wf( wf2_id.clone(),
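
Note on the `eventually()` helper introduced in PATCH 22: it is imported from `crate::common` (tests/common), whose implementation is not part of this series. The tests above only rely on it re-running an async check until it passes instead of sleeping for a fixed interval before asserting. Below is a minimal sketch of that shape; the name, the meaning of the `Duration` argument (treated here as an overall deadline), and the 25ms retry interval are all assumptions, and the real helper's signature and timing may differ.

use std::future::Future;
use std::time::{Duration, Instant};

/// Re-runs `check` until it returns `Ok`, or until `deadline` has elapsed,
/// in which case the last error is returned. Sketch only; the real
/// `tests/common` helper may use different timing.
async fn eventually<F, Fut, T, E>(mut check: F, deadline: Duration) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let give_up_at = Instant::now() + deadline;
    loop {
        match check().await {
            Ok(v) => return Ok(v),
            Err(e) if Instant::now() >= give_up_at => return Err(e),
            // Back off briefly before running the check (here, the
            // ListWorkers heartbeat lookup) again.
            Err(_) => tokio::time::sleep(Duration::from_millis(25)).await,
        }
    }
}

Polling this way makes the last_interval_failure_tasks assertions less sensitive to heartbeat timing than the earlier fixed `sleep(Duration::from_millis(100))`, which only passed when a heartbeat happened to land inside that window.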