No retry worker shutdown & fix shutdown hang #1054
@@ -53,7 +53,7 @@ use anyhow::bail;
 use crossbeam_utils::atomic::AtomicCell;
 use futures_util::{StreamExt, stream};
 use gethostname::gethostname;
-use parking_lot::{Mutex, RwLock};
+use parking_lot::RwLock;
 use slot_provider::SlotProvider;
 use std::{
     convert::TryInto,
@@ -137,7 +137,7 @@ pub struct Worker {
     /// Used to track worker client
     client_worker_registrator: Arc<ClientWorkerRegistrator>,
     /// Status of the worker
-    status: Arc<Mutex<WorkerStatus>>,
+    status: Arc<RwLock<WorkerStatus>>,
 }

 struct AllPermitsTracker {
@@ -256,7 +256,7 @@ impl WorkerTrait for Worker {
         }
         self.shutdown_token.cancel();
         {
-            *self.status.lock() = WorkerStatus::ShuttingDown;
+            *self.status.write() = WorkerStatus::ShuttingDown;
         }
         // First, unregister worker from the client
         if !self.client_worker_registrator.shared_namespace_worker {
@@ -276,12 +276,11 @@ impl WorkerTrait for Worker {
-        if !self.workflows.ever_polled() {
-            self.local_act_mgr.workflows_have_shutdown();
-        } else {
-            // Bump the workflow stream with a pointless input, since if a client initiates shutdown
-            // and then immediately blocks waiting on a workflow activation poll, it's possible that
-            // there may not be any more inputs ever, and that poll will never resolve.
-            self.workflows.send_get_state_info_msg();
-        }
+        // Bump the workflow stream with a pointless input, since if a client initiates shutdown
+        // and then immediately blocks waiting on a workflow activation poll, it's possible that
+        // there may not be any more inputs ever, and that poll will never resolve.
+        self.workflows.send_get_state_info_msg();
Comment on lines +280 to +283 (Member, Author):

This is the real fix for the shutdown hang. There was a race where "ever polled" could be set true, but nothing else ever ended up in the stream, and driving the stream with bonus messages doesn't happen until we get to …
     }

     async fn shutdown(&self) {
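A minimal sketch of the race described in the comment above, under stated assumptions: `Input`, `GetStateInfo`, and the tokio mpsc channel here are stand-ins, not the crate's real workflow-stream machinery. It shows why shutdown must "bump" the stream: a consumer blocked on an empty stream only re-checks the shutdown flag when the next item arrives, so shutdown has to push one more item or the pending await never resolves.

// Stand-in types: `Input::GetStateInfo` plays the role of the "pointless"
// bump message; the flag plays the role of the shutdown token.
use std::sync::{
    Arc,
    atomic::{AtomicBool, Ordering},
};
use tokio::sync::mpsc;

#[derive(Debug)]
enum Input {
    Work(u32),
    GetStateInfo, // input whose only job is to wake the consumer
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<Input>();
    let shutting_down = Arc::new(AtomicBool::new(false));

    let flag = shutting_down.clone();
    let consumer = tokio::spawn(async move {
        // Mirrors the workflow activation poll loop: shutdown is only
        // observed when the next input is processed.
        while let Some(input) = rx.recv().await {
            if flag.load(Ordering::Acquire) {
                break;
            }
            println!("processed {input:?}");
        }
        println!("poll resolved");
    });

    tx.send(Input::Work(1)).unwrap();

    // Initiate shutdown, then push one more input so the pending recv
    // resolves. Without this send, `consumer.await` below would never
    // complete, because `tx` stays alive and no further inputs arrive.
    shutting_down.store(true, Ordering::Release);
    tx.send(Input::GetStateInfo).unwrap();

    consumer.await.unwrap();
}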
@@ -361,7 +360,12 @@ impl Worker {
     #[cfg(test)]
     pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self {
-        Self::new(config, None, Arc::new(client), None, None).unwrap()
+        let sticky_queue_name = if config.max_cached_workflows > 0 {
+            Some(format!("sticky-{}", config.task_queue))
+        } else {
+            None
+        };
+        Self::new(config, sticky_queue_name, Arc::new(client), None, None).unwrap()
     }

     pub(crate) fn new_with_pollers(
@@ -575,7 +579,7 @@ impl Worker {
             deployment_options,
         );
         let worker_instance_key = Uuid::new_v4();
-        let worker_status = Arc::new(Mutex::new(WorkerStatus::Running));
+        let worker_status = Arc::new(RwLock::new(WorkerStatus::Running));

         let sdk_name_and_ver = client.sdk_name_and_version();
         let worker_heartbeat = worker_heartbeat_interval.map(|hb_interval| {
@@ -698,7 +702,10 @@ impl Worker {
                 tonic::Code::Unimplemented | tonic::Code::Unavailable
             ) =>
             {
-                warn!("Failed to shutdown sticky queue {:?}", err);
+                warn!(
+                    "shutdown_worker rpc errored during worker shutdown: {:?}",
+                    err
+                );
             }
             _ => {}
         }
@@ -1048,7 +1055,7 @@ struct HeartbeatMetrics {
     wf_sticky_last_suc_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
     act_last_suc_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
     nexus_last_suc_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
-    status: Arc<Mutex<WorkerStatus>>,
+    status: Arc<RwLock<WorkerStatus>>,
     sys_info: Arc<dyn SystemResourceInfo + Send + Sync>,
 }
@@ -1094,7 +1101,7 @@ impl WorkerHeartbeatManager {
                 task_queue: config.task_queue.clone(),
                 deployment_version,

-                status: (*heartbeat_manager_metrics.status.lock()) as i32,
+                status: (*heartbeat_manager_metrics.status.read()) as i32,
                 start_time,
                 plugins: config.plugins.clone(),
@@ -0,0 +1,103 @@ (new file)
use futures_util::future::{BoxFuture, FutureExt};
use std::{
    convert::Infallible,
    task::{Context, Poll},
};
use tokio::{
    net::TcpListener,
    sync::{mpsc::UnboundedSender, oneshot},
};
use tonic::{
    body::Body, codegen::Service, codegen::http::Response, server::NamedService, transport::Server,
};

#[derive(Clone)]
pub(crate) struct GenericService<F> {
    pub header_to_parse: &'static str,
    pub header_tx: UnboundedSender<String>,
    pub response_maker: F,
}
impl<F> Service<tonic::codegen::http::Request<Body>> for GenericService<F>
where
    F: FnMut(tonic::codegen::http::Request<Body>) -> BoxFuture<'static, Response<Body>>,
{
    type Response = Response<Body>;
    type Error = Infallible;
    type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;

    fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
        Poll::Ready(Ok(()))
    }

    fn call(&mut self, req: tonic::codegen::http::Request<Body>) -> Self::Future {
        self.header_tx
            .send(
                String::from_utf8_lossy(
                    req.headers()
                        .get(self.header_to_parse)
                        .map(|hv| hv.as_bytes())
                        .unwrap_or_default(),
                )
                .to_string(),
            )
            .unwrap();
        let r = (self.response_maker)(req);
        async move { Ok(r.await) }.boxed()
    }
}
impl<F> NamedService for GenericService<F> {
    const NAME: &'static str = "temporal.api.workflowservice.v1.WorkflowService";
}

pub(crate) struct FakeServer {
    pub addr: std::net::SocketAddr,
    shutdown_tx: oneshot::Sender<()>,
    pub header_rx: tokio::sync::mpsc::UnboundedReceiver<String>,
    pub server_handle: tokio::task::JoinHandle<()>,
}

pub(crate) async fn fake_server<F>(response_maker: F) -> FakeServer
where
    F: FnMut(tonic::codegen::http::Request<Body>) -> BoxFuture<'static, Response<Body>>
        + Clone
        + Send
        + Sync
        + 'static,
{
    let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
    let (header_tx, header_rx) = tokio::sync::mpsc::unbounded_channel();

    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();

    let server_handle = tokio::spawn(async move {
        Server::builder()
            .add_service(GenericService {
                header_to_parse: "grpc-timeout",
                header_tx,
                response_maker,
            })
            .serve_with_incoming_shutdown(
                tokio_stream::wrappers::TcpListenerStream::new(listener),
                async {
                    shutdown_rx.await.ok();
                },
            )
            .await
            .unwrap();
    });

    FakeServer {
        addr,
        shutdown_tx,
        header_rx,
        server_handle,
    }
}

impl FakeServer {
    pub(crate) async fn shutdown(self) {
        self.shutdown_tx.send(()).unwrap();
        self.server_handle.await.unwrap();
    }
}
This is just some bonus cleanup to avoid holding this lock while doing the callback gathering
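A small sketch of the lock-scope pattern behind that cleanup. The `WorkerStatus` enum, its discriminants, and the `build_heartbeat` function below are illustrative stand-ins, not the crate's actual heartbeat code: with `parking_lot::RwLock`, the status is copied out under a short-lived read guard that is released before any slower gathering work, and concurrent readers do not block each other.

// Illustrative only: mirrors the status field's Mutex -> RwLock change.
use parking_lot::RwLock;
use std::sync::Arc;

// Stand-in for the proto status enum; the discriminant values are illustrative.
#[derive(Clone, Copy, Debug)]
enum WorkerStatus {
    Running = 1,
    ShuttingDown = 2,
}

fn build_heartbeat(status: &Arc<RwLock<WorkerStatus>>) -> i32 {
    // Copy the value out while holding the read guard; the guard is a
    // temporary dropped at the end of this statement, so any slower
    // "callback gathering" below runs without the lock held.
    let status_code = (*status.read()) as i32;
    // ... gather the rest of the heartbeat fields here ...
    status_code
}

fn main() {
    let status = Arc::new(RwLock::new(WorkerStatus::Running));
    assert_eq!(build_heartbeat(&status), 1);

    // A writer takes the lock only briefly, mirroring the shutdown path.
    *status.write() = WorkerStatus::ShuttingDown;
    assert_eq!(build_heartbeat(&status), 2);
}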