Fix query related is_replay flag error (#597)

Sushisource · mjameswh · commit 979366a9dae7 · 2023-08-21T20:15:17.000-04:00
(cherry picked from commit e132dfe)
diff --git a/core/src/core_tests/replay_flag.rs b/core/src/core_tests/replay_flag.rs
@@ -1,8 +1,21 @@
-use crate::{test_help::canned_histories, worker::ManagedWFFunc};
+use crate::{
+    test_help::{
+        build_mock_pollers, canned_histories, hist_to_poll_resp, mock_worker, MockPollCfg,
+    },
+    worker::{client::mocks::mock_workflow_client, ManagedWFFunc, LEGACY_QUERY_ID},
+};
 use rstest::{fixture, rstest};
-use std::time::Duration;
+use std::{collections::VecDeque, time::Duration};
 use temporal_sdk::{WfContext, WorkflowFunction};
-use temporal_sdk_core_protos::temporal::api::enums::v1::CommandType;
+use temporal_sdk_core_api::Worker;
+use temporal_sdk_core_protos::{
+    coresdk::{
+        workflow_commands::{workflow_command::Variant::RespondToQuery, QueryResult, QuerySuccess},
+        workflow_completion::WorkflowActivationCompletion,
+    },
+    temporal::api::{enums::v1::CommandType, query::v1::WorkflowQuery},
+};
+use temporal_sdk_core_test_utils::start_timer_cmd;
 
 fn timers_wf(num_timers: u32) -> WorkflowFunction {
     WorkflowFunction::new(move |command_sink: WfContext| async move {
@@ -63,3 +76,60 @@ async fn replay_flag_is_correct_partial_history() {
     assert_eq!(commands[0].command_type, CommandType::StartTimer as i32);
     wfm.shutdown().await.unwrap();
 }
+
+/// A query-only activation must be flagged as replaying; the activation that follows must not.
+#[tokio::test]
+async fn replay_flag_correct_with_query() {
+    let wfid = "fake_wf_id";
+    let t = canned_histories::single_timer("1");
+    let tasks = VecDeque::from(vec![
+        {
+            let mut pr = hist_to_poll_resp(&t, wfid.to_owned(), 2.into());
+            // Server can issue queries that contain the WFT completion and the subsequent
+            // commands, but not the consequences yet.
+            pr.query = Some(WorkflowQuery {
+                query_type: "query-type".to_string(),
+                query_args: Some(b"hi".into()),
+                header: None,
+            });
+            let h = pr.history.as_mut().unwrap();
+            h.events.truncate(5); // keep only the first five events of the response
+            pr.started_event_id = 3;
+            pr
+        },
+        hist_to_poll_resp(&t, wfid.to_owned(), 2.into()),
+    ]);
+    let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client());
+    mock.num_expected_legacy_query_resps = 1;
+    let mut mock = build_mock_pollers(mock);
+    mock.worker_cfg(|wc| wc.max_cached_workflows = 10);
+    let core = mock_worker(mock);
+
+    let task = core.poll_workflow_activation().await.unwrap();
+    core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
+        task.run_id,
+        start_timer_cmd(1, Duration::from_secs(1)),
+    ))
+    .await
+    .unwrap();
+
+    let task = core.poll_workflow_activation().await.unwrap();
+    assert!(task.is_replaying, "query-only activation must report is_replaying");
+    core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
+        task.run_id,
+        RespondToQuery(QueryResult {
+            query_id: LEGACY_QUERY_ID.to_string(),
+            variant: Some(
+                QuerySuccess {
+                    response: Some("hi".into()),
+                }
+                .into(),
+            ),
+        }),
+    ))
+    .await
+    .unwrap();
+
+    let task = core.poll_workflow_activation().await.unwrap();
+    assert!(!task.is_replaying, "activation after the query must not be replaying");
+}
diff --git a/core/src/worker/workflow/machines/workflow_machines.rs b/core/src/worker/workflow/machines/workflow_machines.rs
@@ -367,9 +367,17 @@ impl WorkflowMachines {
     /// "no work" situation. Possibly, it may know about some work the machines don't, like queries.
     pub(crate) fn get_wf_activation(&mut self) -> WorkflowActivation {
         let jobs = self.drive_me.drain_jobs();
+        // Even though technically we may have satisfied all the criteria to be done with replay,
+        // query only activations are always "replaying" to keep things sane.
+        let all_query = jobs.iter().all(|j| {
+            matches!(
+                j.variant,
+                Some(workflow_activation_job::Variant::QueryWorkflow(_))
+            )
+        });
         WorkflowActivation {
             timestamp: self.current_wf_time.map(Into::into),
-            is_replaying: self.replaying,
+            is_replaying: self.replaying || all_query,
             run_id: self.run_id.clone(),
             history_length: self.last_processed_event as u32,
             jobs,
@@ -488,7 +496,6 @@ impl WorkflowMachines {
             }
         }
 
-        let mut saw_completed = false;
         let mut do_handle_event = true;
         let mut history = events.into_iter().peekable();
         while let Some(event) = history.next() {
@@ -504,17 +511,21 @@ impl WorkflowMachines {
             // This definition of replaying here is that we are no longer replaying as soon as we
             // see new events that have never been seen or produced by the SDK.
             //
-            // Specifically, replay ends once we have seen the last command-event which was produced
-            // as a result of the last completed WFT. Thus, replay would be false for things like
-            // signals which were received and after the last completion, and thus generated the
-            // current WFT being handled.
-            if self.replaying && has_final_event && saw_completed && !event.is_command_event() {
+            // Specifically, replay ends once we have seen any non-command event (i.e., an event
+            // that isn't a result of something we produced in the SDK) on a WFT which has the
+            // final event in history (meaning we are processing the most recent WFT and there are
+            // no more subsequent WFTs). WFT Completed in this case does not count as a non-command
+            // event, because it will typically show up as the first event in an incremental
+            // history, and we want to ignore it and its associated commands since we "produced"
+            // them.
+            if self.replaying
+                && has_final_event
+                && event.event_type() != EventType::WorkflowTaskCompleted
+                && !event.is_command_event()
+            {
                 // Replay is finished
                 self.replaying = false;
             }
-            if event.event_type() == EventType::WorkflowTaskCompleted {
-                saw_completed = true;
-            }
 
             if do_handle_event {
                 let eho = self.handle_event(