triton-inference-server · GuanLuo · Nov 13, 2023 · Nov 9, 2023
diff --git a/qa/L0_generative_sequence/generative_sequence_e2e.py b/qa/L0_generative_sequence/generative_sequence_e2e.py
@@ -127,6 +127,25 @@
                     self.assertEqual(res_count, data_item.as_numpy("OUTPUT")[0][0])
             self.assertEqual(0, res_count)
 
+    def test_reschedule_error(self):
+        # Use an idle timeout (0.2s) shorter than the backend reschedule delay
+        # (0.5s) so that the scheduler terminates the sequence before the
+        # backend gets the chance to reschedule the request.
+        config = r'"sequence_batching" : { "generative_sequence" : true, "max_sequence_idle_microseconds" : 200000 }'
+        with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
+            triton_client.load_model(
+                "generative_sequence", config=MODEL_CONFIG_BASE.format(config)
+            )
+        with self.assertRaises(InferenceServerException) as context:
+            # The stream is expected to fail: the request is rescheduled only
+            # after its sequence has already been terminated for being idle.
+            self.test_grpc_stream()
+        # assertIn reports the actual error text on failure, no print needed.
+        self.assertIn(
+            "must specify the START flag on the first request of the sequence",
+            str(context.exception),
+        )
+
     def test_unsupported_sequence_scheduler(self):
         # Override model config with scheduler settings that do not support
         # request rescheduling.
@@ -145,7 +164,6 @@
                 # batcher expects sequence parameters to be provided explicitly
                 self.test_grpc_stream(sequence_id=sid, sequence_start=True)
             sid += 1
-            print(str(context.exception))
             self.assertTrue(
                 "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE"
                 in str(context.exception)
@@ -164,7 +182,6 @@
                 )
             with self.assertRaises(InferenceServerException) as context:
                 self.test_grpc_stream()
-            print(str(context.exception))
             self.assertTrue(
                 "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE"
                 in str(context.exception)

diff --git a/qa/L0_generative_sequence/test.sh b/qa/L0_generative_sequence/test.sh
@@ -44,7 +44,7 @@ RET=0
 
 CLIENT_LOG="./generative_sequence_client.log"
 TEST_PY=./generative_sequence_e2e.py
-EXPECTED_NUM_TESTS="4"
+EXPECTED_NUM_TESTS="5"
 TEST_RESULT_FILE='test_results.txt'
 
 

diff --git a/src/test/generative_sequence/src/generative_sequence.cc b/src/test/generative_sequence/src/generative_sequence.cc
@@ -533,6 +533,8 @@ TRITONBACKEND_ModelInstanceExecute(
     SET_TIMESTAMP(exec_end_ns);
     max_exec_end_ns = std::max(max_exec_end_ns, exec_end_ns);
 
+    // Wait for 0.5 seconds before rescheduling the request.
+    std::this_thread::sleep_for(std::chrono::milliseconds(500));
     // Release the request first as the testing backend may be configured to
     // receive error on request release, in such a case, the error will be
     // propagated back through error response.