From 1d33f342f6a43ec9171249684be9f40535c206cd Mon Sep 17 00:00:00 2001
From: Katherine Yang
Date: Wed, 8 Nov 2023 18:49:41 -0800
Subject: [PATCH] finalize testing and make utils take float instead of only int

---
 qa/L0_client_timeout/client_timeout_test.py | 99 ++++++++++++++-------
 qa/L0_client_timeout/test.sh                | 44 +++++----
 src/grpc/grpc_utils.cc                      |  5 +-
 3 files changed, 97 insertions(+), 51 deletions(-)

diff --git a/qa/L0_client_timeout/client_timeout_test.py b/qa/L0_client_timeout/client_timeout_test.py
index c0ea76bc8b..1f7ac7ae0f 100755
--- a/qa/L0_client_timeout/client_timeout_test.py
+++ b/qa/L0_client_timeout/client_timeout_test.py
@@ -60,22 +60,33 @@ def setUp(self):
         self.model_name_ = "custom_identity_int32"
         self.input0_data_ = np.array([[10]], dtype=np.int32)
         self.input0_data_byte_size_ = 32
-        self.SMALL_INTERVAL = sys.float_info.min  # guarantees a timeout
-        self.NORMAL_INTERVAL = 5  # seconds for server to load then receive request
+        self.SMALL_INTERVAL = 0.1  # seconds for a timeout
+        self.INFER_SMALL_INTERVAL = 2.0  # seconds for a timeout
+        self.NORMAL_INTERVAL = 5.0  # seconds for server to load then receive request
 
     def test_grpc_server_live(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.is_server_live(client_timeout=self.SMALL_INTERVAL)
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        self.assertTrue(
+            triton_client.is_server_live(client_timeout=self.NORMAL_INTERVAL)
+        )
+
+    def test_grpc_is_server_ready(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
         with self.assertRaises(InferenceServerException) as cm:
             _ = triton_client.is_server_ready(client_timeout=self.SMALL_INTERVAL)
         self.assertIn("Deadline Exceeded", str(cm.exception))
-        # server should already be ready
         self.assertTrue(
             triton_client.is_server_ready(client_timeout=self.NORMAL_INTERVAL)
         )
 
-    def test_grpc_model_ready(self):
+    def test_grpc_is_model_ready(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
@@ -90,7 +101,7 @@
             )
         )
 
-    def test_grpc_server_metadata(self):
+    def test_grpc_get_server_metadata(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
@@ -100,7 +111,20 @@
 
         triton_client.get_server_metadata(client_timeout=self.NORMAL_INTERVAL)
 
-    def test_grpc_model_config(self):
+    def test_grpc_get_model_metadata(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_model_metadata(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_model_metadata(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_model_config(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
@@ -128,6 +152,7 @@ def test_grpc_load_model(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
+        triton_client.unload_model(model_name=self.model_name_)
         with self.assertRaises(InferenceServerException) as cm:
             _ = triton_client.load_model(
                 model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
@@ -155,7 +180,7 @@ def test_grpc_unload_model(self):
         )
         triton_client.load_model(model_name=self.model_name_)
 
-    def test_grpc_inference_statistics(self):
+    def test_grpc_get_inference_statistics(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
@@ -221,51 +246,70 @@ def test_grpc_get_log_settings(self):
             as_json=True, client_timeout=self.NORMAL_INTERVAL
         )
 
+    def test_grpc_get_system_shared_memory_status(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_system_shared_memory_status(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_system_shared_memory_status(
+            client_timeout=self.NORMAL_INTERVAL
+        )
+
     def test_grpc_register_system_shared_memory(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
+        triton_client.unregister_system_shared_memory()
+        import tritonclient.utils.shared_memory as shm
+
+        shm_ip0_handle = shm.create_shared_memory_region(
+            "input0_data", "/input_simple", self.input0_data_byte_size_
+        )
+        shm.set_shared_memory_region(shm_ip0_handle, [self.input0_data_])
         with self.assertRaises(InferenceServerException) as cm:
             _ = triton_client.register_system_shared_memory(
-                "input_data",
+                "input0_data",
                 "/input_simple",
                 self.input0_data_byte_size_,
                 client_timeout=self.SMALL_INTERVAL,
             )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_system_shared_memory()
         triton_client.register_system_shared_memory(
-            "input_data",
+            "input0_data",
             "/input_simple",
             self.input0_data_byte_size_,
             client_timeout=self.NORMAL_INTERVAL,
         )
-        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_system_shared_memory()
 
-    def test_grpc_get_system_shared_memory(self):
+    def test_grpc_unregister_system_shared_memory(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
         with self.assertRaises(InferenceServerException) as cm:
-            _ = triton_client.get_system_shared_memory_status(
+            _ = triton_client.unregister_system_shared_memory(
                 client_timeout=self.SMALL_INTERVAL
             )
         self.assertIn("Deadline Exceeded", str(cm.exception))
-        triton_client.get_system_shared_memory_status(
+        triton_client.unregister_system_shared_memory(
            client_timeout=self.NORMAL_INTERVAL
         )
 
-    def test_grpc_unregister_system_shared_memory(self):
+    def test_grpc_get_cuda_shared_memory_status(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
         with self.assertRaises(InferenceServerException) as cm:
-            _ = triton_client.unregister_system_shared_memory(
+            _ = triton_client.get_cuda_shared_memory_status(
                 client_timeout=self.SMALL_INTERVAL
             )
         self.assertIn("Deadline Exceeded", str(cm.exception))
-        triton_client.unregister_system_shared_memory(
-            client_timeout=self.NORMAL_INTERVAL
-        )
+        triton_client.get_cuda_shared_memory_status(client_timeout=self.NORMAL_INTERVAL)
 
     def test_grpc_register_cuda_shared_memory(self):
         triton_client = grpcclient.InferenceServerClient(
@@ -298,18 +342,7 @@ def test_grpc_register_cuda_shared_memory(self):
         )
         cshm.destroy_shared_memory_region(shm_op0_handle)
 
-    def test_grpc_get_cuda_shared_memory_status(self):
-        triton_client = grpcclient.InferenceServerClient(
-            url="localhost:8001", verbose=True
-        )
-        with self.assertRaises(InferenceServerException) as cm:
-            _ = triton_client.get_cuda_shared_memory_status(
-                client_timeout=self.SMALL_INTERVAL
-            )
-        self.assertIn("Deadline Exceeded", str(cm.exception))
-        triton_client.get_cuda_shared_memory_status(client_timeout=self.NORMAL_INTERVAL)
-
-    def test_grpc_uregister_cuda_shared_memory(self):
+    def test_grpc_unregister_cuda_shared_memory(self):
         triton_client = grpcclient.InferenceServerClient(
             url="localhost:8001", verbose=True
         )
@@ -379,7 +412,7 @@ def test_grpc_async_infer(self):
             inputs=self.inputs_,
             callback=partial(callback, user_data),
             outputs=self.outputs_,
-            client_timeout=2,
+            client_timeout=self.INFER_SMALL_INTERVAL,
         )
         data_item = user_data._completed_requests.get()
         if type(data_item) == InferenceServerException:
@@ -451,7 +484,9 @@ def test_http_infer(self):
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0
+                url="localhost:8000",
+                verbose=True,
+                network_timeout=self.INFER_SMALL_INTERVAL,
             )
             _ = triton_client.infer(
                 model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
diff --git a/qa/L0_client_timeout/test.sh b/qa/L0_client_timeout/test.sh
index fe0562bb10..518c384235 100755
--- a/qa/L0_client_timeout/test.sh
+++ b/qa/L0_client_timeout/test.sh
@@ -40,6 +40,7 @@ fi
 export CUDA_VISIBLE_DEVICES=0
 
 TIMEOUT_VALUE=100000000
+SHORT_TIMEOUT_VALUE=1000
 RET=0
 
 CLIENT_TIMEOUT_TEST=client_timeout_test.py
@@ -53,11 +54,13 @@ CLIENT_LOG=`pwd`/client.log
 CLIENT_GRPC_TIMEOUTS_LOG=`pwd`/client.log.grpc
 DATADIR=`pwd`/models
 SERVER=/opt/tritonserver/bin/tritonserver
-SERVER_ARGS="--model-repository=$DATADIR --model-control-mode=explicit"
+SERVER_ARGS="--model-repository=$DATADIR --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2"
 source ../common/util.sh
 
 mkdir -p $DATADIR/custom_identity_int32/1
 
+# Test all APIs apart from Infer.
+export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=1
 run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
@@ -66,10 +69,9 @@ if [ "$SERVER_PID" == "0" ]; then
 fi
 
 set +e
-
-# Test all APIs apart from Infer
+# Expect timeout for everything
 sed -i 's#value: { string_value: "0" }#value: { string_value: "1" }#' $DATADIR/custom_identity_int32/config.pbtxt
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc -p >> ${CLIENT_LOG}.c++.grpc_non_infer_apis 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG}.c++.grpc_non_infer_apis 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -78,7 +80,6 @@ if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_non_infer_apis` != "1"
     echo -e "\n***\n*** Test Failed\n***"
     RET=1
 fi
-
 # Test all APIs with long timeout
 $CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG} 2>&1
 if [ $? -eq 1 ]; then
@@ -91,6 +92,7 @@ kill $SERVER_PID
 wait $SERVER_PID
 
 # Test infer APIs
+export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=
 SERVER_ARGS="--model-repository=$DATADIR"
 sed -i 's#value: { string_value: "1" }#value: { string_value: "0" }#' $DATADIR/custom_identity_int32/config.pbtxt
 run_server
@@ -105,7 +107,7 @@ set +e
 # Note, the custom_identity_int32 is configured with a delay
 # of 3 sec.
 # Test request timeout in grpc synchronous inference
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc >> ${CLIENT_LOG}.c++.grpc_infer 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc >> ${CLIENT_LOG}.c++.grpc_infer 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -116,7 +118,7 @@ if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_infer` != "1" ]; then
 fi
 
 # Test request timeout in grpc asynchronous inference
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -127,7 +129,7 @@ if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_async_infer` != "1" ];
 fi
 
 # Test stream timeout in grpc asynchronous streaming inference
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -138,7 +140,7 @@ if [ `grep -c "Stream has been closed" ${CLIENT_LOG}.c++.grpc_async_stream_infer
 fi
 
 # Test request timeout in http synchronous inference
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v >> ${CLIENT_LOG}.c++.http_infer 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v >> ${CLIENT_LOG}.c++.http_infer 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -150,7 +152,7 @@ fi
 
 # Test request timeout in http asynchronous inference
-$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1
+$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1
 if [ $? -eq 0 ]; then
     RET=1
 fi
@@ -238,7 +240,8 @@ kill $SERVER_PID
 wait $SERVER_PID
 
 # Test all APIs other than infer
-SERVER_ARGS="${SERVER_ARGS} --model-control-mode=explicit"
+export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=1
+SERVER_ARGS="${SERVER_ARGS} --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2"
 sed -i 's#value: { string_value: "0" }#value: { string_value: "1" }#' $DATADIR/custom_identity_int32/config.pbtxt
 run_server
 if [ "$SERVER_PID" == "0" ]; then
@@ -247,20 +250,27 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi
 set +e
-export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=1
-for i in test_grpc_load_model \
+
+for i in test_grpc_server_live \
+    test_grpc_is_server_ready \
+    test_grpc_is_model_ready \
+    test_grpc_get_server_metadata \
+    test_grpc_get_model_metadata \
+    test_grpc_get_model_config \
+    test_grpc_model_repository_index \
+    test_grpc_load_model \
     test_grpc_unload_model \
-    test_grpc_inference_statistics \
+    test_grpc_get_inference_statistics \
     test_grpc_update_trace_settings \
     test_grpc_get_trace_settings \
     test_grpc_update_log_settings \
     test_grpc_get_log_settings \
+    test_grpc_get_system_shared_memory_status \
     test_grpc_register_system_shared_memory \
-    test_grpc_get_system_shared_memory \
     test_grpc_unregister_system_shared_memory \
-    test_grpc_register_cuda_shared_memory \
     test_grpc_get_cuda_shared_memory_status \
-    test_grpc_uregister_cuda_shared_memory \
+    test_grpc_register_cuda_shared_memory \
+    test_grpc_unregister_cuda_shared_memory \
 ; do
     python $CLIENT_TIMEOUT_TEST ClientTimeoutTest.$i >>$CLIENT_LOG 2>&1
     if [ $? -ne 0 ]; then
diff --git a/src/grpc/grpc_utils.cc b/src/grpc/grpc_utils.cc
index cdd8b5cf8e..62fd93272b 100644
--- a/src/grpc/grpc_utils.cc
+++ b/src/grpc/grpc_utils.cc
@@ -27,6 +27,7 @@
 #include "grpc_utils.h"
 
 #include <chrono>
+#include <string>
 #include <thread>
 
 namespace triton { namespace server { namespace grpc {
@@ -83,8 +84,8 @@ GrpcStatusUtil::Create(::grpc::Status* status, TRITONSERVER_Error* err)
     // Will delay the write of the response by the specified time.
     // This can be used to test the flow where there are other
     // responses available to be written.
-    LOG_INFO << "Delaying the write of the response by " << delay_response
-             << " seconds";
+    LOG_VERBOSE(1) << "Delaying the write of the response by " << delay_response
+                   << " seconds";
     std::this_thread::sleep_for(std::chrono::seconds(delay_response));
   }
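
A minimal sketch of the timeout pattern the new Python tests exercise, for
trying a float-valued client_timeout outside the QA harness. It assumes a
Triton server listening on localhost:8001 that was started with
TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=1 exported (as test.sh above now
does), so a sub-second deadline reliably expires while a generous one succeeds;
the 0.1 and 5.0 second values mirror SMALL_INTERVAL and NORMAL_INTERVAL:

    import tritonclient.grpc as grpcclient
    from tritonclient.utils import InferenceServerException

    client = grpcclient.InferenceServerClient(url="localhost:8001")

    # A float client_timeout expresses a sub-second gRPC deadline, which the
    # server's injected 1 s response delay is guaranteed to exceed.
    try:
        client.is_server_ready(client_timeout=0.1)
    except InferenceServerException as e:
        assert "Deadline Exceeded" in str(e)

    # A deadline comfortably longer than the injected delay should succeed.
    assert client.is_server_ready(client_timeout=5.0)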