From 6e6567e5cc7b700edd9ac3b4f99284effe0d4095 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <itabrizian@nvidia.com>
Date: Mon, 29 May 2023 18:39:42 -0400
Subject: [PATCH 1/5] Add testing for GPU tensor error handling

---
 qa/L0_backend_python/python_test.py | 66 +++++++++++++++++++++++++++++
 qa/L0_backend_python/test.sh        |  4 ++
 2 files changed, 70 insertions(+)

diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py
index 49413bce55..2520b8cb4b 100644
--- a/qa/L0_backend_python/python_test.py
+++ b/qa/L0_backend_python/python_test.py
@@ -38,6 +38,7 @@
 import os
 
 from tritonclient.utils import *
+import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
 import tritonclient.http as httpclient
 
 
@@ -59,6 +60,13 @@ def _infer_help(self, model_name, shape, data_type):
             output0 = result.as_numpy('OUTPUT0')
             self.assertTrue(np.all(input_data_0 == output0))
 
+    def _create_cuda_region(self, client, size, name):
+        shm0_handle = cuda_shared_memory.create_shared_memory_region(
+            name, byte_size=size, device_id=0)
+        client.register_cuda_shared_memory(
+            name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
+        return shm0_handle
+
     def _optional_input_infer(self, model_name, has_input0, has_input1):
         with httpclient.InferenceServerClient("localhost:8000") as client:
             shape = (1,)
@@ -144,6 +152,64 @@ def test_growth_error(self):
         with self._shm_leak_detector.Probe() as shm_probe:
             self._infer_help(model_name, shape, dtype)
 
+        # CUDA Shared memory is not supported on jetson
+        def test_gpu_tensor_error(self):
+            model_name = 'identity_bool'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[True] * 1000], dtype=bool)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+                inputs[0].set_data_from_numpy(input_data)
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+
+                # intentionally create a shared memory region with not enough size.
+                client.unregister_cuda_shared_memory()
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 1000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
+        def test_dlpack_tensor_error(self):
+            model_name = 'dlpack_identity'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[1] * 1000], dtype=np.float32)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+                input_data_size = input_data.itemsize * input_data.size
+                client.unregister_cuda_shared_memory()
+                input_region = self._create_cuda_region(client, input_data_size,
+                                                        'input0_data')
+                inputs[0].set_shared_memory('input0_data', input_data_size)
+                cuda_shared_memory.set_shared_memory_region(
+                    input_region, [input_data])
+
+                # Intentionally create a small region to trigger an error
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 4000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
     def test_async_infer(self):
         model_name = "identity_uint8"
         request_parallelism = 4
diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh
index 587d1b8e13..659ddf18d2 100755
--- a/qa/L0_backend_python/test.sh
+++ b/qa/L0_backend_python/test.sh
@@ -128,6 +128,10 @@ mkdir -p models/string_fixed/1/
 cp ../python_models/string_fixed/model.py ./models/string_fixed/1/
 cp ../python_models/string_fixed/config.pbtxt ./models/string_fixed
 
+mkdir -p models/dlpack_identity/1/
+cp ../python_models/dlpack_identity/model.py ./models/dlpack_identity/1/
+cp ../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity
+
 # Skip torch install on Jetson since it is already installed.
 if [ "$TEST_JETSON" == "0" ]; then
   pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

From fd43c18d335667d64e5723dc1259763ce05cc3e0 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <itabrizian@nvidia.com>
Date: Tue, 6 Jun 2023 15:36:14 -0400
Subject: [PATCH 2/5] Skip GPU tensor tests on Jetson; update test count

---
 qa/L0_backend_python/python_test.py | 9 ++++++---
 qa/L0_backend_python/test.sh        | 7 ++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py
index 2520b8cb4b..b28b0c48b3 100644
--- a/qa/L0_backend_python/python_test.py
+++ b/qa/L0_backend_python/python_test.py
@@ -41,6 +41,8 @@
 import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
 import tritonclient.http as httpclient
 
+TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
+
 
 class PythonTest(tu.TestResultCollector):
 
@@ -152,6 +154,8 @@ def test_growth_error(self):
         with self._shm_leak_detector.Probe() as shm_probe:
             self._infer_help(model_name, shape, dtype)
 
+    # GPU tensors are not supported on jetson
+    if not TEST_JETSON:
         # CUDA Shared memory is not supported on jetson
         def test_gpu_tensor_error(self):
             model_name = 'identity_bool'
@@ -248,9 +252,8 @@ def test_async_infer(self):
 
                 # Make sure the requests ran in parallel.
                 stats = client.get_inference_statistics(model_name)
-                test_cond = (len(stats['model_stats'])
-                             != 1) or (stats['model_stats'][0]['name']
-                                       != model_name)
+                test_cond = (len(stats['model_stats']) != 1) or (
+                    stats['model_stats'][0]['name'] != model_name)
                 self.assertFalse(
                     test_cond,
                     "error: expected statistics for {}".format(model_name))
diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh
index 659ddf18d2..f53f830040 100755
--- a/qa/L0_backend_python/test.sh
+++ b/qa/L0_backend_python/test.sh
@@ -53,7 +53,7 @@ SERVER_ARGS="$BASE_SERVER_ARGS --backend-config=python,shm-default-byte-size=524
 PYTHON_BACKEND_BRANCH=$PYTHON_BACKEND_REPO_TAG
 CLIENT_PY=./python_test.py
 CLIENT_LOG="./client.log"
-EXPECTED_NUM_TESTS="9"
+EXPECTED_NUM_TESTS="11"
 TEST_RESULT_FILE='test_results.txt'
 SERVER_LOG="./inference_server.log"
 source ../common/util.sh
@@ -135,6 +135,9 @@ cp ../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity
 # Skip torch install on Jetson since it is already installed.
 if [ "$TEST_JETSON" == "0" ]; then
   pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+else
+  # GPU tensor tests are disabled on jetson
+  EXPECTED_NUM_TESTS=9
 fi
 
 prev_num_pages=`get_shm_pages`
@@ -163,6 +166,8 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
+exit 0
+
 current_num_pages=`get_shm_pages`
 if [ $current_num_pages -ne $prev_num_pages ]; then
     ls /dev/shm

From a629ff1c123461d794b976feefa06b573dbea908 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <itabrizian@nvidia.com>
Date: Wed, 7 Jun 2023 17:56:38 -0400
Subject: [PATCH 3/5] Remove exit 0

---
 qa/L0_backend_python/test.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh
index f53f830040..539d3b14d8 100755
--- a/qa/L0_backend_python/test.sh
+++ b/qa/L0_backend_python/test.sh
@@ -166,8 +166,6 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-exit 0
-
 current_num_pages=`get_shm_pages`
 if [ $current_num_pages -ne $prev_num_pages ]; then
     ls /dev/shm

From eccc3cfac6576a4c5e5f1ae5066ed3a8e4805a89 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <itabrizian@nvidia.com>
Date: Mon, 12 Jun 2023 11:25:31 -0400
Subject: [PATCH 4/5] Move cuda_shared_memory import into GPU tests for Jetson

---
 qa/L0_backend_python/python_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py
index b28b0c48b3..71f3e68876 100644
--- a/qa/L0_backend_python/python_test.py
+++ b/qa/L0_backend_python/python_test.py
@@ -38,7 +38,6 @@
 import os
 
 from tritonclient.utils import *
-import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
 import tritonclient.http as httpclient
 
 TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
@@ -155,9 +154,11 @@ def test_growth_error(self):
             self._infer_help(model_name, shape, dtype)
 
     # GPU tensors are not supported on jetson
+    # CUDA Shared memory is not supported on jetson
     if not TEST_JETSON:
-        # CUDA Shared memory is not supported on jetson
+
         def test_gpu_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
             model_name = 'identity_bool'
             with httpclient.InferenceServerClient("localhost:8000") as client:
                 input_data = np.array([[True] * 1000], dtype=bool)
@@ -184,6 +185,7 @@ def test_gpu_tensor_error(self):
                 cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
 
         def test_dlpack_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
             model_name = 'dlpack_identity'
             with httpclient.InferenceServerClient("localhost:8000") as client:
                 input_data = np.array([[1] * 1000], dtype=np.float32)

From a6d1d4a6eea7e8bc981e7042ec511a0206f992ec Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <itabrizian@nvidia.com>
Date: Wed, 28 Jun 2023 12:43:31 -0400
Subject: [PATCH 5/5] Import cuda_shared_memory inside _create_cuda_region

---
 qa/L0_backend_python/python_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py
index 71f3e68876..ba4dc25ecb 100644
--- a/qa/L0_backend_python/python_test.py
+++ b/qa/L0_backend_python/python_test.py
@@ -62,6 +62,7 @@ def _infer_help(self, model_name, shape, data_type):
             self.assertTrue(np.all(input_data_0 == output0))
 
     def _create_cuda_region(self, client, size, name):
+        import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
         shm0_handle = cuda_shared_memory.create_shared_memory_region(
             name, byte_size=size, device_id=0)
         client.register_cuda_shared_memory(