From 6e6567e5cc7b700edd9ac3b4f99284effe0d4095 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 29 May 2023 18:39:42 -0400 Subject: [PATCH 1/5] Add testing for GPU tensor error handling --- qa/L0_backend_python/python_test.py | 66 +++++++++++++++++++++++++++++ qa/L0_backend_python/test.sh | 4 ++ 2 files changed, 70 insertions(+) diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py index 49413bce55..2520b8cb4b 100644 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -38,6 +38,7 @@ import os from tritonclient.utils import * +import tritonclient.utils.cuda_shared_memory as cuda_shared_memory import tritonclient.http as httpclient @@ -59,6 +60,13 @@ def _infer_help(self, model_name, shape, data_type): output0 = result.as_numpy('OUTPUT0') self.assertTrue(np.all(input_data_0 == output0)) + def _create_cuda_region(self, client, size, name): + shm0_handle = cuda_shared_memory.create_shared_memory_region( + name, byte_size=size, device_id=0) + client.register_cuda_shared_memory( + name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size) + return shm0_handle + def _optional_input_infer(self, model_name, has_input0, has_input1): with httpclient.InferenceServerClient("localhost:8000") as client: shape = (1,) @@ -144,6 +152,64 @@ def test_growth_error(self): with self._shm_leak_detector.Probe() as shm_probe: self._infer_help(model_name, shape, dtype) + # CUDA Shared memory is not supported on jetson + def test_gpu_tensor_error(self): + model_name = 'identity_bool' + with httpclient.InferenceServerClient("localhost:8000") as client: + input_data = np.array([[True] * 1000], dtype=bool) + inputs = [ + httpclient.InferInput("INPUT0", input_data.shape, + np_to_triton_dtype(input_data.dtype)) + ] + inputs[0].set_data_from_numpy(input_data) + + requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + + # intentionally create a shared memory region with not enough size. + client.unregister_cuda_shared_memory() + shm0_handle = self._create_cuda_region(client, 1, + 'output0_data') + + requested_outputs[0].set_shared_memory('output0_data', 1) + with self.assertRaises(InferenceServerException) as ex: + client.infer(model_name, inputs, outputs=requested_outputs) + self.assertIn( + "should be at least 1000 bytes to hold the results", + str(ex.exception)) + client.unregister_cuda_shared_memory() + cuda_shared_memory.destroy_shared_memory_region(shm0_handle) + + def test_dlpack_tensor_error(self): + model_name = 'dlpack_identity' + with httpclient.InferenceServerClient("localhost:8000") as client: + input_data = np.array([[1] * 1000], dtype=np.float32) + inputs = [ + httpclient.InferInput("INPUT0", input_data.shape, + np_to_triton_dtype(input_data.dtype)) + ] + + requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + input_data_size = input_data.itemsize * input_data.size + client.unregister_cuda_shared_memory() + input_region = self._create_cuda_region(client, input_data_size, + 'input0_data') + inputs[0].set_shared_memory('input0_data', input_data_size) + cuda_shared_memory.set_shared_memory_region( + input_region, [input_data]) + + # Intentionally create a small region to trigger an error + shm0_handle = self._create_cuda_region(client, 1, + 'output0_data') + requested_outputs[0].set_shared_memory('output0_data', 1) + + with self.assertRaises(InferenceServerException) as ex: + client.infer(model_name, inputs, outputs=requested_outputs) + self.assertIn( + "should be at least 4000 bytes to hold the results", + str(ex.exception)) + client.unregister_cuda_shared_memory() + cuda_shared_memory.destroy_shared_memory_region(shm0_handle) + def test_async_infer(self): model_name = "identity_uint8" request_parallelism = 4 diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 587d1b8e13..659ddf18d2 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -128,6 +128,10 @@ mkdir -p models/string_fixed/1/ cp ../python_models/string_fixed/model.py ./models/string_fixed/1/ cp ../python_models/string_fixed/config.pbtxt ./models/string_fixed +mkdir -p models/dlpack_identity/1/ +cp ../python_models/dlpack_identity/model.py ./models/dlpack_identity/1/ +cp ../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity + # Skip torch install on Jetson since it is already installed. if [ "$TEST_JETSON" == "0" ]; then pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html From fd43c18d335667d64e5723dc1259763ce05cc3e0 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 6 Jun 2023 15:36:14 -0400 Subject: [PATCH 2/5] Fix up --- qa/L0_backend_python/python_test.py | 9 ++++++--- qa/L0_backend_python/test.sh | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py index 2520b8cb4b..b28b0c48b3 100644 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -41,6 +41,8 @@ import tritonclient.utils.cuda_shared_memory as cuda_shared_memory import tritonclient.http as httpclient +TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0))) + class PythonTest(tu.TestResultCollector): @@ -152,6 +154,8 @@ def test_growth_error(self): with self._shm_leak_detector.Probe() as shm_probe: self._infer_help(model_name, shape, dtype) + # GPU tensors are not supported on jetson + if not TEST_JETSON: # CUDA Shared memory is not supported on jetson def test_gpu_tensor_error(self): model_name = 'identity_bool' @@ -248,9 +252,8 @@ def test_async_infer(self): # Make sure the requests ran in parallel. stats = client.get_inference_statistics(model_name) - test_cond = (len(stats['model_stats']) - != 1) or (stats['model_stats'][0]['name'] - != model_name) + test_cond = (len(stats['model_stats']) != 1) or ( + stats['model_stats'][0]['name'] != model_name) self.assertFalse( test_cond, "error: expected statistics for {}".format(model_name)) diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index 659ddf18d2..f53f830040 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -53,7 +53,7 @@ SERVER_ARGS="$BASE_SERVER_ARGS --backend-config=python,shm-default-byte-size=524 PYTHON_BACKEND_BRANCH=$PYTHON_BACKEND_REPO_TAG CLIENT_PY=./python_test.py CLIENT_LOG="./client.log" -EXPECTED_NUM_TESTS="9" +EXPECTED_NUM_TESTS="11" TEST_RESULT_FILE='test_results.txt' SERVER_LOG="./inference_server.log" source ../common/util.sh @@ -135,6 +135,9 @@ cp ../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity # Skip torch install on Jetson since it is already installed. if [ "$TEST_JETSON" == "0" ]; then pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +else + # GPU tensor tests are disabled on jetson + EXPECTED_NUM_TESTS=9 fi prev_num_pages=`get_shm_pages` @@ -163,6 +166,8 @@ set -e kill $SERVER_PID wait $SERVER_PID +exit 0 + current_num_pages=`get_shm_pages` if [ $current_num_pages -ne $prev_num_pages ]; then ls /dev/shm From a629ff1c123461d794b976feefa06b573dbea908 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 7 Jun 2023 17:56:38 -0400 Subject: [PATCH 3/5] Remove exit 0 --- qa/L0_backend_python/test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh index f53f830040..539d3b14d8 100755 --- a/qa/L0_backend_python/test.sh +++ b/qa/L0_backend_python/test.sh @@ -166,8 +166,6 @@ set -e kill $SERVER_PID wait $SERVER_PID -exit 0 - current_num_pages=`get_shm_pages` if [ $current_num_pages -ne $prev_num_pages ]; then ls /dev/shm From eccc3cfac6576a4c5e5f1ae5066ed3a8e4805a89 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 12 Jun 2023 11:25:31 -0400 Subject: [PATCH 4/5] Fix jetson --- qa/L0_backend_python/python_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py index b28b0c48b3..71f3e68876 100644 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -38,7 +38,6 @@ import os from tritonclient.utils import * -import tritonclient.utils.cuda_shared_memory as cuda_shared_memory import tritonclient.http as httpclient TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0))) @@ -155,9 +154,11 @@ def test_growth_error(self): self._infer_help(model_name, shape, dtype) # GPU tensors are not supported on jetson + # CUDA Shared memory is not supported on jetson if not TEST_JETSON: - # CUDA Shared memory is not supported on jetson + def test_gpu_tensor_error(self): + import tritonclient.utils.cuda_shared_memory as cuda_shared_memory model_name = 'identity_bool' with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[True] * 1000], dtype=bool) @@ -184,6 +185,7 @@ def test_gpu_tensor_error(self): cuda_shared_memory.destroy_shared_memory_region(shm0_handle) def test_dlpack_tensor_error(self): + import tritonclient.utils.cuda_shared_memory as cuda_shared_memory model_name = 'dlpack_identity' with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[1] * 1000], dtype=np.float32) From a6d1d4a6eea7e8bc981e7042ec511a0206f992ec Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 28 Jun 2023 12:43:31 -0400 Subject: [PATCH 5/5] Fix up --- qa/L0_backend_python/python_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py index 71f3e68876..ba4dc25ecb 100644 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -62,6 +62,7 @@ def _infer_help(self, model_name, shape, data_type): self.assertTrue(np.all(input_data_0 == output0)) def _create_cuda_region(self, client, size, name): + import tritonclient.utils.cuda_shared_memory as cuda_shared_memory shm0_handle = cuda_shared_memory.create_shared_memory_region( name, byte_size=size, device_id=0) client.register_cuda_shared_memory(