Add testing for GPU tensor error handling #5871

Merged · 5 commits · Jun 29, 2023
78 changes: 75 additions & 3 deletions qa/L0_backend_python/python_test.py
@@ -40,6 +40,8 @@
from tritonclient.utils import *
import tritonclient.http as httpclient

TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))


class PythonTest(tu.TestResultCollector):

@@ -59,6 +61,14 @@ def _infer_help(self, model_name, shape, data_type):
output0 = result.as_numpy('OUTPUT0')
self.assertTrue(np.all(input_data_0 == output0))

def _create_cuda_region(self, client, size, name):
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
shm0_handle = cuda_shared_memory.create_shared_memory_region(
name, byte_size=size, device_id=0)
client.register_cuda_shared_memory(
name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
return shm0_handle

def _optional_input_infer(self, model_name, has_input0, has_input1):
with httpclient.InferenceServerClient("localhost:8000") as client:
shape = (1,)
@@ -144,6 +154,69 @@ def test_growth_error(self):
with self._shm_leak_detector.Probe() as shm_probe:
self._infer_help(model_name, shape, dtype)

    # GPU tensors are not supported on Jetson
    # CUDA shared memory is not supported on Jetson
if not TEST_JETSON:

def test_gpu_tensor_error(self):
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
model_name = 'identity_bool'
with httpclient.InferenceServerClient("localhost:8000") as client:
input_data = np.array([[True] * 1000], dtype=bool)
inputs = [
httpclient.InferInput("INPUT0", input_data.shape,
np_to_triton_dtype(input_data.dtype))
]
inputs[0].set_data_from_numpy(input_data)

requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]

                # Intentionally create a shared memory region that is too small.
client.unregister_cuda_shared_memory()
shm0_handle = self._create_cuda_region(client, 1,
'output0_data')

requested_outputs[0].set_shared_memory('output0_data', 1)
with self.assertRaises(InferenceServerException) as ex:
client.infer(model_name, inputs, outputs=requested_outputs)
self.assertIn(
"should be at least 1000 bytes to hold the results",
str(ex.exception))
client.unregister_cuda_shared_memory()
cuda_shared_memory.destroy_shared_memory_region(shm0_handle)

def test_dlpack_tensor_error(self):
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
model_name = 'dlpack_identity'
with httpclient.InferenceServerClient("localhost:8000") as client:
input_data = np.array([[1] * 1000], dtype=np.float32)
inputs = [
httpclient.InferInput("INPUT0", input_data.shape,
np_to_triton_dtype(input_data.dtype))
]

requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
input_data_size = input_data.itemsize * input_data.size
client.unregister_cuda_shared_memory()
input_region = self._create_cuda_region(client, input_data_size,
'input0_data')
inputs[0].set_shared_memory('input0_data', input_data_size)
cuda_shared_memory.set_shared_memory_region(
input_region, [input_data])

# Intentionally create a small region to trigger an error
shm0_handle = self._create_cuda_region(client, 1,
'output0_data')
requested_outputs[0].set_shared_memory('output0_data', 1)

with self.assertRaises(InferenceServerException) as ex:
client.infer(model_name, inputs, outputs=requested_outputs)
self.assertIn(
"should be at least 4000 bytes to hold the results",
str(ex.exception))
client.unregister_cuda_shared_memory()
cuda_shared_memory.destroy_shared_memory_region(shm0_handle)

def test_async_infer(self):
model_name = "identity_uint8"
request_parallelism = 4
@@ -182,9 +255,8 @@ def test_async_infer(self):

# Make sure the requests ran in parallel.
stats = client.get_inference_statistics(model_name)
test_cond = (len(stats['model_stats'])
!= 1) or (stats['model_stats'][0]['name']
!= model_name)
test_cond = (len(stats['model_stats']) != 1) or (
stats['model_stats'][0]['name'] != model_name)
self.assertFalse(
test_cond,
"error: expected statistics for {}".format(model_name))
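The new tests above only exercise the failure path. For contrast, below is a minimal client-side sketch (not part of this PR) of the success path for the same identity_bool model: the CUDA shared memory output region is sized to hold the full result, so client.infer returns normally and the output can be read back with cuda_shared_memory.get_contents_as_numpy. Names and the localhost:8000 endpoint follow the tests above; treat this as an illustration rather than repository code.

    # Illustrative success-path counterpart to test_gpu_tensor_error (not in this PR).
    import numpy as np
    import tritonclient.http as httpclient
    import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
    from tritonclient.utils import np_to_triton_dtype, triton_to_np_dtype

    with httpclient.InferenceServerClient("localhost:8000") as client:
        input_data = np.array([[True] * 1000], dtype=bool)
        # BOOL is one byte per element, so the output needs 1000 bytes.
        byte_size = input_data.size * input_data.itemsize

        inputs = [
            httpclient.InferInput("INPUT0", input_data.shape,
                                  np_to_triton_dtype(input_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data)

        # Register an output region large enough to hold the full result.
        client.unregister_cuda_shared_memory()
        shm_handle = cuda_shared_memory.create_shared_memory_region(
            'output0_data', byte_size=byte_size, device_id=0)
        client.register_cuda_shared_memory(
            'output0_data', cuda_shared_memory.get_raw_handle(shm_handle), 0,
            byte_size)

        outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
        outputs[0].set_shared_memory('output0_data', byte_size)

        result = client.infer('identity_bool', inputs, outputs=outputs)
        output = result.get_output('OUTPUT0')
        output_data = cuda_shared_memory.get_contents_as_numpy(
            shm_handle, triton_to_np_dtype(output['datatype']),
            output['shape'])
        assert np.all(output_data == input_data)

        client.unregister_cuda_shared_memory()
        cuda_shared_memory.destroy_shared_memory_region(shm_handle)
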
9 changes: 8 additions & 1 deletion qa/L0_backend_python/test.sh
@@ -53,7 +53,7 @@ SERVER_ARGS="$BASE_SERVER_ARGS --backend-config=python,shm-default-byte-size=524
PYTHON_BACKEND_BRANCH=$PYTHON_BACKEND_REPO_TAG
CLIENT_PY=./python_test.py
CLIENT_LOG="./client.log"
EXPECTED_NUM_TESTS="9"
EXPECTED_NUM_TESTS="11"
TEST_RESULT_FILE='test_results.txt'
SERVER_LOG="./inference_server.log"
source ../common/util.sh
@@ -128,9 +128,16 @@ mkdir -p models/string_fixed/1/
cp ../python_models/string_fixed/model.py ./models/string_fixed/1/
cp ../python_models/string_fixed/config.pbtxt ./models/string_fixed

mkdir -p models/dlpack_identity/1/
cp ../python_models/dlpack_identity/model.py ./models/dlpack_identity/1/
cp ../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity

# Skip torch install on Jetson since it is already installed.
if [ "$TEST_JETSON" == "0" ]; then
pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
else
    # GPU tensor tests are disabled on Jetson
EXPECTED_NUM_TESTS=9
fi

prev_num_pages=`get_shm_pages`
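
The dlpack_identity model staged above is copied from qa/python_models and its source is not part of this diff. As an illustration only, a Python backend identity model of this kind could look roughly like the sketch below (the in-tree model.py may differ); it returns the input through DLPack so a GPU tensor is handed back without a host copy.

    # Hypothetical sketch of a DLPack-based identity model for the Python backend.
    import triton_python_backend_utils as pb_utils


    class TritonPythonModel:

        def execute(self, requests):
            responses = []
            for request in requests:
                input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
                # Re-expose the input buffer as OUTPUT0 through DLPack; a GPU
                # input therefore stays on the GPU.
                output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                      input0.to_dlpack())
                responses.append(
                    pb_utils.InferenceResponse(output_tensors=[output0]))
            return responses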