Merge branch 'main' into kprashanth-tritonfrontend-rfeatures

KrishnanPrash authored Dec 13, 2024
2 parents c415fdb + bcff3da commit dae9159
Showing 21 changed files with 546 additions and 112 deletions.
42 changes: 28 additions & 14 deletions build.py
@@ -565,7 +565,7 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
elif be == "tensorflow":
args = tensorflow_cmake_args(images, library_paths)
elif be == "python":
args = []
args = python_cmake_args()
elif be == "dali":
args = dali_cmake_args()
elif be == "pytorch":
@@ -631,6 +631,18 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
return cargs


def python_cmake_args():
cargs = []
if target_platform() == "rhel":
cargs.append(
cmake_backend_arg(
"python", "PYBIND11_PYTHON_VERSION", "STRING", FLAGS.rhel_py_version
)
)

return cargs


def pytorch_cmake_args(images):
if "pytorch" in images:
image = images["pytorch"]
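The new `python_cmake_args()` helper emits a flag only for RHEL builds. As a hedged illustration of what that flag likely looks like on the cmake command line: assuming `cmake_backend_arg()` renders the usual CMake cache-entry syntax (its real implementation lives elsewhere in build.py), the call turns `--rhel-py-version` into a `-D` define:

```python
# Hypothetical rendering of cmake_backend_arg(); the actual helper in
# build.py may differ in detail.
def cmake_backend_arg(backend, name, arg_type, value):
    # Standard CMake cache-entry syntax: -D<NAME>:<TYPE>=<VALUE>
    return f"-D{name}:{arg_type}={value}"

# With --rhel-py-version 3.12, the python backend build would receive:
print(cmake_backend_arg("python", "PYBIND11_PYTHON_VERSION", "STRING", "3.12"))
# -> -DPYBIND11_PYTHON_VERSION:STRING=3.12
```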
@@ -924,6 +936,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
ARG TRITON_VERSION
ARG TRITON_CONTAINER_VERSION
ENV PIP_BREAK_SYSTEM_PACKAGES=1
"""
df += """
# Install docker docker buildx
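(`PIP_BREAK_SYSTEM_PACKAGES=1` is the environment-variable form of pip's `--break-system-packages` flag, which bypasses the PEP 668 "externally managed environment" guard; the commit hoists it out of the python-backend-specific blocks further down so that every pip invocation in the buildbase, cibase, and production images is covered.)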
@@ -957,6 +970,10 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
pkg-config \\
unzip \\
wget \\
ncurses-devel \\
readline-devel \\
xz-devel \\
bzip2-devel \\
zlib-devel \\
libarchive-devel \\
libxml2-devel \\
@@ -1025,6 +1042,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
ARG TRITON_VERSION
ARG TRITON_CONTAINER_VERSION
ENV PIP_BREAK_SYSTEM_PACKAGES=1
"""
# Install the windows- or linux-specific buildbase dependencies
if target_platform() == "windows":
@@ -1035,7 +1053,6 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
df += """
# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1
# Install docker docker buildx
RUN apt-get update \\
@@ -1159,6 +1176,7 @@ def create_dockerfile_cibase(ddir, dockerfile_name, argmap):
ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
ENV PIP_BREAK_SYSTEM_PACKAGES=1
"""

with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
@@ -1198,6 +1216,8 @@ def create_dockerfile_linux(
## Production stage: Create container with just inference server executable
############################################################################
FROM ${BASE_IMAGE}
ENV PIP_BREAK_SYSTEM_PACKAGES=1
"""

df += dockerfile_prepare_container_linux(
@@ -1399,7 +1419,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
if "python" in backends:
if target_platform() == "rhel":
df += """
ENV PIP_BREAK_SYSTEM_PACKAGES=1
# python3, python3-pip and some pip installs required for the python backend
RUN yum install -y \\
libarchive-devel \\
@@ -1418,7 +1437,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
"""
else:
df += """
ENV PIP_BREAK_SYSTEM_PACKAGES=1
# python3, python3-pip and some pip installs required for the python backend
RUN apt-get update \\
&& apt-get install -y --no-install-recommends \\
@@ -1542,7 +1560,7 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):


def change_default_python_version_rhel(version):
df = """
df = f"""
# The python library version available for install via 'yum install python3.X-devel' does not
# match the version of python inside the RHEL base container. This means that python packages
# installed within the container will not be picked up by the python backend stub process pybind
@@ -1551,21 +1569,17 @@ def change_default_python_version_rhel(version):
RUN curl https://pyenv.run | bash
ENV PATH="${{PYENV_ROOT}}/bin:$PATH"
RUN eval "$(pyenv init -)"
RUN CONFIGURE_OPTS=\"--with-openssl=/usr/lib64\" && pyenv install {} \\
&& cp ${{PYENV_ROOT}}/versions/{}/lib/libpython3* /usr/lib64/""".format(
version, version
)
df += """
RUN CONFIGURE_OPTS=\"--with-openssl=/usr/lib64\" && pyenv install {version} \\
&& cp ${{PYENV_ROOT}}/versions/{version}/lib/libpython3* /usr/lib64/
# RHEL image has several python versions. It's important
# to set the correct version, otherwise, packages that are
# pip installed will not be found during testing.
ENV PYVER={} PYTHONPATH=/opt/python/v
ENV PYVER={version} PYTHONPATH=/opt/python/v
RUN ln -sf ${{PYENV_ROOT}}/versions/${{PYVER}}* ${{PYTHONPATH}}
ENV PYBIN=${{PYTHONPATH}}/bin
ENV PYTHON_BIN_PATH=${{PYBIN}}/python${{PYVER}} PATH=${{PYBIN}}:${{PATH}}
""".format(
version
)
"""
return df
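The rewrite above folds two `.format()` calls into one f-string. Worth noting for readers editing these templates: doubled braces inside an f-string escape to literal braces, so shell-style `${PYENV_ROOT}` references survive interpolation while `{version}` is substituted. A minimal standalone check:

```python
# {version} interpolates; doubled braces emit literal ${...} references
# into the generated Dockerfile text.
version = "3.12"
print(f"ENV PYVER={version} PYTHONPATH=/opt/python/v")
# -> ENV PYVER=3.12 PYTHONPATH=/opt/python/v
print(f"RUN ln -sf ${{PYENV_ROOT}}/versions/${{PYVER}}* ${{PYTHONPATH}}")
# -> RUN ln -sf ${PYENV_ROOT}/versions/${PYVER}* ${PYTHONPATH}
```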


4 changes: 2 additions & 2 deletions python/openai/openai_frontend/main.py
@@ -65,11 +65,11 @@ def start_kserve_frontends(server, args):
from tritonfrontend import KServeGrpc, KServeHttp

http_options = KServeHttp.Options(address=args.host, port=args.kserve_http_port)
http_service = KServeHttp.Server(server, http_options)
http_service = KServeHttp(server, http_options)
http_service.start()

grpc_options = KServeGrpc.Options(address=args.host, port=args.kserve_grpc_port)
grpc_service = KServeGrpc.Server(server, grpc_options)
grpc_service = KServeGrpc(server, grpc_options)
grpc_service.start()

except ModuleNotFoundError:
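For context: this branch's tritonfrontend rework makes the frontend classes directly constructible, so `KServeHttp.Server(...)` and `KServeGrpc.Server(...)` become `KServeHttp(...)` and `KServeGrpc(...)`. A rough standalone sketch under that assumption (the `tritonserver` startup line is illustrative, not taken from this diff):

```python
import tritonserver
from tritonfrontend import KServeGrpc, KServeHttp

# Illustrative in-process server; main.py builds `server` elsewhere.
server = tritonserver.Server(model_repository="/path/to/models").start()

http_service = KServeHttp(server, KServeHttp.Options(address="0.0.0.0", port=8000))
http_service.start()

grpc_service = KServeGrpc(server, KServeGrpc.Options(address="0.0.0.0", port=8001))
grpc_service.start()

# ... serve traffic, then shut down ...
http_service.stop()
grpc_service.stop()
```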
3 changes: 3 additions & 0 deletions python/openai/requirements.txt
@@ -26,4 +26,7 @@

# FastAPI Application
fastapi==0.111.1
# Fix httpx version to avoid bug in openai library:
# https://community.openai.com/t/error-with-openai-1-56-0-client-init-got-an-unexpected-keyword-argument-proxies/1040332/3
httpx==0.27.2
openai==1.40.6
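(The pin works around httpx 0.28.0 removing the long-deprecated `proxies` constructor argument that the pinned openai client still passes; 0.27.2 is the newest release that accepts it.)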
16 changes: 14 additions & 2 deletions qa/L0_backend_python/common.sh
@@ -42,7 +42,7 @@ install_conda() {
eval "$(./miniconda/bin/conda shell.bash hook)"
}

install_build_deps() {
install_build_deps_apt() {
apt update && apt install software-properties-common rapidjson-dev -y
# Using CMAKE installation instruction from:: https://apt.kitware.com/
apt update -q=2 \
@@ -54,6 +54,18 @@
&& apt-get install -y --no-install-recommends cmake=3.28.3* cmake-data=3.28.3*
}

install_build_deps_yum() {
yum install rapidjson-devel -y
}

install_build_deps() {
if [[ ${TRITON_RHEL} -eq "1" ]]; then
install_build_deps_yum
else
install_build_deps_apt
fi
}
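(Because `[[ ... -eq ... ]]` compares arithmetically, an unset `TRITON_RHEL` evaluates as 0 and `install_build_deps` falls through to the apt path, preserving the old default behavior on Debian-based images.)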

create_conda_env() {
local python_version=$1
local env_name=$2
@@ -74,6 +86,6 @@
rm -rf python_backend
git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG
(cd python_backend/ && mkdir builddir && cd builddir && \
cmake -DTRITON_ENABLE_GPU=ON -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG ../ && \
cmake -DTRITON_ENABLE_GPU=ON -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG -DPYBIND11_PYTHON_VERSION=$PY_VERSION ../ && \
make -j18 triton-python-backend-stub)
}
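(The new `-DPYBIND11_PYTHON_VERSION=$PY_VERSION` flag pins which Python interpreter pybind11 resolves when building the stub, so the stub links against the same Python version as the conda environment under test; the `PY_VERSION` exports added to env/test.sh below feed this flag.)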
37 changes: 25 additions & 12 deletions qa/L0_backend_python/env/test.sh
@@ -44,6 +44,7 @@ install_conda
# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of
# the Python model indicates that the environment has been setup correctly.
# Create a model with python 3.7 version
export PY_VERSION="3.7"
create_conda_env "3.7" "python-3-7"
conda install numpy=1.20.1 -y
conda install tensorflow=2.1.0 -y
@@ -67,6 +68,7 @@ conda deactivate
# previous test.
# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of
# the Python model indicates that the environment has been setup correctly.
export PY_VERSION="3.7.1"
path_to_conda_pack="$PWD/python-3-7-1"
create_conda_env_with_specified_path "3.7" $path_to_conda_pack
conda install numpy=1.20.3 -y
@@ -89,6 +91,7 @@
# Create a model with python 3.6 version
# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of
# the Python model indicates that the environment has been setup correctly.
export PY_VERSION="3.6"
create_conda_env "3.6" "python-3-6"
conda install -c conda-forge libstdcxx-ng=14 -y
conda install numpy=1.18.1 -y
@@ -116,9 +119,13 @@ conda deactivate
path_to_conda_pack='$$TRITON_MODEL_DIRECTORY/python_3_12_environment.tar.gz'
create_conda_env "3.12" "python-3-12"
conda install -c conda-forge libstdcxx-ng=14 -y
TF_VERSION="2.16.2"
conda install numpy=1.26.4 -y
conda install tensorflow=2.16.2 -y
PY312_VERSION_STRING="Python version is 3.12, NumPy version is 1.26.4, and Tensorflow version is 2.16.2"
if [ $TRITON_RHEL -eq 1 ]; then
TF_VERSION="2.17.0"
fi
conda install tensorflow=${TF_VERSION} -y
PY312_VERSION_STRING="Python version is 3.12, NumPy version is 1.26.4, and Tensorflow version is ${TF_VERSION}"
conda pack -o python3.12.tar.gz
mkdir -p models/python_3_12/1/
cp ../../python_models/python_version/config.pbtxt ./models/python_3_12
@@ -137,8 +144,7 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

kill $SERVER_PID
wait $SERVER_PID
kill_server
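(`kill_server` presumably wraps the former `kill $SERVER_PID; wait $SERVER_PID` pairs in a shared helper; its definition is outside this diff.)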

set +e
for EXPECTED_VERSION_STRING in "$PY36_VERSION_STRING" "$PY37_VERSION_STRING" "$PY37_1_VERSION_STRING" "$PY312_VERSION_STRING"; do
@@ -154,6 +160,15 @@ done
# NOTE: In certain pybind versions, the locale settings may not be propagated from parent to
# stub processes correctly. See https://github.com/triton-inference-server/python_backend/pull/260.
export LC_ALL=INVALID
run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

kill_server

grep "Locale is (None, None)" $SERVER_LOG
if [ $? -ne 0 ]; then
cat $SERVER_LOG
@@ -175,8 +190,7 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

kill $SERVER_PID
wait $SERVER_PID
kill_server

set +e
grep "Locale is ('en_US', 'UTF-8')" $SERVER_LOG
@@ -207,8 +221,7 @@ touch -m models/python_3_12/python_3_12_environment.tar.gz
# The environment should be re-extracted
curl -v -X POST localhost:8000/v2/repository/models/python_3_12/load

kill $SERVER_PID
wait $SERVER_PID
kill_server

set +e

@@ -248,6 +261,8 @@ rm -rf models/python_3_7
aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*"

rm $SERVER_LOG
# Occasionally needs more time to load
SERVER_TIMEOUT=420

SERVER_ARGS="--model-repository=$BUCKET_URL_SLASH --log-verbose=1"
run_server
@@ -258,8 +273,7 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

kill $SERVER_PID
wait $SERVER_PID
kill_server

set +e
grep "$PY36_VERSION_STRING" $SERVER_LOG
@@ -292,8 +306,7 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

kill $SERVER_PID
wait $SERVER_PID
kill_server

set +e
for EXPECTED_VERSION_STRING in "$PY36_VERSION_STRING" "$PY312_VERSION_STRING"; do
27 changes: 27 additions & 0 deletions qa/L0_backend_python/io/io_test.py
@@ -259,6 +259,33 @@ def test_requested_output_decoupled(self):
self.assertTrue(np.allclose(gpu_output_data[1:], next_gpu_output_data))
self.assertTrue(user_data._completed_requests.empty())

# Assert a prior crash is fixed regarding requested output on a decoupled model.
def test_requested_output_decoupled_prior_crash(self):
model_name = "llm"
prompt = "test"

text_input_data = np.array([[prompt]]).astype(object)
inputs = [grpcclient.InferInput("text_input", text_input_data.shape, "BYTES")]
inputs[-1].set_data_from_numpy(text_input_data)

requested_outputs = [grpcclient.InferRequestedOutput("text_output")]

user_data = UserData()
with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client:
client.start_stream(callback=partial(callback, user_data))
client.async_stream_infer(
model_name=model_name, inputs=inputs, outputs=requested_outputs
)
client.stop_stream()

outputs = ""
while not user_data._completed_requests.empty():
result = user_data._completed_requests.get(block=False)
if isinstance(result, InferenceServerException):
raise result
outputs += str(result.as_numpy("text_output")[0], encoding="utf-8")
self.assertGreater(len(outputs), 0, "text_output is empty")


if __name__ == "__main__":
unittest.main()
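The new test leans on the `UserData`/`callback` streaming helpers defined earlier in io_test.py. For readers outside the test suite, a minimal sketch of that pattern (names match the diff; the bodies are assumptions):

```python
import queue

class UserData:
    def __init__(self):
        # Streamed responses (or errors) accumulate here for later draining.
        self._completed_requests = queue.Queue()

def callback(user_data, result, error):
    # The gRPC stream invokes this once per response; errors arrive as
    # InferenceServerException instances and are queued alongside results.
    user_data._completed_requests.put(error if error is not None else result)

# The test binds the queue with functools.partial(callback, user_data) and
# drains user_data._completed_requests after stop_stream().
```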
(The remaining 15 changed files are not shown.)
