diff --git a/build.py b/build.py index 8985646e1d..b74d3bc96c 100755 --- a/build.py +++ b/build.py @@ -74,7 +74,9 @@ "2023.0.0", # ORT OpenVINO "2023.0.0", # Standalone OpenVINO "2.4.7", # DCGM version - "py310_23.1.0-1", # Conda version. + "py310_23.1.0-1", # Conda version + "9.1.0.1", # TRT version for building TRT-LLM backend + "12.2", # CUDA version for building TRT-LLM backend ) } @@ -564,6 +566,8 @@ def backend_cmake_args(images, components, be, install_dir, library_paths): args = fastertransformer_cmake_args() elif be == "tensorrt": args = tensorrt_cmake_args() + elif be == "tensorrtllm": + args = tensorrtllm_cmake_args(images) else: args = [] @@ -859,6 +863,39 @@ def fastertransformer_cmake_args(): return cargs +def tensorrtllm_cmake_args(images): + cargs = [ + cmake_backend_arg( + "tensorrtllm", + "TRT_LIB_DIR", + None, + "${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib", + ), + cmake_backend_arg( + "tensorrtllm", "TRT_INCLUDE_DIR", None, "${TRT_ROOT}/include" + ), + cmake_backend_arg( + "tensorrtllm", + "TRTLLM_BUILD_CONTAINER", + None, + images["base"], + ), + cmake_backend_arg( + "tensorrtllm", + "TENSORRT_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][7], + ), + cmake_backend_arg( + "tensorrtllm", + "CUDA_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][8], + ), + ] + return cargs + + def install_dcgm_libraries(dcgm_version, target_machine): if dcgm_version == "": fail( @@ -1278,6 +1315,27 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach pip3 install --upgrade numpy && \ rm -rf /var/lib/apt/lists/* """ + # Add dependencies needed for tensorrtllm backend + if "tensorrtllm" in backends: + be = "tensorrtllm" + import importlib.util + + import requests + + # FIXME: Update the url + url = "https://gitlab-master.nvidia.com/krish/tensorrtllm_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format( + backends[be] + ) + + response = requests.get(url) + spec = importlib.util.spec_from_loader( + "trtllm_buildscript", loader=None, origin=url + ) + trtllm_buildscript = importlib.util.module_from_spec(spec) + exec(response.content, trtllm_buildscript.__dict__) + df += trtllm_buildscript.create_postbuild( + argmap["TRT_LLM_TRT_VERSION"], argmap["TRT_LLM_CUDA_VERSION"] + ) df += """ WORKDIR /opt/tritonserver @@ -1441,6 +1499,8 @@ def create_build_dockerfiles( if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP else TRITON_VERSION_MAP[FLAGS.version][6], } + dockerfileargmap["TRT_LLM_TRT_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][7] + dockerfileargmap["TRT_LLM_CUDA_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][8] # For CPU-only image we need to copy some cuda libraries and dependencies # since we are using PyTorch and TensorFlow containers that @@ -1726,6 +1786,12 @@ def core_build( cmake_script.blankln() +def tensorrtllm_prebuild(cmake_script): + # Export the TRT_ROOT environment variable + cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt") + cmake_script.cmd("export ARCH=$(uname -m)") + + def backend_build( be, cmake_script, @@ -1746,7 +1812,16 @@ def backend_build( cmake_script.comment() cmake_script.mkdir(build_dir) cmake_script.cwd(build_dir) - cmake_script.gitclone(backend_repo(be), tag, be, github_organization) + # FIXME: Use GitHub repo + if be == "tensorrtllm": + cmake_script.gitclone( + backend_repo(be), tag, be, "https://gitlab-master.nvidia.com/krish" + ) + else: + cmake_script.gitclone(backend_repo(be), tag, be, github_organization) + + if be == "tensorrtllm": + tensorrtllm_prebuild(cmake_script) cmake_script.mkdir(repo_build_dir) cmake_script.cwd(repo_build_dir) @@ -1757,6 +1832,7 @@ def backend_build( cmake_script.mkdir(os.path.join(install_dir, "backends")) cmake_script.rmdir(os.path.join(install_dir, "backends", be)) + cmake_script.cpdir( os.path.join(repo_install_dir, "backends", be), os.path.join(install_dir, "backends"),