forked from InternLM/lmdeploy
Merge pull request #5 from zyearw1024/adjust/add_monkey_patch_20241009
Adjust/add monkey patch 20241009
Showing 7 changed files with 339 additions and 0 deletions.
@@ -0,0 +1,12 @@
/.github/
/.venv
/build
dist
# Ignore deploy_docker
deploy_docker/
docker/
*.so

# LMDeploy
workspace/
work_dir*/
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
set -eux

# Set CUDA version based on nvcc output
export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p' | tr -d '.')

# Determine the platform name based on the current architecture
ARCH=$(uname -m)
if [ "$ARCH" == "x86_64" ]; then
    export PLAT_NAME="manylinux2014_x86_64"
elif [ "$ARCH" == "aarch64" ]; then
    export PLAT_NAME="manylinux2014_aarch64"
else
    echo "Unsupported architecture: $ARCH"
    exit 1
fi

# # Install necessary packages
# apt update -y

# Clean up and prepare the build directory
rm -rf /tmpbuild
mkdir -p /tmpbuild

# Install build dependencies with caching
mkdir -p /docker_build_cache/.pip
pip3 install --cache-dir /docker_build_cache/.pip ninja cmake wheel

# Ensure the target directory exists before copying
mkdir -p /lmdeploy

# Copy source files to the build directory
cp -r /ant_lmdeploy/* /lmdeploy/

# Build the project
cd /lmdeploy
rm -rf /lmdeploy/lib
mkdir -p build && cd build && rm -rf *

# Use generate.sh to set up the build environment with external cache directory
bash ../generate.sh

ninja -j$(nproc) && ninja install || { echo "Build failed"; exit 1; }

cd ..
rm -rf build

# Update version information if LMDEPLOY_VERSION is set
if [ -n "$LMDEPLOY_VERSION" ]; then
    sed -i "s/__version__ = '.*'/__version__ = '$LMDEPLOY_VERSION'/" /lmdeploy/lmdeploy/version.py
fi

# Build the wheel with the determined platform name
python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/

# Process the built wheel to include CUDA version information
for whl in /tmpbuild/*.whl; do
    base_name=$(basename "$whl" .whl)

    # Extract version number and add CUDA information
    version=$(echo "$base_name" | sed -n 's/.*-\([0-9.]*\)-cp.*/\1/p')
    new_version="${version}+cu${CUDAVER}"

    # Construct the new file name
    new_base_name=$(echo "$base_name" | sed "s/${version}/${new_version}/")

    mv "$whl" "/tmpbuild/${new_base_name}.whl"

    # Check WRITE_WHL environment variable to determine if the wheel should be copied to /lmdeploy_build
    if [ "$WRITE_WHL" == "true" ]; then
        if [ ! -d "/lmdeploy_build" ]; then
            mkdir -p /lmdeploy_build
        fi
        cp "/tmpbuild/${new_base_name}.whl" "/lmdeploy_build/${new_base_name}.whl"
    fi
done
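This build script is wired up by the Dockerfile and compose file further down, which mount the repository at /ant_lmdeploy and run the script inside the NGC PyTorch image. A minimal sketch of an equivalent manual invocation (not part of this commit; paths, image tag, and version are illustrative and taken from those files):

# Sketch only: run the build container by hand instead of via compose or the Dockerfile.
# Assumes the repo root is the current directory and the host paths mirror the compose mounts.
docker run --rm \
  -v "$(pwd)":/ant_lmdeploy \
  -v "$(pwd)/docker/lmdeploy_build":/lmdeploy_build \
  -v "$(pwd)/docker/docker_build_cache":/docker_build_cache \
  -e LMDEPLOY_VERSION=0.6.1.1 -e WRITE_WHL=true \
  nvcr.io/nvidia/pytorch:24.02-py3 \
  bash /ant_lmdeploy/builder/manywheel/entrypoint_build_ngc.sh
# The renamed wheel (e.g. lmdeploy-0.6.1.1+cu123-...) should land in ./docker/lmdeploy_build/ on the host.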
@@ -0,0 +1,101 @@
ARG LMDEPLOY_VERSION=0.6.1.1

# Stage 1: Build the WHL file
FROM nvcr.io/nvidia/pytorch:24.02-py3 AS builder

# Ubuntu 22.04 including Python 3.10
# NVIDIA CUDA 12.3.2
# NVIDIA cuBLAS 12.3.4.1
# NVIDIA cuDNN 9.0.0.306
# NVIDIA NCCL 2.19.4
# NVIDIA RAPIDS™ 23.12
# rdma-core 39.0
# NVIDIA HPC-X 2.16rc4
# OpenMPI 4.1.4+
# GDRCopy 2.3
# TensorBoard 2.9.0
# Nsight Compute 2023.3.1.1
# Nsight Systems 2023.4.1.97
# NVIDIA TensorRT™ 8.6.3
# Torch-TensorRT 2.2.0a0
# NVIDIA DALI® 1.34
# MAGMA 2.6.2
# JupyterLab 2.3.2 including Jupyter-TensorBoard
# TransformerEngine 1.3
# PyTorch quantization wheel 2.1.2
ARG LMDEPLOY_VERSION
ENV LMDEPLOY_VERSION=${LMDEPLOY_VERSION}
RUN echo "Stage 1 LMDEPLOY_VERSION: ${LMDEPLOY_VERSION}"

# Set environment variables
ENV TZ=Asia/Shanghai
ENV LOG_LEVEL=INFO
ENV CUDA_VISIBLE_DEVICES=0
ENV WRITE_WHL="true"

# Copy necessary files
COPY ./../ /ant_lmdeploy
COPY ./../builder/manywheel/entrypoint_build_ngc.sh /entrypoint_build.sh

# Build the WHL file
RUN sh /entrypoint_build.sh

# List contents of /tmpbuild for debugging
RUN ls -la /tmpbuild/

# Verify the WHL file
RUN WHL_FILE=$(ls /tmpbuild/lmdeploy-${LMDEPLOY_VERSION}*.whl) && \
    echo "Found WHL file: ${WHL_FILE}"

# List contents of /lmdeploy_build for debugging
RUN ls -la /lmdeploy_build/

# Stage 2: Create a minimal stage to copy the WHL file
FROM scratch AS exporter
COPY --from=builder /lmdeploy_build/*.whl .

# Stage 3: Create the final image
FROM nvcr.io/nvidia/pytorch:24.02-py3

ARG LMDEPLOY_VERSION
ENV LMDEPLOY_VERSION=${LMDEPLOY_VERSION}

# Set CUDA architecture list
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

# Install dependencies
COPY ./../requirements/ngc-build.txt /workspace/requirements-ngc-build.txt
RUN pip3 install -r /workspace/requirements-ngc-build.txt

# Display detailed information about the LMDEPLOY_VERSION for verification
RUN echo "Stage 3 LMDEPLOY_VERSION: ${LMDEPLOY_VERSION}" && \
    echo "Current CUDA architecture list: ${TORCH_CUDA_ARCH_LIST}" && \
    echo "Environment variables set: TZ=${TZ}, LOG_LEVEL=${LOG_LEVEL}, CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, WRITE_WHL=${WRITE_WHL}"

# Copy and install the WHL file
COPY --from=builder /tmpbuild/lmdeploy-${LMDEPLOY_VERSION}*.whl /workspace/

# Ensure the WHL file is correctly specified
RUN WHL_FILE=$(ls /workspace/lmdeploy-${LMDEPLOY_VERSION}*.whl) && \
    echo "Installing WHL file: ${WHL_FILE}" && \
    if [ -n "$WHL_FILE" ]; then pip3 install "$WHL_FILE" --no-deps; else echo "No WHL file found"; exit 1; fi

# # Install triton
# RUN pip3 install triton==2.1.0

WORKDIR /workspace


# Example build commands

# Step 1: If not in the docker directory, navigate to it first
# cd docker

# Step 2: Build and export the WHL file locally
# This step builds the WHL file and exports it to the local ./lmdeploy_build directory
# time DOCKER_BUILDKIT=1 docker build --progress=plain --platform linux/amd64 --build-arg LMDEPLOY_VERSION=0.6.1.3 --target exporter --output type=local,dest=./lmdeploy_build -f Dockerfile_ngc ..

# Step 3: Build the final image
# This step builds the complete image, including installing the WHL file
# time DOCKER_BUILDKIT=1 docker build --progress=plain --platform linux/amd64 -t ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") --build-arg LMDEPLOY_VERSION=0.6.1.3 -f Dockerfile_ngc ..
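After the Step 3 build, a quick smoke test of the final image might look like the following (sketch only, not part of this commit; the tag must match whatever was passed to docker build):

# Verify that the wheel installed into the final image reports the expected version.
docker run --rm ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") \
  python3 -c "import lmdeploy; print(lmdeploy.__version__)"
# Expected output (hypothetically): 0.6.1.3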
@@ -0,0 +1,51 @@
version: "3.9"
# FROM nvcr.io/nvidia/pytorch:24.02-py3

# Ubuntu 22.04 including Python 3.10
# NVIDIA CUDA 12.3.2
# NVIDIA cuBLAS 12.3.4.1
# NVIDIA cuDNN 9.0.0.306
# NVIDIA NCCL 2.19.4
# NVIDIA RAPIDS™ 23.12
# rdma-core 39.0
# NVIDIA HPC-X 2.16rc4
# OpenMPI 4.1.4+
# GDRCopy 2.3
# TensorBoard 2.9.0
# Nsight Compute 2023.3.1.1
# Nsight Systems 2023.4.1.97
# NVIDIA TensorRT™ 8.6.3
# Torch-TensorRT 2.2.0a0
# NVIDIA DALI® 1.34
# MAGMA 2.6.2
# JupyterLab 2.3.2 including Jupyter-TensorBoard
# TransformerEngine 1.3
# PyTorch quantization wheel 2.1.2
x-node-common:
  &node-common
  platform: linux/amd64
  environment:
    &node-common-env
    TZ: Asia/Shanghai
    LOG_LEVEL: INFO
    CUDA_VISIBLE_DEVICES: 0
    LMDEPLOY_VERSION: ${LMDEPLOY_VERSION:-0.6.1.1}
    WRITE_WHL: "true"  # add this environment variable; default is "true"
  image: nvcr.io/nvidia/pytorch:24.02-py3
  logging:
    driver: json-file
    options:
      max-size: "100m"
      max-file: "10"

services:
  build-lmdeploy-whl-amd64-01:
    <<: *node-common
    container_name: build-lmdeploy-whl-amd64-01
    volumes:
      - ./../:/ant_lmdeploy
      - ./lmdeploy_build:/lmdeploy_build
      - ./docker_build_cache:/docker_build_cache
      - ./../builder/manywheel/entrypoint_build_ngc.sh:/entrypoint_build.sh
    entrypoint: sh /entrypoint_build.sh
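A likely way to drive this compose file (sketch only; its relative volume paths suggest it lives in the docker/ directory, and you may need -f if the file name is not a compose default):

cd docker
LMDEPLOY_VERSION=0.6.1.1 docker compose up build-lmdeploy-whl-amd64-01
# On success the entrypoint script copies the wheel to /lmdeploy_build in the
# container, which is bind-mounted to ./lmdeploy_build on the host.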
@@ -0,0 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.patch.monkey_patch import patch_all; patch_all()
print("patching startup.py")
from .entrypoint import run

if __name__ == '__main__':
    run()
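Since the built wheel has to ship the new lmdeploy.patch package for this import to succeed, a quick check against the built image could be (sketch only; the image tag is the hypothetical one from the Dockerfile example above):

docker run --rm ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") \
  python3 -c "from lmdeploy.patch.monkey_patch import patch_all; patch_all(); print('patch ok')"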
Empty file.
@@ -0,0 +1,92 @@
import logging

from argparse import _SubParsersAction, ArgumentParser
from functools import cache
from lmdeploy.cli.serve import SubCliServe
from lmdeploy.serve.openai import api_server as openai_api_serve
from lmdeploy.serve.openai.protocol import StreamOptions

logger = logging.getLogger(__name__)


class ApiServeCliContext:
    args = None


def _set_api_serve_cli_context(args):
    """Set the API serve CLI context with the provided arguments."""
    ApiServeCliContext.args = args


@cache
def get_stream_include_usage_status():
    """Get the status of whether to include stream usage data in the output."""
    try:
        return ApiServeCliContext.args.enable_stream_include_usage
    except:
        return False


_origin_parse_args = ArgumentParser.parse_args
def _patch_parse_args(self, args=None, namespace=None):
    """Patch the parse_args method to set the API serve CLI context if the command is 'serve'."""
    parser_args = _origin_parse_args(self, args=args, namespace=namespace)
    command = getattr(parser_args, "command", None)
    if command != "serve":
        return parser_args
    _set_api_serve_cli_context(parser_args)

    return parser_args


def get_api_serve_cli_context():
    """Get the API serve CLI context parser."""
    api_server_parser = SubCliServe.subparsers.choices['api_server']
    return api_server_parser


_origin_api_serve_check_request = openai_api_serve.check_request
def _patch_api_serve_check_request(request):
    """Patch the check_request method to include stream usage data if enabled."""
    enable_stream_include_usage_status = get_stream_include_usage_status()
    if enable_stream_include_usage_status:
        stream_options = getattr(request, "stream_options", None)
        if not stream_options:
            stream_options = StreamOptions(include_usage=True)
            request.stream_options = stream_options

    r = _origin_api_serve_check_request(request)
    return r


def _patch_api_server_add_parser(parser):
    """Patch the API server parser to add the --enable-stream-include-usage argument."""
    parser.add_argument(
        "--enable-stream-include-usage",
        action="store_true",
        help="Enable the inclusion of stream usage data in the output, useful for monitoring performance.",
    )
    return parser


def _patch_sub_parser_add_parser(self, name, **kwargs):
    """Patch the subparser add_parser method to include the --enable-stream-include-usage argument for 'api_server'."""
    _parser = self._origin_sub_parser_add_parser(name, **kwargs)

    if name != "api_server":
        return _parser
    logger.info("patching api_server add_parser")
    _parser = _patch_api_server_add_parser(_parser)

    return _parser


def patch_all():
    """Apply all monkey patches."""
    logger.info("monkey patching all")

    # Patch _SubParsersAction.add_parser
    _SubParsersAction._origin_sub_parser_add_parser = _SubParsersAction.add_parser
    _SubParsersAction.add_parser = _patch_sub_parser_add_parser

    # Patch openai_api_serve
    openai_api_serve.check_request = _patch_api_serve_check_request

    # Patch ArgumentParser.parse_args
    ArgumentParser.parse_args = _patch_parse_args
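To see the effect of the new flag end to end, something along these lines should work once the patched entrypoint is in use (sketch only: whether the lmdeploy console script or python3 -m lmdeploy picks up the patched __main__ depends on where that file lives, and the model path, port, and payload are placeholders):

# Start the OpenAI-compatible server with the patched flag.
python3 -m lmdeploy serve api_server /path/to/model \
  --server-port 23333 --enable-stream-include-usage &

# With the flag on, the patched check_request fills in stream_options with
# include_usage=True whenever the client leaves it unset, so the final streamed
# chunk should carry a "usage" object even though the request below never asks for it.
curl -sN http://127.0.0.1:23333/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "placeholder-model", "stream": true,
       "messages": [{"role": "user", "content": "hi"}]}' | tail -n 5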