forked from InternLM/lmdeploy
Merge pull request #5 from zyearw1024/adjust/add_monkey_patch_20241009
Adjust/add monkey patch 20241009
Showing 7 changed files with 339 additions and 0 deletions.
@@ -0,0 +1,12 @@
/.github/
/.venv
/build
dist
# Ignore deploy_docker
deploy_docker/
docker/
*.so

# LMDeploy
workspace/
work_dir*/
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
set -eux

# Set CUDA version based on nvcc output
export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p' | tr -d '.')

# Determine the platform name based on the current architecture
ARCH=$(uname -m)
if [ "$ARCH" == "x86_64" ]; then
    export PLAT_NAME="manylinux2014_x86_64"
elif [ "$ARCH" == "aarch64" ]; then
    export PLAT_NAME="manylinux2014_aarch64"
else
    echo "Unsupported architecture: $ARCH"
    exit 1
fi

# # Install necessary packages
# apt update -y

# Clean up and prepare the build directory
rm -rf /tmpbuild
mkdir -p /tmpbuild

# Install build dependencies with caching
mkdir -p /docker_build_cache/.pip
pip3 install --cache-dir /docker_build_cache/.pip ninja cmake wheel

# Ensure the target directory exists before copying
mkdir -p /lmdeploy

# Copy source files to the build directory
cp -r /ant_lmdeploy/* /lmdeploy/

# Build the project
cd /lmdeploy
rm -rf /lmdeploy/lib
mkdir -p build && cd build && rm -rf *

# Use generate.sh to set up the build environment with external cache directory
bash ../generate.sh

ninja -j$(nproc) && ninja install || { echo "Build failed"; exit 1; }

cd ..
rm -rf build

# Update version information if LMDEPLOY_VERSION is set
if [ -n "$LMDEPLOY_VERSION" ]; then
    sed -i "s/__version__ = '.*'/__version__ = '$LMDEPLOY_VERSION'/" /lmdeploy/lmdeploy/version.py
fi

# Build the wheel with the determined platform name
python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/

# Process the built wheel to include CUDA version information
for whl in /tmpbuild/*.whl; do
    base_name=$(basename "$whl" .whl)

    # Extract version number and add CUDA information
    version=$(echo "$base_name" | sed -n 's/.*-\([0-9.]*\)-cp.*/\1/p')
    new_version="${version}+cu${CUDAVER}"

    # Construct the new file name
    new_base_name=$(echo "$base_name" | sed "s/${version}/${new_version}/")

    mv "$whl" "/tmpbuild/${new_base_name}.whl"

    # Check WRITE_WHL environment variable to determine if the wheel should be copied to /lmdeploy_build
    if [ "$WRITE_WHL" == "true" ]; then
        if [ ! -d "/lmdeploy_build" ]; then
            mkdir -p /lmdeploy_build
        fi
        cp "/tmpbuild/${new_base_name}.whl" "/lmdeploy_build/${new_base_name}.whl"
    fi
done
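This build script is wired up by the Dockerfile and compose file further down, which mount the repository at /ant_lmdeploy and run the script inside the NGC PyTorch image. A minimal sketch of an equivalent manual invocation (not part of this commit; paths, image tag, and version are illustrative and taken from those files):

# Sketch only: run the build container by hand instead of via compose or the Dockerfile.
# Assumes the repo root is the current directory and the host paths mirror the compose mounts.
docker run --rm \
  -v "$(pwd)":/ant_lmdeploy \
  -v "$(pwd)/docker/lmdeploy_build":/lmdeploy_build \
  -v "$(pwd)/docker/docker_build_cache":/docker_build_cache \
  -e LMDEPLOY_VERSION=0.6.1.1 -e WRITE_WHL=true \
  nvcr.io/nvidia/pytorch:24.02-py3 \
  bash /ant_lmdeploy/builder/manywheel/entrypoint_build_ngc.sh
# The renamed wheel (e.g. lmdeploy-0.6.1.1+cu123-...) should land in ./docker/lmdeploy_build/ on the host.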
@@ -0,0 +1,101 @@
ARG LMDEPLOY_VERSION=0.6.1.1

# Stage 1: Build the WHL file
FROM nvcr.io/nvidia/pytorch:24.02-py3 AS builder

# Ubuntu 22.04 including Python 3.10
# NVIDIA CUDA 12.3.2
# NVIDIA cuBLAS 12.3.4.1
# NVIDIA cuDNN 9.0.0.306
# NVIDIA NCCL 2.19.4
# NVIDIA RAPIDS™ 23.12
# rdma-core 39.0
# NVIDIA HPC-X 2.16rc4
# OpenMPI 4.1.4+
# GDRCopy 2.3
# TensorBoard 2.9.0
# Nsight Compute 2023.3.1.1
# Nsight Systems 2023.4.1.97
# NVIDIA TensorRT™ 8.6.3
# Torch-TensorRT 2.2.0a0
# NVIDIA DALI® 1.34
# MAGMA 2.6.2
# JupyterLab 2.3.2 including Jupyter-TensorBoard
# TransformerEngine 1.3
# PyTorch quantization wheel 2.1.2
ARG LMDEPLOY_VERSION
ENV LMDEPLOY_VERSION=${LMDEPLOY_VERSION}
RUN echo "Stage 1 LMDEPLOY_VERSION: ${LMDEPLOY_VERSION}"

# Set environment variables
ENV TZ=Asia/Shanghai
ENV LOG_LEVEL=INFO
ENV CUDA_VISIBLE_DEVICES=0
ENV WRITE_WHL="true"

# Copy necessary files
COPY ./../ /ant_lmdeploy
COPY ./../builder/manywheel/entrypoint_build_ngc.sh /entrypoint_build.sh

# Build the WHL file
RUN sh /entrypoint_build.sh

# List contents of /tmpbuild for debugging
RUN ls -la /tmpbuild/

# Verify the WHL file
RUN WHL_FILE=$(ls /tmpbuild/lmdeploy-${LMDEPLOY_VERSION}*.whl) && \
    echo "Found WHL file: ${WHL_FILE}"

# List contents of /lmdeploy_build for debugging
RUN ls -la /lmdeploy_build/

# Stage 2: Create a minimal stage to copy the WHL file
FROM scratch AS exporter
COPY --from=builder /lmdeploy_build/*.whl .

# Stage 3: Create the final image
FROM nvcr.io/nvidia/pytorch:24.02-py3

ARG LMDEPLOY_VERSION
ENV LMDEPLOY_VERSION=${LMDEPLOY_VERSION}

# Set CUDA architecture list
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

# Install dependencies
COPY ./../requirements/ngc-build.txt /workspace/requirements-ngc-build.txt
RUN pip3 install -r /workspace/requirements-ngc-build.txt

# Display detailed information about the LMDEPLOY_VERSION for verification
RUN echo "Stage 3 LMDEPLOY_VERSION: ${LMDEPLOY_VERSION}" && \
    echo "Current CUDA architecture list: ${TORCH_CUDA_ARCH_LIST}" && \
    echo "Environment variables set: TZ=${TZ}, LOG_LEVEL=${LOG_LEVEL}, CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, WRITE_WHL=${WRITE_WHL}"

# Copy and install the WHL file
COPY --from=builder /tmpbuild/lmdeploy-${LMDEPLOY_VERSION}*.whl /workspace/

# Ensure the WHL file is correctly specified
RUN WHL_FILE=$(ls /workspace/lmdeploy-${LMDEPLOY_VERSION}*.whl) && \
    echo "Installing WHL file: ${WHL_FILE}" && \
    if [ -n "$WHL_FILE" ]; then pip3 install "$WHL_FILE" --no-deps; else echo "No WHL file found"; exit 1; fi

# # Install triton
# RUN pip3 install triton==2.1.0

WORKDIR /workspace


# Example build commands

# Step 1: If not in the docker directory, navigate to it first
# cd docker

# Step 2: Build and export the WHL file locally
# This step builds the WHL file and exports it to the local ./lmdeploy_build directory
# time DOCKER_BUILDKIT=1 docker build --progress=plain --platform linux/amd64 --build-arg LMDEPLOY_VERSION=0.6.1.3 --target exporter --output type=local,dest=./lmdeploy_build -f Dockerfile_ngc ..

# Step 3: Build the final image
# This step builds the complete image, including installing the WHL file
# time DOCKER_BUILDKIT=1 docker build --progress=plain --platform linux/amd64 -t ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") --build-arg LMDEPLOY_VERSION=0.6.1.3 -f Dockerfile_ngc ..
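After the Step 3 build, a quick smoke test of the final image might look like the following (sketch only, not part of this commit; the tag must match whatever was passed to docker build):

# Verify that the wheel installed into the final image reports the expected version.
docker run --rm ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") \
  python3 -c "import lmdeploy; print(lmdeploy.__version__)"
# Expected output (hypothetically): 0.6.1.3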
@@ -0,0 +1,51 @@
version: "3.9"
# FROM nvcr.io/nvidia/pytorch:24.02-py3

# Ubuntu 22.04 including Python 3.10
# NVIDIA CUDA 12.3.2
# NVIDIA cuBLAS 12.3.4.1
# NVIDIA cuDNN 9.0.0.306
# NVIDIA NCCL 2.19.4
# NVIDIA RAPIDS™ 23.12
# rdma-core 39.0
# NVIDIA HPC-X 2.16rc4
# OpenMPI 4.1.4+
# GDRCopy 2.3
# TensorBoard 2.9.0
# Nsight Compute 2023.3.1.1
# Nsight Systems 2023.4.1.97
# NVIDIA TensorRT™ 8.6.3
# Torch-TensorRT 2.2.0a0
# NVIDIA DALI® 1.34
# MAGMA 2.6.2
# JupyterLab 2.3.2 including Jupyter-TensorBoard
# TransformerEngine 1.3
# PyTorch quantization wheel 2.1.2
x-node-common:
  &node-common
  platform: linux/amd64
  environment:
    &node-common-env
    TZ: Asia/Shanghai
    LOG_LEVEL: INFO
    CUDA_VISIBLE_DEVICES: 0
    LMDEPLOY_VERSION: ${LMDEPLOY_VERSION:-0.6.1.1}
    WRITE_WHL: "true"  # add this environment variable; default is "true"
  image: nvcr.io/nvidia/pytorch:24.02-py3
  logging:
    driver: json-file
    options:
      max-size: "100m"
      max-file: "10"

services:
  build-lmdeploy-whl-amd64-01:
    <<: *node-common
    container_name: build-lmdeploy-whl-amd64-01
    volumes:
      - ./../:/ant_lmdeploy
      - ./lmdeploy_build:/lmdeploy_build
      - ./docker_build_cache:/docker_build_cache
      - ./../builder/manywheel/entrypoint_build_ngc.sh:/entrypoint_build.sh
    entrypoint: sh /entrypoint_build.sh
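A likely way to drive this compose file (sketch only; its relative volume paths suggest it lives in the docker/ directory, and you may need -f if the file name is not a compose default):

cd docker
LMDEPLOY_VERSION=0.6.1.1 docker compose up build-lmdeploy-whl-amd64-01
# On success the entrypoint script copies the wheel to /lmdeploy_build in the
# container, which is bind-mounted to ./lmdeploy_build on the host.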
@@ -0,0 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.patch.monkey_patch import patch_all; patch_all()
print("patching startup.py")
from .entrypoint import run

if __name__ == '__main__':
    run()
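Since the built wheel has to ship the new lmdeploy.patch package for this import to succeed, a quick check against the built image could be (sketch only; the image tag is the hypothetical one from the Dockerfile example above):

docker run --rm ant_lmdeploy:v0.6.1.3_cu123_$(date +"%Y%m%d") \
  python3 -c "from lmdeploy.patch.monkey_patch import patch_all; patch_all(); print('patch ok')"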
Empty file.
@@ -0,0 +1,92 @@
import logging

from argparse import _SubParsersAction, ArgumentParser
from functools import cache
from lmdeploy.cli.serve import SubCliServe
from lmdeploy.serve.openai import api_server as openai_api_serve
from lmdeploy.serve.openai.protocol import StreamOptions

logger = logging.getLogger(__name__)


class ApiServeCliContext:
    args = None


def _set_api_serve_cli_context(args):
    """Set the API serve CLI context with the provided arguments."""
    ApiServeCliContext.args = args


@cache
def get_stream_include_usage_status():
    """Get the status of whether to include stream usage data in the output."""
    try:
        return ApiServeCliContext.args.enable_stream_include_usage
    except:
        return False


_origin_parse_args = ArgumentParser.parse_args
def _patch_parse_args(self, args=None, namespace=None):
    """Patch the parse_args method to set the API serve CLI context if the command is 'serve'."""
    parser_args = _origin_parse_args(self, args=args, namespace=namespace)
    command = getattr(parser_args, "command", None)
    if command != "serve":
        return parser_args
    _set_api_serve_cli_context(parser_args)

    return parser_args


def get_api_serve_cli_context():
    """Get the API serve CLI context parser."""
    api_server_parser = SubCliServe.subparsers.choices['api_server']
    return api_server_parser


_origin_api_serve_check_request = openai_api_serve.check_request
def _patch_api_serve_check_request(request):
    """Patch the check_request method to include stream usage data if enabled."""
    enable_stream_include_usage_status = get_stream_include_usage_status()
    if enable_stream_include_usage_status:
        stream_options = getattr(request, "stream_options", None)
        if not stream_options:
            stream_options = StreamOptions(include_usage=True)
            request.stream_options = stream_options

    r = _origin_api_serve_check_request(request)
    return r


def _patch_api_server_add_parser(parser):
    """Patch the API server parser to add the --enable-stream-include-usage argument."""
    parser.add_argument(
        "--enable-stream-include-usage",
        action="store_true",
        help="Enable the inclusion of stream usage data in the output, useful for monitoring performance.",
    )
    return parser


def _patch_sub_parser_add_parser(self, name, **kwargs):
    """Patch the subparser add_parser method to include the --enable-stream-include-usage argument for 'api_server'."""
    _parser = self._origin_sub_parser_add_parser(name, **kwargs)

    if name != "api_server":
        return _parser
    logger.info("patching api_server add_parser")
    _parser = _patch_api_server_add_parser(_parser)

    return _parser


def patch_all():
    """Apply all monkey patches."""
    logger.info("monkey patching all")

    # Patch _SubParsersAction.add_parser
    _SubParsersAction._origin_sub_parser_add_parser = _SubParsersAction.add_parser
    _SubParsersAction.add_parser = _patch_sub_parser_add_parser

    # Patch openai_api_serve
    openai_api_serve.check_request = _patch_api_serve_check_request

    # Patch ArgumentParser.parse_args
    ArgumentParser.parse_args = _patch_parse_args
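To see the effect of the new flag end to end, something along these lines should work once the patched entrypoint is in use (sketch only: whether the lmdeploy console script or python3 -m lmdeploy picks up the patched __main__ depends on where that file lives, and the model path, port, and payload are placeholders):

# Start the OpenAI-compatible server with the patched flag.
python3 -m lmdeploy serve api_server /path/to/model \
  --server-port 23333 --enable-stream-include-usage &

# With the flag on, the patched check_request fills in stream_options with
# include_usage=True whenever the client leaves it unset, so the final streamed
# chunk should carry a "usage" object even though the request below never asks for it.
curl -sN http://127.0.0.1:23333/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "placeholder-model", "stream": true,
       "messages": [{"role": "user", "content": "hi"}]}' | tail -n 5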